vdev_raidz.c revision 297112
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
27 */
28
29#include <sys/zfs_context.h>
30#include <sys/spa.h>
31#include <sys/vdev_impl.h>
32#ifdef illumos
33#include <sys/vdev_disk.h>
34#endif
35#include <sys/vdev_file.h>
36#include <sys/vdev_raidz.h>
37#include <sys/zio.h>
38#include <sys/zio_checksum.h>
39#include <sys/fs/zfs.h>
40#include <sys/fm/fs/zfs.h>
41#include <sys/bio.h>
42
43/*
44 * Virtual device vector for RAID-Z.
45 *
46 * This vdev supports single, double, and triple parity. For single parity,
47 * we use a simple XOR of all the data columns. For double or triple parity,
48 * we use a special case of Reed-Solomon coding. This extends the
49 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
50 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
51 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
52 * former is also based. The latter is designed to provide higher performance
53 * for writes.
54 *
55 * Note that the Plank paper claimed to support arbitrary N+M, but was then
56 * amended six years later identifying a critical flaw that invalidates its
57 * claims. Nevertheless, the technique can be adapted to work for up to
58 * triple parity. For additional parity, the amendment "Note: Correction to
59 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
60 * is viable, but the additional complexity means that write performance will
61 * suffer.
62 *
63 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
65 * can be expressed with a single byte. Briefly, the operations on the
66 * field are defined as follows:
67 *
68 *   o addition (+) is represented by a bitwise XOR
69 *   o subtraction (-) is therefore identical to addition: A + B = A - B
70 *   o multiplication of A by 2 is defined by the following bitwise expression:
71 *
72 *	(A * 2)_7 = A_6
73 *	(A * 2)_6 = A_5
74 *	(A * 2)_5 = A_4
75 *	(A * 2)_4 = A_3 + A_7
76 *	(A * 2)_3 = A_2 + A_7
77 *	(A * 2)_2 = A_1 + A_7
78 *	(A * 2)_1 = A_0
79 *	(A * 2)_0 = A_7
80 *
81 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
82 * As an aside, this multiplication is derived from the error correcting
83 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
84 *
85 * Observe that any number in the field (except for 0) can be expressed as a
86 * power of 2 -- a generator for the field. We store a table of the powers of
87 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
88 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
89 * than field addition). The inverse of a field element A (A^-1) is therefore
90 * A ^ (255 - 1) = A^254.
91 *
92 * The up-to-three parity columns, P, Q, R over several data columns,
93 * D_0, ... D_n-1, can be expressed by field operations:
94 *
95 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
96 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
97 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
98 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
99 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
100 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
102 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
103 * independent coefficients. (There are no additional coefficients that have
104 * this property which is why the uncorrected Plank method breaks down.)
105 *
 * See the reconstruction code below for how P, Q and R can be used individually
107 * or in concert to recover missing data columns.
108 */
109
/*
 * Per-column state for a single RAID-Z I/O: one entry per child vdev that
 * the logical I/O touches (parity columns first, then data columns).
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
120
/*
 * Describes how one logical I/O is spread across the child vdevs:
 * rm_firstdatacol leading parity columns followed by the data columns,
 * plus accounting for skipped (padding) sectors and checksum-report
 * reference counting.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
137
/* Indices of the parity columns at the front of rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply one GF(2^8) element by 2 (resp. 4): shift left, and when the
 * high bit falls off, reduce by the primitive polynomial (XOR with 0x1d).
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
144
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * The first step builds 'mask' as 0xff in every byte whose top bit was set
 * and 0x00 elsewhere; the second shifts every byte left by one (dropping
 * the top bits via the 0xfe mask) and XORs in 0x1d exactly where a top bit
 * overflowed.
 *
 * The macros are wrapped in do/while (0) so each expands to a single
 * statement and is safe in unbraced if/else bodies; the 0x1d mask carries
 * a ULL suffix so the expression is 64-bit on all targets.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
164
/*
 * Translate an offset within the allocatable region to a device offset by
 * adding the size of the leading vdev labels. The parameter is
 * parenthesized so the macro expands safely inside larger expressions
 * (the original '(x + VDEV_LABEL_START_SIZE)' mis-binds for arguments
 * like 'a ? b : c').
 */
#define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)
166
/*
 * Force reconstruction to use the general purpose method.
 * Tunable flag; zero by default (static storage). Presumably consulted by
 * the reconstruction dispatch code later in this file — not visible in
 * this chunk, so confirm against the full source.
 */
int vdev_raidz_default_to_general;
171
/*
 * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] == 2^i.
 * The generator has multiplicative order 255, so entry 255 wraps back to
 * 0x01 (same as entry 0).
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs base 2 in the Galois field defined above:
 * vdev_raidz_log2[vdev_raidz_pow2[i]] == i. Entry 0 is a placeholder --
 * log(0) is undefined -- and must never be consulted; see the ASSERT in
 * vdev_raidz_exp2().
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
242
243static void vdev_raidz_generate_parity(raidz_map_t *rm);
244
245/*
246 * Multiply a given number by 2 raised to the given power.
247 */
248static uint8_t
249vdev_raidz_exp2(uint_t a, int exp)
250{
251	if (a == 0)
252		return (0);
253
254	ASSERT(exp >= 0);
255	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
256
257	exp += vdev_raidz_log2[a];
258	if (exp > 255)
259		exp -= 255;
260
261	return (vdev_raidz_pow2[exp]);
262}
263
264static void
265vdev_raidz_map_free(raidz_map_t *rm)
266{
267	int c;
268	size_t size;
269
270	for (c = 0; c < rm->rm_firstdatacol; c++) {
271		if (rm->rm_col[c].rc_data != NULL)
272			zio_buf_free(rm->rm_col[c].rc_data,
273			    rm->rm_col[c].rc_size);
274
275		if (rm->rm_col[c].rc_gdata != NULL)
276			zio_buf_free(rm->rm_col[c].rc_gdata,
277			    rm->rm_col[c].rc_size);
278	}
279
280	size = 0;
281	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
282		size += rm->rm_col[c].rc_size;
283
284	if (rm->rm_datacopy != NULL)
285		zio_buf_free(rm->rm_datacopy, size);
286
287	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
288}
289
/*
 * zio vsd free callback: the owning zio is done with the map. The actual
 * free is deferred while outstanding checksum reports still reference it
 * (vdev_raidz_cksum_free() performs the free in that case).
 */
static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	/* The zio releases its reference exactly once. */
	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}
301
/*
 * Checksum-report free callback: drop one report reference on the map.
 * If this was the last reference and the owning zio has already released
 * the map (rm_freed), tear it down now.
 */
/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}
313
/*
 * Complete a deferred checksum ereport for column 'c' (zcr_cbinfo) of the
 * map: compare what we actually read ("bad") against the known-good copy.
 * For a parity column the good version must be regenerated from good_data
 * (cached in rc_gdata for subsequent reports); for a data column good_data
 * can simply be indexed to the column's span.
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	/* No good data available: finish the report without a comparison. */
	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/* Repoint data columns at the saved copy of the reads. */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}
386
/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;	/* index of suspect column */
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount  */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	/* Only the first report for this map needs to snapshot the data. */
	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	/* 'c' is reused below as a loop index over the data columns. */
	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	/* Copy each data column aside and repoint rc_data at the copy. */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}
439
/*
 * Callbacks installed on a zio's private data (vsd): free the map when the
 * zio is done, and snapshot the read data when a checksum ereport starts.
 */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
444
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * When 'dofree' is set, only the column geometry is computed and no data
 * buffers are attached to the columns (presumably the free path needs just
 * the layout -- confirm against callers outside this chunk).
 */
static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/* Assign each column its child device, offset, and size. */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* Wrapped around: next row on the child vdevs. */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	if (!dofree) {
		/* Parity columns get fresh buffers ... */
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rm->rm_col[c].rc_data =
			    zio_buf_alloc(rm->rm_col[c].rc_size);
		}

		/* ... data columns carve up the caller's buffer in order. */
		rm->rm_col[c].rc_data = data;

		for (c = c + 1; c < acols; c++) {
			rm->rm_col[c].rc_data =
			    (char *)rm->rm_col[c - 1].rc_data +
			    rm->rm_col[c - 1].rc_size;
		}
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
597
598static void
599vdev_raidz_generate_parity_p(raidz_map_t *rm)
600{
601	uint64_t *p, *src, pcount, ccount, i;
602	int c;
603
604	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
605
606	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
607		src = rm->rm_col[c].rc_data;
608		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
609		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
610
611		if (c == rm->rm_firstdatacol) {
612			ASSERT(ccount == pcount);
613			for (i = 0; i < ccount; i++, src++, p++) {
614				*p = *src;
615			}
616		} else {
617			ASSERT(ccount <= pcount);
618			for (i = 0; i < ccount; i++, src++, p++) {
619				*p ^= *src;
620			}
621		}
622	}
623}
624
/*
 * Generate P (XOR) and Q (Reed-Solomon with generator 2) parity, 64 bits
 * at a time, using the Horner-style evaluation described at the top of the
 * file: Q = ((...(D_0 * 2 + D_1) * 2 + ...) * 2 + D_n-1).
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First (largest) column initializes P and Q. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			/* ccnt == 0 case: zero-fill the whole parity. */
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
676
/*
 * Generate P (XOR), Q (generator 2), and R (generator 4) parity, 64 bits
 * at a time, using the same Horner-style evaluation as the PQ case with an
 * additional row multiplied by 4 per step.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First (largest) column initializes P, Q, and R. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			/* ccnt == 0 case: zero-fill the whole parity. */
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
737
738/*
739 * Generate RAID parity in the first virtual columns according to the number of
740 * parity columns available.
741 */
742static void
743vdev_raidz_generate_parity(raidz_map_t *rm)
744{
745	switch (rm->rm_firstdatacol) {
746	case 1:
747		vdev_raidz_generate_parity_p(rm);
748		break;
749	case 2:
750		vdev_raidz_generate_parity_pq(rm);
751		break;
752	case 3:
753		vdev_raidz_generate_parity_pqr(rm);
754		break;
755	default:
756		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
757	}
758}
759
760static int
761vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
762{
763	uint64_t *dst, *src, xcount, ccount, count, i;
764	int x = tgts[0];
765	int c;
766
767	ASSERT(ntgts == 1);
768	ASSERT(x >= rm->rm_firstdatacol);
769	ASSERT(x < rm->rm_cols);
770
771	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
772	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
773	ASSERT(xcount > 0);
774
775	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
776	dst = rm->rm_col[x].rc_data;
777	for (i = 0; i < xcount; i++, dst++, src++) {
778		*dst = *src;
779	}
780
781	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
782		src = rm->rm_col[c].rc_data;
783		dst = rm->rm_col[x].rc_data;
784
785		if (c == x)
786			continue;
787
788		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
789		count = MIN(ccount, xcount);
790
791		for (i = 0; i < count; i++, dst++, src++) {
792			*dst ^= *src;
793		}
794	}
795
796	return (1 << VDEV_RAIDZ_P);
797}
798
/*
 * Reconstruct a single missing data column (tgts[0]) using only the Q
 * parity: recompute Q as though the missing column were zero, XOR with the
 * real Q to isolate the missing column's contribution, then divide out its
 * coefficient byte by byte. Returns a bitmask of the parity columns used.
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		/* Treat the missing column as zeros. */
		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			/* Horner step: dst = dst * 2 + src, 64 bits at a time. */
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			/* Short columns contribute zeros: multiply only. */
			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/*
	 * dst now holds Q', the Q computed with D_x = 0. Q ^ Q' leaves
	 * 2^(ndevs - 1 - x) * D_x; multiplying each byte by the inverse
	 * coefficient 2^(255 - (ndevs - 1 - x)) recovers D_x.
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
856
/*
 * Reconstruct two missing data columns x and y (x < y) from the P and Q
 * parity by solving the 2x2 linear system derived below. The real parity
 * buffers are temporarily swapped out so the regular parity generator can
 * compute Pxy/Qxy (parity with D_x = D_y = 0) without trashing them.
 * Returns a bitmask of the parity columns used.
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		/* Column y may be shorter than x; fill only what exists. */
		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
947
948/* BEGIN CSTYLED */
949/*
950 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
952 * the contents of the data and parity disks. This can be expressed with
953 * vectors for the original data (D) and the actual data (d) and parity (p)
954 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
955 *
956 *            __   __                     __     __
957 *            |     |         __     __   |  p_0  |
958 *            |  V  |         |  D_0  |   | p_m-1 |
959 *            |     |    x    |   :   | = |  d_0  |
960 *            |  I  |         | D_n-1 |   |   :   |
961 *            |     |         ~~     ~~   | d_n-1 |
962 *            ~~   ~~                     ~~     ~~
963 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
966 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
967 * computation as well as linear separability.
968 *
969 *      __               __               __     __
970 *      |   1   ..  1 1 1 |               |  p_0  |
971 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
972 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
973 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
974 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
975 *      |   :       : : : |   |   :   |   |  d_2  |
976 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
977 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
978 *      |   0   ..  0 0 1 |               | d_n-1 |
979 *      ~~               ~~               ~~     ~~
980 *
981 * Note that I, V, d, and p are known. To compute D, we must invert the
982 * matrix and use the known data and parity values to reconstruct the unknown
983 * data values. We begin by removing the rows in V|I and d|p that correspond
984 * to failed or missing columns; we then make V|I square (n x n) and d|p
985 * sized n by removing rows corresponding to unused parity from the bottom up
986 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
987 * using Gauss-Jordan elimination. In the example below we use m=3 parity
988 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
989 *           __                               __
990 *           |  1   1   1   1   1   1   1   1  |
991 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
992 *           |  19 205 116  29  64  16  4   1  |      / /
993 *           |  1   0   0   0   0   0   0   0  |     / /
994 *           |  0   1   0   0   0   0   0   0  | <--' /
995 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
996 *           |  0   0   0   1   0   0   0   0  |
997 *           |  0   0   0   0   1   0   0   0  |
998 *           |  0   0   0   0   0   1   0   0  |
999 *           |  0   0   0   0   0   0   1   0  |
1000 *           |  0   0   0   0   0   0   0   1  |
1001 *           ~~                               ~~
1002 *           __                               __
1003 *           |  1   1   1   1   1   1   1   1  |
1004 *           |  19 205 116  29  64  16  4   1  |
1005 *           |  1   0   0   0   0   0   0   0  |
1006 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1007 *           |  0   0   0   0   1   0   0   0  |
1008 *           |  0   0   0   0   0   1   0   0  |
1009 *           |  0   0   0   0   0   0   1   0  |
1010 *           |  0   0   0   0   0   0   0   1  |
1011 *           ~~                               ~~
1012 *
1013 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1014 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1015 * matrix is not singular.
1016 * __                                                                 __
1017 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1018 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1019 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1020 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1021 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1022 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1023 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1024 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1025 * ~~                                                                 ~~
1026 * __                                                                 __
1027 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1028 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1029 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1030 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1031 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1032 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1033 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1034 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1035 * ~~                                                                 ~~
1036 * __                                                                 __
1037 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1038 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1039 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1040 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1041 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1042 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1043 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1044 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1045 * ~~                                                                 ~~
1046 * __                                                                 __
1047 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1048 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1049 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1050 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1051 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1052 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1053 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1054 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1055 * ~~                                                                 ~~
1056 * __                                                                 __
1057 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1058 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1059 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1060 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1061 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1062 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1063 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1064 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1065 * ~~                                                                 ~~
1066 * __                                                                 __
1067 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1068 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1069 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1070 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1071 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1072 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1073 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1074 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1075 * ~~                                                                 ~~
1076 *                   __                               __
1077 *                   |  0   0   1   0   0   0   0   0  |
1078 *                   | 167 100  5   41 159 169 217 208 |
1079 *                   | 166 100  4   40 158 168 216 209 |
1080 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1081 *                   |  0   0   0   0   1   0   0   0  |
1082 *                   |  0   0   0   0   0   1   0   0  |
1083 *                   |  0   0   0   0   0   0   1   0  |
1084 *                   |  0   0   0   0   0   0   0   1  |
1085 *                   ~~                               ~~
1086 *
1087 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1088 * of the missing data.
1089 *
1090 * As is apparent from the example above, the only non-trivial rows in the
1091 * inverse matrix correspond to the data disks that we're trying to
1092 * reconstruct. Indeed, those are the only rows we need as the others would
1093 * only be useful for reconstructing data known or assumed to be valid. For
1094 * that reason, we only build the coefficients in the rows that correspond to
1095 * targeted columns.
1096 */
1097/* END CSTYLED */
1098
/*
 * Initialize the rows of the reconstruction matrix that correspond to the
 * parity columns selected in map[].  Each map[i] is the base-2 log of a
 * parity generator coefficient (the seed values 1, 2, 4 described above,
 * i.e. 2^map[i]), and row i is filled with descending powers of that
 * coefficient:
 *
 *	rows[i][j] = 2^(map[i] * (n - 1 - j))	(in GF(2^8))
 *
 * All exponent arithmetic is performed modulo 255, the multiplicative
 * order of the field.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/* Start at exponent map[i] * n, reduced into [0, 255]. */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			/* Step the exponent down by map[i], mod 255. */
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
1128
/*
 * Compute, via Gauss-Jordan elimination over GF(2^8), the rows of the
 * inverse of the (V|I)' matrix (see the block comment above) that
 * correspond to the missing data columns.
 *
 * On entry, rows[] holds the nmissing parity-derived rows restricted to
 * the n data columns (from vdev_raidz_matrix_init()); used[] lists the
 * surviving columns, parity columns first.  On return, invrows[] holds
 * the corresponding rows of the inverse matrix and rows[] has been
 * reduced to rows of the identity matrix (verified by the final loop).
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 * (In GF(2^8), subtraction is XOR; moving the known entries into
	 * invrows and zeroing them in rows accomplishes the subtraction.)
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 * (The inverse of x is 2^(255 - log2(x)) since 2^255 == 1.)
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1222
/*
 * Multiply the inverse-matrix rows by the surviving columns to regenerate
 * the missing data columns in place.
 *
 * used[] lists the n surviving columns (parity columns first), missing[]
 * the data-column indices being rebuilt, and invrows[] the rows of the
 * inverse matrix corresponding to those missing columns (from
 * vdev_raidz_matrix_invert()).  Multiplication in GF(2^8) is done via
 * the log/exp tables: a * b = 2^(log2(a) + log2(b) mod 255).
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/*
	 * Carve a single allocation into nmissing rows of n log-domain
	 * coefficients so the inner loop avoids repeated table lookups.
	 */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate each surviving column's contribution, one at a time. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			/*
			 * Cache log2(*src); the stale value is harmless when
			 * *src == 0 since val is forced to 0 below.
			 */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* GF multiply: add logs mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first column initializes the result;
				 * subsequent columns XOR-accumulate into it.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1298
/*
 * Reconstruct an arbitrary combination of missing columns using the matrix
 * method described in the block comment above.  tgts[] is the sorted list
 * of ntgts failed column indices (parity and/or data).  Returns a bitmask
 * ('code') of the parity columns that were used for the reconstruction.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	/* n is the number of data columns. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.  We need one surviving parity column per missing
	 * data column.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * One allocation holds the nmissing_rows matrix rows, the matching
	 * inverse rows, and the n-entry used-column list.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The surviving parity columns come first in used[]... */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * ...followed by the surviving data columns.  Note that i carries
	 * over from the loop above, continuing at nmissing_rows.
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1404
/*
 * Reconstruct the columns listed in t[] (sorted, nt entries) plus any
 * columns that already have an error recorded in the map.  Dispatch to
 * the optimized P, Q, or PQ routines when the required parity survived;
 * otherwise fall back to the general matrix method.  Returns a bitmask
 * of the parity columns used (see vdev_raidz_reconstruct_general()).
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/*
	 * Walk the columns in order, merging the caller's targets with any
	 * columns that have recorded errors.  Start by assuming all parity
	 * and data columns are bad and count down as good ones are found.
	 */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/*
	 * tgts[] is in column order, so all bad parity columns precede the
	 * bad data columns; dt points at the data-column targets.
	 */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1483
1484static int
1485vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1486    uint64_t *logical_ashift, uint64_t *physical_ashift)
1487{
1488	vdev_t *cvd;
1489	uint64_t nparity = vd->vdev_nparity;
1490	int c;
1491	int lasterror = 0;
1492	int numerrors = 0;
1493
1494	ASSERT(nparity > 0);
1495
1496	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1497	    vd->vdev_children < nparity + 1) {
1498		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1499		return (SET_ERROR(EINVAL));
1500	}
1501
1502	vdev_open_children(vd);
1503
1504	for (c = 0; c < vd->vdev_children; c++) {
1505		cvd = vd->vdev_child[c];
1506
1507		if (cvd->vdev_open_error != 0) {
1508			lasterror = cvd->vdev_open_error;
1509			numerrors++;
1510			continue;
1511		}
1512
1513		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1514		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1515		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1516		*physical_ashift = MAX(*physical_ashift,
1517		    cvd->vdev_physical_ashift);
1518	}
1519
1520	*asize *= vd->vdev_children;
1521	*max_asize *= vd->vdev_children;
1522
1523	if (numerrors > nparity) {
1524		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1525		return (lasterror);
1526	}
1527
1528	return (0);
1529}
1530
1531static void
1532vdev_raidz_close(vdev_t *vd)
1533{
1534	int c;
1535
1536	for (c = 0; c < vd->vdev_children; c++)
1537		vdev_close(vd->vdev_child[c]);
1538}
1539
1540#ifdef illumos
1541/*
1542 * Handle a read or write I/O to a RAID-Z dump device.
1543 *
1544 * The dump device is in a unique situation compared to other ZFS datasets:
1545 * writing to this device should be as simple and fast as possible.  In
1546 * addition, durability matters much less since the dump will be extracted
1547 * once the machine reboots.  For that reason, this function eschews parity for
1548 * performance and simplicity.  The dump device uses the checksum setting
1549 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1550 * dataset.
1551 *
1552 * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1553 * 128 KB will not fill an entire block; in addition, they may not be properly
1554 * aligned.  In that case, this function uses the preallocated 128 KB block and
1555 * omits reading or writing any "empty" portions of that block, as opposed to
1556 * allocating a fresh appropriately-sized block.
1557 *
1558 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1559 *
1560 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1561 *
1562 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1563 * allocated which spans all five child vdevs.  8 KB of data would be written to
1564 * each of four vdevs, with the fifth containing the parity bits.
1565 *
1566 *       parity    data     data     data     data
1567 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1568 *         ^        ^        ^        ^        ^
1569 *         |        |        |        |        |
1570 *   8 KB parity    ------8 KB data blocks------
1571 *
1572 * However, when writing to the dump device, the behavior is different:
1573 *
1574 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1575 *
1576 * Unlike the normal RAID-Z case in which the block is allocated based on the
1577 * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1578 * I/O size is less than 128 KB, only the actual portions of data are written.
1579 * In this example the data is written to the third data vdev since that vdev
1580 * contains the offset [64 KB, 96 KB).
1581 *
1582 *       parity    data     data     data     data
1583 *     |        |        |        |   XX   |        |
1584 *                                    ^
1585 *                                    |
1586 *                             32 KB data block
1587 *
1588 * As a result, an individual I/O may not span all child vdevs; moreover, a
1589 * small I/O may only operate on a single child vdev.
1590 *
1591 * Note that since there are no parity bits calculated or written, this format
1592 * remains the same no matter how many parity bits are used in a normal RAID-Z
1593 * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1594 * would look like:
1595 *
1596 *       parity   parity   parity    data     data     data     data
1597 *     |        |        |        |        |        |   XX   |        |
1598 *                                                      ^
1599 *                                                      |
1600 *                                               32 KB data block
1601 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? BIO_READ : BIO_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block.  Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	/*
	 * Walk only the data columns (parity is not maintained for the dump
	 * device).  The increment expression reads rc->rc_size, which is
	 * safe because the increment runs only after the body has set rc.
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)rc->rc_data) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
#endif	/* _KERNEL */

	return (err);
}
1686#endif
1687
1688static uint64_t
1689vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1690{
1691	uint64_t asize;
1692	uint64_t ashift = vd->vdev_top->vdev_ashift;
1693	uint64_t cols = vd->vdev_children;
1694	uint64_t nparity = vd->vdev_nparity;
1695
1696	asize = ((psize - 1) >> ashift) + 1;
1697	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1698	asize = roundup(asize, nparity + 1) << ashift;
1699
1700	return (asize);
1701}
1702
1703static void
1704vdev_raidz_child_done(zio_t *zio)
1705{
1706	raidz_col_t *rc = zio->io_private;
1707
1708	rc->rc_error = zio->io_error;
1709	rc->rc_tried = 1;
1710	rc->rc_skipped = 0;
1711}
1712
1713/*
1714 * Start an IO operation on a RAIDZ VDev
1715 *
1716 * Outline:
1717 * - For write operations:
1718 *   1. Generate the parity data
1719 *   2. Create child zio write operations to each column's vdev, for both
1720 *      data and parity.
1721 *   3. If the column skips any sectors for padding, create optional dummy
1722 *      write zio children for those areas to improve aggregation continuity.
1723 * - For read operations:
1724 *   1. Create child zio read operations to each data column's vdev to read
1725 *      the range of data required for zio.
1726 *   2. If this is a scrub or resilver operation, or if any of the data
1727 *      vdevs have had errors, then create zio read operations to the parity
1728 *      columns' VDevs as well.
1729 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/* Map the logical block onto per-child columns. */
	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
	    zio->io_type == ZIO_TYPE_FREE,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	/* The map is freed via the vsd ops when the zio completes. */
	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_FREE) {
		/* Forward the free to every column, parity included. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		zio_execute(zio);
		return;
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/* Compute parity before issuing any child writes. */
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.  The column index wraps around
		 * from rm_scols back to 0.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Read all data columns; read parity too if data is already
		 * known to be missing or this is a scrub/resilver.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}
1836
1837
1838/*
1839 * Report a checksum error for a child of a RAID-Z device.
1840 */
1841static void
1842raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1843{
1844	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1845
1846	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1847		zio_bad_cksum_t zbc;
1848		raidz_map_t *rm = zio->io_vsd;
1849
1850		mutex_enter(&vd->vdev_stat_lock);
1851		vd->vdev_stat.vs_checksum_errors++;
1852		mutex_exit(&vd->vdev_stat_lock);
1853
1854		zbc.zbc_has_cksum = 0;
1855		zbc.zbc_injected = rm->rm_ecksuminjected;
1856
1857		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1858		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1859		    &zbc);
1860	}
1861}
1862
1863/*
1864 * We keep track of whether or not there were any injected errors, so that
1865 * any ereports we generate can note it.
1866 */
1867static int
1868raidz_checksum_verify(zio_t *zio)
1869{
1870	zio_bad_cksum_t zbc;
1871	raidz_map_t *rm = zio->io_vsd;
1872
1873	int ret = zio_checksum_error(zio, &zbc);
1874	if (ret != 0 && zbc.zbc_injected != 0)
1875		rm->rm_ecksuminjected = 1;
1876
1877	return (ret);
1878}
1879
1880/*
1881 * Generate the parity from the data columns. If we tried and were able to
1882 * read the parity without error, verify that the generated parity matches the
1883 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
1885 */
1886static int
1887raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1888{
1889	void *orig[VDEV_RAIDZ_MAXPARITY];
1890	int c, ret = 0;
1891	raidz_col_t *rc;
1892
1893	blkptr_t *bp = zio->io_bp;
1894	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1895	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1896
1897	if (checksum == ZIO_CHECKSUM_NOPARITY)
1898		return (ret);
1899
1900	for (c = 0; c < rm->rm_firstdatacol; c++) {
1901		rc = &rm->rm_col[c];
1902		if (!rc->rc_tried || rc->rc_error != 0)
1903			continue;
1904		orig[c] = zio_buf_alloc(rc->rc_size);
1905		bcopy(rc->rc_data, orig[c], rc->rc_size);
1906	}
1907
1908	vdev_raidz_generate_parity(rm);
1909
1910	for (c = 0; c < rm->rm_firstdatacol; c++) {
1911		rc = &rm->rm_col[c];
1912		if (!rc->rc_tried || rc->rc_error != 0)
1913			continue;
1914		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1915			raidz_checksum_error(zio, rc, orig[c]);
1916			rc->rc_error = SET_ERROR(ECKSUM);
1917			ret++;
1918		}
1919		zio_buf_free(orig[c], rc->rc_size);
1920	}
1921
1922	return (ret);
1923}
1924
/*
 * Keep statistics on all the ways that we used parity to correct data.
 * Indexed by the code returned from vdev_raidz_reconstruct(); the array is
 * sized 2^nparity, so the code is presumably a bitmask describing which
 * parity columns were used -- NOTE(review): confirm against
 * vdev_raidz_reconstruct().  Updated with atomic_inc_64(), so no lock is
 * needed.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1929
1930static int
1931vdev_raidz_worst_error(raidz_map_t *rm)
1932{
1933	int error = 0;
1934
1935	for (int c = 0; c < rm->rm_cols; c++)
1936		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1937
1938	return (error);
1939}
1940
/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * Returns the (nonzero) reconstruction code on success, 0 if no combination
 * of presumed-bad columns yields data with a valid checksum.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/*
	 * tgts points one element into tstore so that both tgts[-1] and
	 * tgts[n] are valid storage, used as sentinels below.
	 */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/*
	 * Try every combination of n presumed-bad columns, for n from 1 up
	 * to the parity remaining after the errors we already know about.
	 */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/*
		 * NOTE(review): all columns are assumed to be no larger than
		 * column 0 (the first parity column), so one allocation size
		 * suffices for any target -- confirm against raidz map setup.
		 */
		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/*
		 * Odometer-style enumeration: advance tgts[] through every
		 * strictly increasing n-tuple of error-free columns.  The
		 * loop exits when the lowest digit can no longer advance
		 * (current == n).
		 */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * Mark every reconstructed column ECKSUM; a
				 * column we actually read also gets a
				 * checksum-error ereport since the device
				 * silently returned bad data.
				 */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/*
	 * The for loop above bumped n one past the last combination size we
	 * actually tried; step back so the cleanup below frees exactly the
	 * buffers that were allocated.
	 */
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
2096
/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;	/* errors we didn't already expect */
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Tally child errors, split by parity vs. data column, and count
	 * parity columns that were never read at all.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	} else if (zio->io_type == ZIO_TYPE_FREE) {
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	/*
	 * Phase 2: on finding the first untried column, mark the zio as
	 * redone and issue child reads for every remaining untried column,
	 * then return; this routine will run again when they complete.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
2383
2384static void
2385vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2386{
2387	if (faulted > vd->vdev_nparity)
2388		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2389		    VDEV_AUX_NO_REPLICAS);
2390	else if (degraded + faulted != 0)
2391		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2392	else
2393		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2394}
2395
/*
 * Operation vector registered for RAID-Z vdevs.  Uses positional
 * initialization, so entries must match the field order of vdev_ops_t.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,			/* presumably the hold op -- confirm vs. vdev_ops_t */
	NULL,			/* presumably the release op -- confirm vs. vdev_ops_t */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
2408