vdev_raidz.c revision 297078
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/spa.h>
30#include <sys/vdev_impl.h>
31#ifdef illumos
32#include <sys/vdev_disk.h>
33#endif
34#include <sys/vdev_file.h>
35#include <sys/vdev_raidz.h>
36#include <sys/zio.h>
37#include <sys/zio_checksum.h>
38#include <sys/fs/zfs.h>
39#include <sys/fm/fs/zfs.h>
40#include <sys/bio.h>
41
42/*
43 * Virtual device vector for RAID-Z.
44 *
45 * This vdev supports single, double, and triple parity. For single parity,
46 * we use a simple XOR of all the data columns. For double or triple parity,
47 * we use a special case of Reed-Solomon coding. This extends the
48 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
49 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
50 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
51 * former is also based. The latter is designed to provide higher performance
52 * for writes.
53 *
54 * Note that the Plank paper claimed to support arbitrary N+M, but was then
55 * amended six years later identifying a critical flaw that invalidates its
56 * claims. Nevertheless, the technique can be adapted to work for up to
57 * triple parity. For additional parity, the amendment "Note: Correction to
58 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
59 * is viable, but the additional complexity means that write performance will
60 * suffer.
61 *
62 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
64 * can be expressed with a single byte. Briefly, the operations on the
65 * field are defined as follows:
66 *
67 *   o addition (+) is represented by a bitwise XOR
68 *   o subtraction (-) is therefore identical to addition: A + B = A - B
69 *   o multiplication of A by 2 is defined by the following bitwise expression:
70 *
71 *	(A * 2)_7 = A_6
72 *	(A * 2)_6 = A_5
73 *	(A * 2)_5 = A_4
74 *	(A * 2)_4 = A_3 + A_7
75 *	(A * 2)_3 = A_2 + A_7
76 *	(A * 2)_2 = A_1 + A_7
77 *	(A * 2)_1 = A_0
78 *	(A * 2)_0 = A_7
79 *
80 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
81 * As an aside, this multiplication is derived from the error correcting
82 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
83 *
84 * Observe that any number in the field (except for 0) can be expressed as a
85 * power of 2 -- a generator for the field. We store a table of the powers of
86 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
87 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
88 * than field addition). The inverse of a field element A (A^-1) is therefore
89 * A ^ (255 - 1) = A^254.
90 *
91 * The up-to-three parity columns, P, Q, R over several data columns,
92 * D_0, ... D_n-1, can be expressed by field operations:
93 *
94 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
95 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
96 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
97 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
98 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
99 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
101 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
102 * independent coefficients. (There are no additional coefficients that have
103 * this property which is why the uncorrected Plank method breaks down.)
104 *
 * See the reconstruction code below for how P, Q and R can be used individually
106 * or in concert to recover missing data columns.
107 */
108
/*
 * Per-column state for one RAID-Z I/O: maps a logical column onto a child
 * vdev (device index + offset on that child) and tracks the column's I/O
 * buffer and outcome.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
119
/*
 * Geometry of one logical I/O spread across the RAID-Z children.  The first
 * rm_firstdatacol entries of rm_col[] are parity columns; the remainder are
 * data columns.  The structure is allocated with rm_scols trailing
 * raidz_col_t entries via offsetof(raidz_map_t, rm_col[rm_scols]).
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
136
/* Indices of the parity columns at the front of rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/* Multiply one byte by 2 (and by 4) in GF(2^8); see the field notes above. */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
143
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * Each macro is wrapped in do/while (0) so that it expands to a single
 * statement and composes safely with unbraced if/else at the call site.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
163
/*
 * Translate an offset within the vdev's allocatable space to an offset from
 * the start of the device (past the front label area).  The argument is
 * parenthesized so expressions with lower-precedence operators (e.g. ?:)
 * expand correctly.
 */
#define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)
165
/*
 * Force reconstruction to use the general purpose method.
 * NOTE(review): presumably a debug/tuning knob consulted by the
 * reconstruction path (not visible in this chunk) -- confirm its readers.
 */
int vdev_raidz_default_to_general;
170
/*
 * Powers of 2 in the Galois field defined above.
 * vdev_raidz_pow2[i] == 2^i in GF(2^8); the multiplicative group has order
 * 255, so the final entry (index 255) wraps back to 0x01.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs of 2 in the Galois field defined above.
 * Inverse of vdev_raidz_pow2 for nonzero inputs; entry 0 is a placeholder
 * (log of zero is undefined) and callers special-case zero -- see
 * vdev_raidz_exp2().
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
241
242static void vdev_raidz_generate_parity(raidz_map_t *rm);
243
244/*
245 * Multiply a given number by 2 raised to the given power.
246 */
247static uint8_t
248vdev_raidz_exp2(uint_t a, int exp)
249{
250	if (a == 0)
251		return (0);
252
253	ASSERT(exp >= 0);
254	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
255
256	exp += vdev_raidz_log2[a];
257	if (exp > 255)
258		exp -= 255;
259
260	return (vdev_raidz_pow2[exp]);
261}
262
/*
 * Free a raidz_map_t and everything it owns: the parity column buffers and
 * any saved "good" parity copies, the contiguous copy of the data columns
 * (rm_datacopy, if one was made for checksum reporting), and the map itself.
 * Data column rc_data pointers are not freed individually; they either alias
 * the caller's buffer or point into rm_datacopy.
 */
static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		if (rm->rm_col[c].rc_data != NULL)
			zio_buf_free(rm->rm_col[c].rc_data,
			    rm->rm_col[c].rc_size);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	/* rm_datacopy is a single buffer covering all data columns. */
	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	if (rm->rm_datacopy != NULL)
		zio_buf_free(rm->rm_datacopy, size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}
288
/*
 * ZIO vdev-specific-data free callback.  Outstanding checksum reports may
 * still reference the map, so only mark it freed here; in that case the
 * last vdev_raidz_cksum_free() performs the actual free.
 */
static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}
300
/*ARGSUSED*/
/*
 * Checksum-report destructor: drop one reference to the map, and free it
 * when this was the last report and the owning ZIO has already finished
 * (rm_freed was set by vdev_raidz_map_free_vsd()).
 */
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}
312
/*
 * Checksum-report finish callback.  Given the known-good data for the whole
 * logical block (good_data), locate the "good" bytes corresponding to
 * column c -- regenerating parity from good_data when c is a parity
 * column -- and pass the good/bad pair to zfs_ereport_finish_checksum().
 * The bad data for data columns lives in rm_datacopy (saved by
 * vdev_raidz_cksum_report()).
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/* data columns point back into the saved copy */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}
385
/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 *
 * 'arg' carries the column index, stashed as a pointer-sized integer.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount  */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	/* Save each data column and point rc_data at the stable copy. */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}
438
/* ZIO vdev-specific-data callbacks (free, checksum report) for RAID-Z. */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
443
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * data/size/offset describe the logical I/O; unit_shift is the child
 * vdev's minimum-sector shift; nparity is the number of parity columns.
 * When dofree is set, no data buffers are attached to the columns (the
 * map only describes the layout).
 */
static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/* Lay out each column: device index, offset on that device, size. */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Attach parity buffers and carve the caller's buffer into columns. */
	if (!dofree) {
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rm->rm_col[c].rc_data =
			    zio_buf_alloc(rm->rm_col[c].rc_size);
		}

		rm->rm_col[c].rc_data = data;

		for (c = c + 1; c < acols; c++) {
			rm->rm_col[c].rc_data =
			    (char *)rm->rm_col[c - 1].rc_data +
			    rm->rm_col[c - 1].rc_size;
		}
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
596
597static void
598vdev_raidz_generate_parity_p(raidz_map_t *rm)
599{
600	uint64_t *p, *src, pcount, ccount, i;
601	int c;
602
603	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
604
605	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
606		src = rm->rm_col[c].rc_data;
607		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
608		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
609
610		if (c == rm->rm_firstdatacol) {
611			ASSERT(ccount == pcount);
612			for (i = 0; i < ccount; i++, src++, p++) {
613				*p = *src;
614			}
615		} else {
616			ASSERT(ccount <= pcount);
617			for (i = 0; i < ccount; i++, src++, p++) {
618				*p ^= *src;
619			}
620		}
621	}
622}
623
/*
 * Generate the P (XOR) and Q (GF(2^8), generator 2) parity columns over the
 * data columns, eight bytes at a time via VDEV_RAIDZ_64MUL_2.  Data columns
 * shorter than the parity column are treated as zero-filled.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds both parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
675
/*
 * Generate the P (XOR), Q (generator 2), and R (generator 4) parity columns
 * over the data columns, eight bytes at a time via the 64MUL macros.  Data
 * columns shorter than the parity column are treated as zero-filled.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds all three parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
736
737/*
738 * Generate RAID parity in the first virtual columns according to the number of
739 * parity columns available.
740 */
741static void
742vdev_raidz_generate_parity(raidz_map_t *rm)
743{
744	switch (rm->rm_firstdatacol) {
745	case 1:
746		vdev_raidz_generate_parity_p(rm);
747		break;
748	case 2:
749		vdev_raidz_generate_parity_pq(rm);
750		break;
751	case 3:
752		vdev_raidz_generate_parity_pqr(rm);
753		break;
754	default:
755		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
756	}
757}
758
759static int
760vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
761{
762	uint64_t *dst, *src, xcount, ccount, count, i;
763	int x = tgts[0];
764	int c;
765
766	ASSERT(ntgts == 1);
767	ASSERT(x >= rm->rm_firstdatacol);
768	ASSERT(x < rm->rm_cols);
769
770	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
771	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
772	ASSERT(xcount > 0);
773
774	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
775	dst = rm->rm_col[x].rc_data;
776	for (i = 0; i < xcount; i++, dst++, src++) {
777		*dst = *src;
778	}
779
780	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
781		src = rm->rm_col[c].rc_data;
782		dst = rm->rm_col[x].rc_data;
783
784		if (c == x)
785			continue;
786
787		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
788		count = MIN(ccount, xcount);
789
790		for (i = 0; i < count; i++, dst++, src++) {
791			*dst ^= *src;
792		}
793	}
794
795	return (1 << VDEV_RAIDZ_P);
796}
797
/*
 * Reconstruct a single data column (tgts[0]) using only the Q parity:
 * recompute Q as though the missing column were zero, cancel it against
 * the real Q, then divide out the missing column's coefficient by
 * multiplying each byte by its inverse, 2^(255 - (ndevs - 1 - x)).
 * Returns the bitmask of parity columns used (Q only).
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	/* Accumulate Q over all columns, treating column x as zero. */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/* Cancel against the real Q and divide out column x's coefficient. */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
855
/*
 * Reconstruct two data columns (tgts[0] < tgts[1]) using the P and Q parity
 * columns: recompute parity with the two columns zeroed (Pxy, Qxy), then
 * solve the resulting 2x2 linear system in GF(2^8) for D_x and D_y (see the
 * algebra inline below).  Returns the bitmask of parity columns used.
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
946
947/* BEGIN CSTYLED */
948/*
949 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
951 * the contents of the data and parity disks. This can be expressed with
952 * vectors for the original data (D) and the actual data (d) and parity (p)
953 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
954 *
955 *            __   __                     __     __
956 *            |     |         __     __   |  p_0  |
957 *            |  V  |         |  D_0  |   | p_m-1 |
958 *            |     |    x    |   :   | = |  d_0  |
959 *            |  I  |         | D_n-1 |   |   :   |
960 *            |     |         ~~     ~~   | d_n-1 |
961 *            ~~   ~~                     ~~     ~~
962 *
963 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
965 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
966 * computation as well as linear separability.
967 *
968 *      __               __               __     __
969 *      |   1   ..  1 1 1 |               |  p_0  |
970 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
971 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
972 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
973 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
974 *      |   :       : : : |   |   :   |   |  d_2  |
975 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
976 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
977 *      |   0   ..  0 0 1 |               | d_n-1 |
978 *      ~~               ~~               ~~     ~~
979 *
980 * Note that I, V, d, and p are known. To compute D, we must invert the
981 * matrix and use the known data and parity values to reconstruct the unknown
982 * data values. We begin by removing the rows in V|I and d|p that correspond
983 * to failed or missing columns; we then make V|I square (n x n) and d|p
984 * sized n by removing rows corresponding to unused parity from the bottom up
985 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
986 * using Gauss-Jordan elimination. In the example below we use m=3 parity
987 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
988 *           __                               __
989 *           |  1   1   1   1   1   1   1   1  |
990 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
991 *           |  19 205 116  29  64  16  4   1  |      / /
992 *           |  1   0   0   0   0   0   0   0  |     / /
993 *           |  0   1   0   0   0   0   0   0  | <--' /
994 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
995 *           |  0   0   0   1   0   0   0   0  |
996 *           |  0   0   0   0   1   0   0   0  |
997 *           |  0   0   0   0   0   1   0   0  |
998 *           |  0   0   0   0   0   0   1   0  |
999 *           |  0   0   0   0   0   0   0   1  |
1000 *           ~~                               ~~
1001 *           __                               __
1002 *           |  1   1   1   1   1   1   1   1  |
1003 *           |  19 205 116  29  64  16  4   1  |
1004 *           |  1   0   0   0   0   0   0   0  |
1005 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1006 *           |  0   0   0   0   1   0   0   0  |
1007 *           |  0   0   0   0   0   1   0   0  |
1008 *           |  0   0   0   0   0   0   1   0  |
1009 *           |  0   0   0   0   0   0   0   1  |
1010 *           ~~                               ~~
1011 *
1012 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1013 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1014 * matrix is not singular.
1015 * __                                                                 __
1016 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1017 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1018 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1019 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1020 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1021 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1022 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1023 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1024 * ~~                                                                 ~~
1025 * __                                                                 __
1026 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1027 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1028 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1029 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1030 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1031 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1032 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1033 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1034 * ~~                                                                 ~~
1035 * __                                                                 __
1036 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1037 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1038 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1039 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1040 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1041 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1042 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1043 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1044 * ~~                                                                 ~~
1045 * __                                                                 __
1046 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1047 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1048 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1049 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1050 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1051 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1052 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1053 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1054 * ~~                                                                 ~~
1055 * __                                                                 __
1056 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1057 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1058 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1059 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1060 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1061 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1062 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1063 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1064 * ~~                                                                 ~~
1065 * __                                                                 __
1066 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1067 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1068 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1069 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1070 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1071 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1072 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1073 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1074 * ~~                                                                 ~~
1075 *                   __                               __
1076 *                   |  0   0   1   0   0   0   0   0  |
1077 *                   | 167 100  5   41 159 169 217 208 |
1078 *                   | 166 100  4   40 158 168 216 209 |
1079 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1080 *                   |  0   0   0   0   1   0   0   0  |
1081 *                   |  0   0   0   0   0   1   0   0  |
1082 *                   |  0   0   0   0   0   0   1   0  |
1083 *                   |  0   0   0   0   0   0   0   1  |
1084 *                   ~~                               ~~
1085 *
1086 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1087 * of the missing data.
1088 *
1089 * As is apparent from the example above, the only non-trivial rows in the
1090 * inverse matrix correspond to the data disks that we're trying to
1091 * reconstruct. Indeed, those are the only rows we need as the others would
1092 * only be useful for reconstructing data known or assumed to be valid. For
1093 * that reason, we only build the coefficients in the rows that correspond to
1094 * targeted columns.
1095 */
1096/* END CSTYLED */
1097
1098static void
1099vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1100    uint8_t **rows)
1101{
1102	int i, j;
1103	int pow;
1104
1105	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1106
1107	/*
1108	 * Fill in the missing rows of interest.
1109	 */
1110	for (i = 0; i < nmap; i++) {
1111		ASSERT3S(0, <=, map[i]);
1112		ASSERT3S(map[i], <=, 2);
1113
1114		pow = map[i] * n;
1115		if (pow > 255)
1116			pow -= 255;
1117		ASSERT(pow <= 255);
1118
1119		for (j = 0; j < n; j++) {
1120			pow -= map[i];
1121			if (pow < 0)
1122				pow += 255;
1123			rows[i][j] = vdev_raidz_pow2[pow];
1124		}
1125	}
1126}
1127
/*
 * Invert the submatrix built by vdev_raidz_matrix_init() using Gauss-Jordan
 * elimination, leaving in invrows the rows of the inverse that correspond to
 * the missing data columns.  On return, rows has been reduced to (part of)
 * an identity matrix.  All arithmetic is in GF(2^8): addition is XOR and
 * multiplication goes through the log/exp tables.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.  The nonzero elements of
		 * GF(2^8) form a multiplicative group of order 255, so
		 * 255 - log2(x) is the log of x's inverse.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			/*
			 * Subtract (XOR, in GF(2^8)) the appropriate
			 * multiple of the pivot row to zero this column.
			 */
			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1221
1222static void
1223vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1224    int *missing, uint8_t **invrows, const uint8_t *used)
1225{
1226	int i, j, x, cc, c;
1227	uint8_t *src;
1228	uint64_t ccount;
1229	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1230	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1231	uint8_t log = 0;
1232	uint8_t val;
1233	int ll;
1234	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1235	uint8_t *p, *pp;
1236	size_t psize;
1237
1238	psize = sizeof (invlog[0][0]) * n * nmissing;
1239	p = kmem_alloc(psize, KM_SLEEP);
1240
1241	for (pp = p, i = 0; i < nmissing; i++) {
1242		invlog[i] = pp;
1243		pp += n;
1244	}
1245
1246	for (i = 0; i < nmissing; i++) {
1247		for (j = 0; j < n; j++) {
1248			ASSERT3U(invrows[i][j], !=, 0);
1249			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1250		}
1251	}
1252
1253	for (i = 0; i < n; i++) {
1254		c = used[i];
1255		ASSERT3U(c, <, rm->rm_cols);
1256
1257		src = rm->rm_col[c].rc_data;
1258		ccount = rm->rm_col[c].rc_size;
1259		for (j = 0; j < nmissing; j++) {
1260			cc = missing[j] + rm->rm_firstdatacol;
1261			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1262			ASSERT3U(cc, <, rm->rm_cols);
1263			ASSERT3U(cc, !=, c);
1264
1265			dst[j] = rm->rm_col[cc].rc_data;
1266			dcount[j] = rm->rm_col[cc].rc_size;
1267		}
1268
1269		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1270
1271		for (x = 0; x < ccount; x++, src++) {
1272			if (*src != 0)
1273				log = vdev_raidz_log2[*src];
1274
1275			for (cc = 0; cc < nmissing; cc++) {
1276				if (x >= dcount[cc])
1277					continue;
1278
1279				if (*src == 0) {
1280					val = 0;
1281				} else {
1282					if ((ll = log + invlog[cc][i]) >= 255)
1283						ll -= 255;
1284					val = vdev_raidz_pow2[ll];
1285				}
1286
1287				if (i == 0)
1288					dst[cc][x] = val;
1289				else
1290					dst[cc][x] ^= val;
1291			}
1292		}
1293	}
1294
1295	kmem_free(p, psize);
1296}
1297
/*
 * Reconstruct the ntgts target columns listed in tgts (sorted, parity
 * columns first) using the matrix-inversion technique described in the
 * block comment above.  Returns a bitmask of the parity columns used in
 * the reconstruction (1 << VDEV_RAIDZ_P, etc.).
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Allocate one contiguous chunk holding the rows and inverse rows
	 * (nmissing_rows of each, n bytes wide) plus the used-column array.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The surviving parity columns come first in used[]... */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* ... followed by the surviving data columns (i continues on). */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1403
/*
 * Reconstruct the columns listed in t (nt entries, sorted ascending) along
 * with any other columns already recorded as being in error.  The fast P/Q
 * special-case routines are used where the requisite parity was read
 * intact (unless vdev_raidz_default_to_general is set); otherwise we fall
 * back to the general matrix-inversion method.  Returns a bitmask of the
 * parity columns used to effect the reconstruction.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/*
	 * Walk every column, classifying it as a reconstruction target
	 * (explicitly listed in t or marked with an error) or as valid
	 * data/parity, counting the bad columns of each flavor.
	 */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/* Data targets follow the parity targets in the sorted tgts array. */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1482
1483static int
1484vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1485    uint64_t *logical_ashift, uint64_t *physical_ashift)
1486{
1487	vdev_t *cvd;
1488	uint64_t nparity = vd->vdev_nparity;
1489	int c;
1490	int lasterror = 0;
1491	int numerrors = 0;
1492
1493	ASSERT(nparity > 0);
1494
1495	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1496	    vd->vdev_children < nparity + 1) {
1497		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1498		return (SET_ERROR(EINVAL));
1499	}
1500
1501	vdev_open_children(vd);
1502
1503	for (c = 0; c < vd->vdev_children; c++) {
1504		cvd = vd->vdev_child[c];
1505
1506		if (cvd->vdev_open_error != 0) {
1507			lasterror = cvd->vdev_open_error;
1508			numerrors++;
1509			continue;
1510		}
1511
1512		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1513		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1514		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1515		*physical_ashift = MAX(*physical_ashift,
1516		    cvd->vdev_physical_ashift);
1517	}
1518
1519	*asize *= vd->vdev_children;
1520	*max_asize *= vd->vdev_children;
1521
1522	if (numerrors > nparity) {
1523		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1524		return (lasterror);
1525	}
1526
1527	return (0);
1528}
1529
1530static void
1531vdev_raidz_close(vdev_t *vd)
1532{
1533	int c;
1534
1535	for (c = 0; c < vd->vdev_children; c++)
1536		vdev_close(vd->vdev_child[c]);
1537}
1538
1539#ifdef illumos
1540/*
1541 * Handle a read or write I/O to a RAID-Z dump device.
1542 *
1543 * The dump device is in a unique situation compared to other ZFS datasets:
1544 * writing to this device should be as simple and fast as possible.  In
1545 * addition, durability matters much less since the dump will be extracted
1546 * once the machine reboots.  For that reason, this function eschews parity for
1547 * performance and simplicity.  The dump device uses the checksum setting
1548 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1549 * dataset.
1550 *
1551 * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1552 * 128 KB will not fill an entire block; in addition, they may not be properly
1553 * aligned.  In that case, this function uses the preallocated 128 KB block and
1554 * omits reading or writing any "empty" portions of that block, as opposed to
1555 * allocating a fresh appropriately-sized block.
1556 *
1557 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1558 *
1559 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1560 *
1561 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1562 * allocated which spans all five child vdevs.  8 KB of data would be written to
1563 * each of four vdevs, with the fifth containing the parity bits.
1564 *
1565 *       parity    data     data     data     data
1566 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1567 *         ^        ^        ^        ^        ^
1568 *         |        |        |        |        |
1569 *   8 KB parity    ------8 KB data blocks------
1570 *
1571 * However, when writing to the dump device, the behavior is different:
1572 *
1573 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1574 *
1575 * Unlike the normal RAID-Z case in which the block is allocated based on the
1576 * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1577 * I/O size is less than 128 KB, only the actual portions of data are written.
1578 * In this example the data is written to the third data vdev since that vdev
1579 * contains the offset [64 KB, 96 KB).
1580 *
1581 *       parity    data     data     data     data
1582 *     |        |        |        |   XX   |        |
1583 *                                    ^
1584 *                                    |
1585 *                             32 KB data block
1586 *
1587 * As a result, an individual I/O may not span all child vdevs; moreover, a
1588 * small I/O may only operate on a single child vdev.
1589 *
1590 * Note that since there are no parity bits calculated or written, this format
1591 * remains the same no matter how many parity bits are used in a normal RAID-Z
1592 * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1593 * would look like:
1594 *
1595 *       parity   parity   parity    data     data     data     data
1596 *     |        |        |        |        |        |   XX   |        |
1597 *                                                      ^
1598 *                                                      |
1599 *                                               32 KB data block
1600 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? BIO_READ : BIO_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block.  Note that this block starts
	 * from the "original" offset, this is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	/*
	 * Walk only the data columns (no parity is read or written here);
	 * coloffset tracks each column's logical offset within the block.
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		/* Clip the I/O to the portion overlapping this column. */
		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)rc->rc_data) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
#endif	/* _KERNEL */

	return (err);
}
1685#endif
1686
1687static uint64_t
1688vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1689{
1690	uint64_t asize;
1691	uint64_t ashift = vd->vdev_top->vdev_ashift;
1692	uint64_t cols = vd->vdev_children;
1693	uint64_t nparity = vd->vdev_nparity;
1694
1695	asize = ((psize - 1) >> ashift) + 1;
1696	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1697	asize = roundup(asize, nparity + 1) << ashift;
1698
1699	return (asize);
1700}
1701
1702static void
1703vdev_raidz_child_done(zio_t *zio)
1704{
1705	raidz_col_t *rc = zio->io_private;
1706
1707	rc->rc_error = zio->io_error;
1708	rc->rc_tried = 1;
1709	rc->rc_skipped = 0;
1710}
1711
1712/*
1713 * Start an IO operation on a RAIDZ VDev
1714 *
1715 * Outline:
1716 * - For write operations:
1717 *   1. Generate the parity data
1718 *   2. Create child zio write operations to each column's vdev, for both
1719 *      data and parity.
1720 *   3. If the column skips any sectors for padding, create optional dummy
1721 *      write zio children for those areas to improve aggregation continuity.
1722 * - For read operations:
1723 *   1. Create child zio read operations to each data column's vdev to read
1724 *      the range of data required for zio.
1725 *   2. If this is a scrub or resilver operation, or if any of the data
1726 *      vdevs have had errors, then create zio read operations to the parity
1727 *      columns' VDevs as well.
1728 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/* Map the logical block onto the child vdevs' columns. */
	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
	    zio->io_type == ZIO_TYPE_FREE,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_FREE) {
		/* Pass the free through to every column's child vdev. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		zio_execute(zio);
		return;
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		/* Issue writes for both data and the computed parity. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			/* Child unreadable: record it, skip the I/O. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			/* Child is missing this txg: treat data as stale. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Read all data columns; read parity only if data is
		 * already known missing or this is a scrub/resilver.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}
1835
1836
1837/*
1838 * Report a checksum error for a child of a RAID-Z device.
1839 */
1840static void
1841raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1842{
1843	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1844
1845	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1846		zio_bad_cksum_t zbc;
1847		raidz_map_t *rm = zio->io_vsd;
1848
1849		mutex_enter(&vd->vdev_stat_lock);
1850		vd->vdev_stat.vs_checksum_errors++;
1851		mutex_exit(&vd->vdev_stat_lock);
1852
1853		zbc.zbc_has_cksum = 0;
1854		zbc.zbc_injected = rm->rm_ecksuminjected;
1855
1856		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1857		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1858		    &zbc);
1859	}
1860}
1861
1862/*
1863 * We keep track of whether or not there were any injected errors, so that
1864 * any ereports we generate can note it.
1865 */
1866static int
1867raidz_checksum_verify(zio_t *zio)
1868{
1869	zio_bad_cksum_t zbc;
1870	raidz_map_t *rm = zio->io_vsd;
1871
1872	int ret = zio_checksum_error(zio, &zbc);
1873	if (ret != 0 && zbc.zbc_injected != 0)
1874		rm->rm_ecksuminjected = 1;
1875
1876	return (ret);
1877}
1878
1879/*
1880 * Generate the parity from the data columns. If we tried and were able to
1881 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
1884 */
1885static int
1886raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1887{
1888	void *orig[VDEV_RAIDZ_MAXPARITY];
1889	int c, ret = 0;
1890	raidz_col_t *rc;
1891
1892	blkptr_t *bp = zio->io_bp;
1893	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1894	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1895
1896	if (checksum == ZIO_CHECKSUM_NOPARITY)
1897		return (ret);
1898
1899	for (c = 0; c < rm->rm_firstdatacol; c++) {
1900		rc = &rm->rm_col[c];
1901		if (!rc->rc_tried || rc->rc_error != 0)
1902			continue;
1903		orig[c] = zio_buf_alloc(rc->rc_size);
1904		bcopy(rc->rc_data, orig[c], rc->rc_size);
1905	}
1906
1907	vdev_raidz_generate_parity(rm);
1908
1909	for (c = 0; c < rm->rm_firstdatacol; c++) {
1910		rc = &rm->rm_col[c];
1911		if (!rc->rc_tried || rc->rc_error != 0)
1912			continue;
1913		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1914			raidz_checksum_error(zio, rc, orig[c]);
1915			rc->rc_error = SET_ERROR(ECKSUM);
1916			ret++;
1917		}
1918		zio_buf_free(orig[c], rc->rc_size);
1919	}
1920
1921	return (ret);
1922}
1923
1924/*
1925 * Keep statistics on all the ways that we used parity to correct data.
1926 */
1927static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1928
1929static int
1930vdev_raidz_worst_error(raidz_map_t *rm)
1931{
1932	int error = 0;
1933
1934	for (int c = 0; c < rm->rm_cols; c++)
1935		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1936
1937	return (error);
1938}
1939
1940/*
1941 * Iterate over all combinations of bad data and attempt a reconstruction.
1942 * Note that the algorithm below is non-optimal because it doesn't take into
1943 * account how reconstruction is actually performed. For example, with
1944 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1945 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1946 * cases we'd only use parity information in column 0.
1947 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/*
	 * tgts points one element into tstore so that both tgts[-1] and
	 * tgts[n] are valid sentinel slots (see below).
	 */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/*
	 * Try assuming n additional silently-bad columns, for n from 1 up
	 * to the amount of unused parity, enumerating every combination of
	 * n error-free columns as reconstruction targets.
	 */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/*
		 * Only one new buffer is needed per outer iteration; it is
		 * reused for whichever column is currently targeted.  All
		 * buffers are sized from column 0 -- presumably no column is
		 * larger than the first parity column; verify against the
		 * map-allocation code if this is ever changed.
		 */
		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/*
		 * Odometer-style enumeration: advance the lowest target that
		 * can move to the next valid column; when a target collides
		 * with the one above it, reset it and carry upward.  The
		 * loop terminates when the carry propagates past tgts[n-1].
		 */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * The targeted columns were silently bad:
				 * post a checksum error for each one we had
				 * actually read, and mark them all in error
				 * so they get repaired below.
				 */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/*
	 * Fell out of the outer for loop: n is now one past the last value
	 * attempted, so back it off to the number of buffers actually
	 * allocated before freeing them.  (The goto-done path arrives here
	 * with n buffers allocated, so no adjustment is needed there.)
	 */
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
2095
2096/*
2097 * Complete an IO operation on a RAIDZ VDev
2098 *
2099 * Outline:
2100 * - For write operations:
2101 *   1. Check for errors on the child IOs.
2102 *   2. Return, setting an error code if too few child VDevs were written
2103 *      to reconstruct the data later.  Note that partial writes are
2104 *      considered successful if they can be reconstructed at all.
2105 * - For read operations:
2106 *   1. Check for errors on the child IOs.
2107 *   2. If data errors occurred:
2108 *      a. Try to reassemble the data from the parity available.
2109 *      b. If we haven't yet read the parity drives, read them now.
2110 *      c. If all parity drives have been read but the data still doesn't
2111 *         reassemble with a correct checksum, then try combinatorial
2112 *         reconstruction.
2113 *      d. If that doesn't work, return an error.
2114 *   3. If there were unexpected errors or this is a resilver operation,
2115 *      rewrite the vdevs that had errors.
2116 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Classify the per-column results: count errors split into parity
	 * vs. data columns, note errors we didn't expect (columns we
	 * actually tried to read), and count parity columns never read.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	} else if (zio->io_type == ZIO_TYPE_FREE) {
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		/*
		 * Issue reads for this and every subsequent untried column.
		 * Note the continue inside this do/while re-tests the
		 * ++c < rm->rm_cols condition, so it skips to the next
		 * column rather than looping forever.
		 */
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
2382
2383static void
2384vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2385{
2386	if (faulted > vd->vdev_nparity)
2387		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2388		    VDEV_AUX_NO_REPLICAS);
2389	else if (degraded + faulted != 0)
2390		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2391	else
2392		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2393}
2394
/*
 * Operations vector for RAID-Z vdevs.  Fields are positional; the two
 * NULL slots appear between state_change and the type name -- presumably
 * the hold/rele hooks unused by raidz; confirm against vdev_ops_t in
 * vdev_impl.h before relying on this.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,		/* open */
	vdev_raidz_close,		/* close */
	vdev_raidz_asize,		/* asize */
	vdev_raidz_io_start,		/* io_start */
	vdev_raidz_io_done,		/* io_done */
	vdev_raidz_state_change,	/* state_change */
	NULL,
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
2407