/* vdev_raidz.c revision 260763 */
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/spa.h>
30#include <sys/vdev_impl.h>
31#ifdef illumos
32#include <sys/vdev_disk.h>
33#endif
34#include <sys/vdev_file.h>
35#include <sys/vdev_raidz.h>
36#include <sys/zio.h>
37#include <sys/zio_checksum.h>
38#include <sys/fs/zfs.h>
39#include <sys/fm/fs/zfs.h>
40#include <sys/bio.h>
41
42/*
43 * Virtual device vector for RAID-Z.
44 *
45 * This vdev supports single, double, and triple parity. For single parity,
46 * we use a simple XOR of all the data columns. For double or triple parity,
47 * we use a special case of Reed-Solomon coding. This extends the
48 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
49 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
50 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
51 * former is also based. The latter is designed to provide higher performance
52 * for writes.
53 *
54 * Note that the Plank paper claimed to support arbitrary N+M, but was then
55 * amended six years later identifying a critical flaw that invalidates its
56 * claims. Nevertheless, the technique can be adapted to work for up to
57 * triple parity. For additional parity, the amendment "Note: Correction to
58 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
59 * is viable, but the additional complexity means that write performance will
60 * suffer.
61 *
62 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
64 * can be expressed with a single byte. Briefly, the operations on the
65 * field are defined as follows:
66 *
67 *   o addition (+) is represented by a bitwise XOR
68 *   o subtraction (-) is therefore identical to addition: A + B = A - B
69 *   o multiplication of A by 2 is defined by the following bitwise expression:
70 *
71 *	(A * 2)_7 = A_6
72 *	(A * 2)_6 = A_5
73 *	(A * 2)_5 = A_4
74 *	(A * 2)_4 = A_3 + A_7
75 *	(A * 2)_3 = A_2 + A_7
76 *	(A * 2)_2 = A_1 + A_7
77 *	(A * 2)_1 = A_0
78 *	(A * 2)_0 = A_7
79 *
80 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
81 * As an aside, this multiplication is derived from the error correcting
82 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
83 *
84 * Observe that any number in the field (except for 0) can be expressed as a
85 * power of 2 -- a generator for the field. We store a table of the powers of
86 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
87 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
88 * than field addition). The inverse of a field element A (A^-1) is therefore
89 * A ^ (255 - 1) = A^254.
90 *
91 * The up-to-three parity columns, P, Q, R over several data columns,
92 * D_0, ... D_n-1, can be expressed by field operations:
93 *
94 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
95 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
96 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
97 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
98 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
99 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
101 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
102 * independent coefficients. (There are no additional coefficients that have
103 * this property which is why the uncorrected Plank method breaks down.)
104 *
 * See the reconstruction code below for how P, Q and R can be used individually
106 * or in concert to recover missing data columns.
107 */
108
/*
 * Per-column state for one child vdev participating in a RAID-Z I/O.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
119
/*
 * Geometry and state of one logical I/O mapped onto the RAID-Z vdev's
 * children.  Allocated with rm_scols trailing raidz_col_t entries; the
 * first rm_firstdatacol columns carry parity, the remainder carry data.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* copy of the data columns' contents */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
136
/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single byte (one GF(2^8) element) by 2, resp. 4: shift left
 * and, when the high bit is shifted out, reduce by the primitive
 * polynomial (0x1d).
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
143
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * The macros are wrapped in do/while (0) so that each expands to a single
 * statement and composes safely in every context (e.g. an unbraced
 * if/else body).
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
163
/*
 * Byte offset of x past the front vdev label area; the argument is
 * parenthesized so the macro binds correctly for any expression
 * (e.g. one containing a shift or comma).
 */
#define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)
165
/*
 * Force reconstruction to use the general purpose method rather than a
 * parity-specific fast path.  NOTE(review): presumably a debug/testing
 * tunable consulted by the reconstruction code below -- confirm usage.
 */
int vdev_raidz_default_to_general;
170
/*
 * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] = 2^i.
 * The generator has order 255, so the table wraps (entry 255 equals
 * entry 0).
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs base 2 in the Galois field defined above -- the inverse mapping of
 * the table above.  log of 0 is undefined; entry 0 is 0 by convention.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
241
/* Forward declaration: used by vdev_raidz_cksum_finish() before its definition. */
static void vdev_raidz_generate_parity(raidz_map_t *rm);
243
244/*
245 * Multiply a given number by 2 raised to the given power.
246 */
247static uint8_t
248vdev_raidz_exp2(uint_t a, int exp)
249{
250	if (a == 0)
251		return (0);
252
253	ASSERT(exp >= 0);
254	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
255
256	exp += vdev_raidz_log2[a];
257	if (exp > 255)
258		exp -= 255;
259
260	return (vdev_raidz_pow2[exp]);
261}
262
263static void
264vdev_raidz_map_free(raidz_map_t *rm)
265{
266	int c;
267	size_t size;
268
269	for (c = 0; c < rm->rm_firstdatacol; c++) {
270		if (rm->rm_col[c].rc_data != NULL)
271			zio_buf_free(rm->rm_col[c].rc_data,
272			    rm->rm_col[c].rc_size);
273
274		if (rm->rm_col[c].rc_gdata != NULL)
275			zio_buf_free(rm->rm_col[c].rc_gdata,
276			    rm->rm_col[c].rc_size);
277	}
278
279	size = 0;
280	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
281		size += rm->rm_col[c].rc_size;
282
283	if (rm->rm_datacopy != NULL)
284		zio_buf_free(rm->rm_datacopy, size);
285
286	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
287}
288
/*
 * zio_vsd_ops free callback: mark the map as no longer referenced by its
 * ZIO and free it -- unless outstanding checksum reports still reference
 * it, in which case vdev_raidz_cksum_free() performs the final free when
 * the last report is released.
 */
static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}
300
301/*ARGSUSED*/
302static void
303vdev_raidz_cksum_free(void *arg, size_t ignored)
304{
305	raidz_map_t *rm = arg;
306
307	ASSERT3U(rm->rm_reports, >, 0);
308
309	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
310		vdev_raidz_map_free(rm);
311}
312
/*
 * Checksum-report completion callback: compute the "good" (expected)
 * contents of column c so they can be attached to the ereport alongside
 * the bad data actually read.  For a data column the good data is simply
 * the matching slice of good_data; for a parity column we must regenerate
 * parity from good_data (done once per raidz map, cached in rc_gdata).
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;	/* column index saved at report time */
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	/* Without the good data we can only finish the report empty-handed. */
	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/* repoint the data columns at the aside copy */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}
385
/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;	/* column the report covers */
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount  */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	/* an earlier report for this map already copied the data aside */
	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	/* copy each data column into the aside buffer and repoint it there */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}
438
/*
 * vdev-specific-data callbacks handed to the ZIO layer: free the raidz
 * map when its ZIO completes, and snapshot the data for checksum ereports.
 */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
443
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * data:	the zio's buffer; data columns point into it (unless dofree)
 * size:	zio size in bytes
 * offset:	starting offset of the zio on the raidz vdev
 * dofree:	nonzero for a free: no data buffers are attached
 * unit_shift:	log2 of the vdev's minimum allocation unit
 * dcols:	number of columns to stripe across
 * nparity:	number of parity columns
 *
 * Returns a freshly allocated raidz_map_t; the caller owns it.
 */
static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/* Lay out each column: child device index, offset, and size. */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* wrapped around -- advance to the next stripe row */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* For a free there is no data to attach to the columns. */
	if (!dofree) {
		/* Parity columns get their own buffers... */
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rm->rm_col[c].rc_data =
			    zio_buf_alloc(rm->rm_col[c].rc_size);
		}

		/* ...while data columns point into the caller's buffer. */
		rm->rm_col[c].rc_data = data;

		for (c = c + 1; c < acols; c++) {
			rm->rm_col[c].rc_data =
			    (char *)rm->rm_col[c - 1].rc_data +
			    rm->rm_col[c - 1].rc_size;
		}
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
596
597static void
598vdev_raidz_generate_parity_p(raidz_map_t *rm)
599{
600	uint64_t *p, *src, pcount, ccount, i;
601	int c;
602
603	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
604
605	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
606		src = rm->rm_col[c].rc_data;
607		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
608		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
609
610		if (c == rm->rm_firstdatacol) {
611			ASSERT(ccount == pcount);
612			for (i = 0; i < ccount; i++, src++, p++) {
613				*p = *src;
614			}
615		} else {
616			ASSERT(ccount <= pcount);
617			for (i = 0; i < ccount; i++, src++, p++) {
618				*p ^= *src;
619			}
620		}
621	}
622}
623
/*
 * Compute double (P and Q) parity.  P is the plain XOR of the data
 * columns; Q is the Horner-style evaluation described in the block
 * comment at the top of the file: Q = ((D_0 * 2 + D_1) * 2 + ...) * 2
 * + D_n-1, performed 8 bytes at a time via VDEV_RAIDZ_64MUL_2.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* first column initializes both parities */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
675
/*
 * Compute triple (P, Q and R) parity.  Identical in structure to the PQ
 * case, with R evaluated with generator 4 (VDEV_RAIDZ_64MUL_4) instead
 * of 2.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* first column initializes all three parities */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
736
737/*
738 * Generate RAID parity in the first virtual columns according to the number of
739 * parity columns available.
740 */
741static void
742vdev_raidz_generate_parity(raidz_map_t *rm)
743{
744	switch (rm->rm_firstdatacol) {
745	case 1:
746		vdev_raidz_generate_parity_p(rm);
747		break;
748	case 2:
749		vdev_raidz_generate_parity_pq(rm);
750		break;
751	case 3:
752		vdev_raidz_generate_parity_pqr(rm);
753		break;
754	default:
755		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
756	}
757}
758
759static int
760vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
761{
762	uint64_t *dst, *src, xcount, ccount, count, i;
763	int x = tgts[0];
764	int c;
765
766	ASSERT(ntgts == 1);
767	ASSERT(x >= rm->rm_firstdatacol);
768	ASSERT(x < rm->rm_cols);
769
770	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
771	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
772	ASSERT(xcount > 0);
773
774	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
775	dst = rm->rm_col[x].rc_data;
776	for (i = 0; i < xcount; i++, dst++, src++) {
777		*dst = *src;
778	}
779
780	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
781		src = rm->rm_col[c].rc_data;
782		dst = rm->rm_col[x].rc_data;
783
784		if (c == x)
785			continue;
786
787		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
788		count = MIN(ccount, xcount);
789
790		for (i = 0; i < count; i++, dst++, src++) {
791			*dst ^= *src;
792		}
793	}
794
795	return (1 << VDEV_RAIDZ_P);
796}
797
/*
 * Reconstruct the single missing data column x from the Q parity: rebuild
 * Q's Horner sum as though column x were all zeros, fold in the real Q,
 * then strip the coefficient 2^(ndevs - 1 - x) by multiplying each byte
 * by its inverse (exponent 255 - k) via the exp/log tables.  Returns a
 * bitmask naming the parity used (Q).
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		/* the missing column itself contributes zeros */
		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			/* multiply the running sum by 2, then add the column */
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			/* short columns add nothing; only the multiply applies */
			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/* fold in the real Q and undo column x's coefficient byte-by-byte */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
855
/*
 * Reconstruct two missing data columns x and y (x < y) using both the P
 * and Q parity.  The partial parities Pxy/Qxy are regenerated with x and
 * y treated as zero, then the linear system below is solved byte-by-byte
 * via the exp/log tables.  Returns a bitmask naming the parities used.
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		/* y may be shorter than x; only fill its valid range */
		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
946
947/* BEGIN CSTYLED */
948/*
949 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
951 * the contents of the data and parity disks. This can be expressed with
952 * vectors for the original data (D) and the actual data (d) and parity (p)
953 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
954 *
955 *            __   __                     __     __
956 *            |     |         __     __   |  p_0  |
957 *            |  V  |         |  D_0  |   | p_m-1 |
958 *            |     |    x    |   :   | = |  d_0  |
959 *            |  I  |         | D_n-1 |   |   :   |
960 *            |     |         ~~     ~~   | d_n-1 |
961 *            ~~   ~~                     ~~     ~~
962 *
963 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
965 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
966 * computation as well as linear separability.
967 *
968 *      __               __               __     __
969 *      |   1   ..  1 1 1 |               |  p_0  |
970 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
971 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
972 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
973 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
974 *      |   :       : : : |   |   :   |   |  d_2  |
975 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
976 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
977 *      |   0   ..  0 0 1 |               | d_n-1 |
978 *      ~~               ~~               ~~     ~~
979 *
980 * Note that I, V, d, and p are known. To compute D, we must invert the
981 * matrix and use the known data and parity values to reconstruct the unknown
982 * data values. We begin by removing the rows in V|I and d|p that correspond
983 * to failed or missing columns; we then make V|I square (n x n) and d|p
984 * sized n by removing rows corresponding to unused parity from the bottom up
985 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
986 * using Gauss-Jordan elimination. In the example below we use m=3 parity
987 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
988 *           __                               __
989 *           |  1   1   1   1   1   1   1   1  |
990 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
991 *           |  19 205 116  29  64  16  4   1  |      / /
992 *           |  1   0   0   0   0   0   0   0  |     / /
993 *           |  0   1   0   0   0   0   0   0  | <--' /
994 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
995 *           |  0   0   0   1   0   0   0   0  |
996 *           |  0   0   0   0   1   0   0   0  |
997 *           |  0   0   0   0   0   1   0   0  |
998 *           |  0   0   0   0   0   0   1   0  |
999 *           |  0   0   0   0   0   0   0   1  |
1000 *           ~~                               ~~
1001 *           __                               __
1002 *           |  1   1   1   1   1   1   1   1  |
1003 *           |  19 205 116  29  64  16  4   1  |
1004 *           |  1   0   0   0   0   0   0   0  |
1005 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1006 *           |  0   0   0   0   1   0   0   0  |
1007 *           |  0   0   0   0   0   1   0   0  |
1008 *           |  0   0   0   0   0   0   1   0  |
1009 *           |  0   0   0   0   0   0   0   1  |
1010 *           ~~                               ~~
1011 *
1012 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1013 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1014 * matrix is not singular.
1015 * __                                                                 __
1016 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1017 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1018 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1019 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1020 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1021 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1022 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1023 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1024 * ~~                                                                 ~~
1025 * __                                                                 __
1026 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1027 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1028 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1029 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1030 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1031 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1032 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1033 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1034 * ~~                                                                 ~~
1035 * __                                                                 __
1036 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1037 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1038 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1039 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1040 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1041 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1042 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1043 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1044 * ~~                                                                 ~~
1045 * __                                                                 __
1046 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1047 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1048 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1049 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1050 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1051 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1052 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1053 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1054 * ~~                                                                 ~~
1055 * __                                                                 __
1056 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1057 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1058 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1059 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1060 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1061 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1062 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1063 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1064 * ~~                                                                 ~~
1065 * __                                                                 __
1066 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1067 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1068 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1069 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1070 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1071 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1072 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1073 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1074 * ~~                                                                 ~~
1075 *                   __                               __
1076 *                   |  0   0   1   0   0   0   0   0  |
1077 *                   | 167 100  5   41 159 169 217 208 |
1078 *                   | 166 100  4   40 158 168 216 209 |
1079 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1080 *                   |  0   0   0   0   1   0   0   0  |
1081 *                   |  0   0   0   0   0   1   0   0  |
1082 *                   |  0   0   0   0   0   0   1   0  |
1083 *                   |  0   0   0   0   0   0   0   1  |
1084 *                   ~~                               ~~
1085 *
1086 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1087 * of the missing data.
1088 *
1089 * As is apparent from the example above, the only non-trivial rows in the
1090 * inverse matrix correspond to the data disks that we're trying to
1091 * reconstruct. Indeed, those are the only rows we need as the others would
1092 * only be useful for reconstructing data known or assumed to be valid. For
1093 * that reason, we only build the coefficients in the rows that correspond to
1094 * targeted columns.
1095 */
1096/* END CSTYLED */
1097
/*
 * Initialize the rows of the dispersal matrix that correspond to the
 * parity columns we will use for reconstruction.
 *
 * map[i] identifies the parity column (0 = P, 1 = Q, 2 = R) for row i;
 * that row is filled with the generation coefficients for the n data
 * columns: element j is (2^map[i])^(n-1-j) in GF(2^8), looked up through
 * the vdev_raidz_pow2 exponentiation table.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;	/* running exponent, reduced into [0, 255) below */

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/*
		 * Start at exponent map[i] * n and step down by map[i]
		 * per column; exponents are taken mod 255 since the
		 * multiplicative group of GF(2^8) has order 255.
		 */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
1127
/*
 * Invert the relevant rows of (V|I)' in place using Gauss-Jordan
 * elimination, producing the matching rows of (V|I)'^-1 in invrows.
 *
 *	n	 - number of data columns in the map
 *	nmissing - number of missing data columns (= rows of interest)
 *	missing	 - indices (relative to the first data column) of the
 *		   missing data columns; these act as the pivot columns
 *	rows	 - the rows of interest of (V|I)'; destroyed in place and
 *		   reduced to identity rows on success
 *	invrows	 - out: the corresponding rows of the inverse matrix
 *	used	 - the n surviving columns, parity columns first
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 * (In GF(2^8) subtraction is XOR, and the trivial identity rows
	 * simply move a coefficient from rows[] into invrows[].)
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 * (a^-1 == a^(255 - log2(a)) in GF(2^8).)
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/*
		 * Eliminate this pivot column from every other row of
		 * interest by XORing in the appropriate multiple.
		 */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1221
/*
 * Reconstruct the missing data columns as the product of the inverse-matrix
 * rows (invrows) and the surviving columns (used). Multiplication in
 * GF(2^8) is performed in log space -- add the logs and reduce mod 255 --
 * and accumulation across columns is XOR.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/*
	 * Precompute the logs of all the inverse-matrix coefficients so
	 * the per-byte loop below only has to add exponents.
	 */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/*
	 * Walk the surviving columns one at a time, accumulating each one's
	 * contribution into every missing column.
	 */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/*
				 * Trailing columns may be shorter; skip
				 * bytes beyond the destination's size.
				 */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* Multiply: add logs mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first surviving column initializes
				 * the result; the rest are XORed in.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1297
/*
 * General-case reconstruction as described in the block comment above:
 * select one surviving parity column per missing data column, build the
 * corresponding rows of (V|I)', invert them, and multiply by the surviving
 * columns to recover the missing data.
 *
 * Returns a bitmask identifying which parity columns were used (the
 * "code"), e.g. (1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q).
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns. We need one surviving parity column per missing
	 * data column; tgts[] is sorted, so a single cursor (tt) suffices
	 * to skip the targeted (i.e. failed) parity columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Allocate one contiguous scratch buffer carved up into the
	 * rows[], invrows[], and used[] arrays.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/*
	 * used[] lists the surviving columns: the chosen parity columns
	 * first, followed by all non-missing data columns.
	 */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1403
/*
 * Reconstruct the columns named in t[] (plus any columns that recorded
 * errors) using the surviving parity. The optimized P, Q, and PQ routines
 * are used when applicable; otherwise we fall back to the general
 * matrix-based method.
 *
 * Returns a bitmask of the parity columns used for the reconstruction.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/*
	 * Classify every column: targeted or errored columns become
	 * reconstruction targets; the rest count as good data or good
	 * parity. Start by assuming all parity and data columns are bad
	 * and subtract as good ones are found.
	 */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/*
	 * tgts[] is sorted with parity columns first, so the data targets
	 * start right after the nbadparity bad parity columns.
	 */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1482
1483static int
1484vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1485    uint64_t *logical_ashift, uint64_t *physical_ashift)
1486{
1487	vdev_t *cvd;
1488	uint64_t nparity = vd->vdev_nparity;
1489	int c;
1490	int lasterror = 0;
1491	int numerrors = 0;
1492
1493	ASSERT(nparity > 0);
1494
1495	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1496	    vd->vdev_children < nparity + 1) {
1497		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1498		return (SET_ERROR(EINVAL));
1499	}
1500
1501	vdev_open_children(vd);
1502
1503	for (c = 0; c < vd->vdev_children; c++) {
1504		cvd = vd->vdev_child[c];
1505
1506		if (cvd->vdev_open_error != 0) {
1507			lasterror = cvd->vdev_open_error;
1508			numerrors++;
1509			continue;
1510		}
1511
1512		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1513		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1514		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1515		*physical_ashift = MAX(*physical_ashift,
1516		    cvd->vdev_physical_ashift);
1517	}
1518
1519	*asize *= vd->vdev_children;
1520	*max_asize *= vd->vdev_children;
1521
1522	if (numerrors > nparity) {
1523		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1524		return (lasterror);
1525	}
1526
1527	return (0);
1528}
1529
1530static void
1531vdev_raidz_close(vdev_t *vd)
1532{
1533	int c;
1534
1535	for (c = 0; c < vd->vdev_children; c++)
1536		vdev_close(vd->vdev_child[c]);
1537}
1538
1539#ifdef illumos
1540/*
1541 * Handle a read or write I/O to a RAID-Z dump device.
1542 *
1543 * The dump device is in a unique situation compared to other ZFS datasets:
1544 * writing to this device should be as simple and fast as possible.  In
1545 * addition, durability matters much less since the dump will be extracted
1546 * once the machine reboots.  For that reason, this function eschews parity for
1547 * performance and simplicity.  The dump device uses the checksum setting
1548 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1549 * dataset.
1550 *
1551 * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1552 * 128 KB will not fill an entire block; in addition, they may not be properly
1553 * aligned.  In that case, this function uses the preallocated 128 KB block and
1554 * omits reading or writing any "empty" portions of that block, as opposed to
1555 * allocating a fresh appropriately-sized block.
1556 *
1557 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1558 *
1559 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1560 *
1561 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1562 * allocated which spans all five child vdevs.  8 KB of data would be written to
1563 * each of four vdevs, with the fifth containing the parity bits.
1564 *
1565 *       parity    data     data     data     data
1566 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1567 *         ^        ^        ^        ^        ^
1568 *         |        |        |        |        |
1569 *   8 KB parity    ------8 KB data blocks------
1570 *
1571 * However, when writing to the dump device, the behavior is different:
1572 *
1573 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1574 *
1575 * Unlike the normal RAID-Z case in which the block is allocated based on the
1576 * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1577 * I/O size is less than 128 KB, only the actual portions of data are written.
1578 * In this example the data is written to the third data vdev since that vdev
1579 * contains the offset [64 KB, 96 KB).
1580 *
1581 *       parity    data     data     data     data
1582 *     |        |        |        |   XX   |        |
1583 *                                    ^
1584 *                                    |
1585 *                             32 KB data block
1586 *
1587 * As a result, an individual I/O may not span all child vdevs; moreover, a
1588 * small I/O may only operate on a single child vdev.
1589 *
1590 * Note that since there are no parity bits calculated or written, this format
1591 * remains the same no matter how many parity bits are used in a normal RAID-Z
1592 * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1593 * would look like:
1594 *
1595 *       parity   parity   parity    data     data     data     data
1596 *     |        |        |        |        |        |   XX   |        |
1597 *                                                      ^
1598 *                                                      |
1599 *                                               32 KB data block
1600 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? BIO_READ : BIO_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block.  Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
	    SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	coloffset = origoffset;

	/*
	 * Note that rc is assigned at the top of the loop body, so the
	 * rc->rc_size in the increment expression always refers to the
	 * column of the just-completed iteration.
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)rc->rc_data) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
#endif	/* _KERNEL */

	return (err);
}
1685#endif
1686
1687static uint64_t
1688vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1689{
1690	uint64_t asize;
1691	uint64_t ashift = vd->vdev_top->vdev_ashift;
1692	uint64_t cols = vd->vdev_children;
1693	uint64_t nparity = vd->vdev_nparity;
1694
1695	asize = ((psize - 1) >> ashift) + 1;
1696	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1697	asize = roundup(asize, nparity + 1) << ashift;
1698
1699	return (asize);
1700}
1701
1702static void
1703vdev_raidz_child_done(zio_t *zio)
1704{
1705	raidz_col_t *rc = zio->io_private;
1706
1707	rc->rc_error = zio->io_error;
1708	rc->rc_tried = 1;
1709	rc->rc_skipped = 0;
1710}
1711
1712/*
1713 * Start an IO operation on a RAIDZ VDev
1714 *
1715 * Outline:
1716 * - For write operations:
1717 *   1. Generate the parity data
1718 *   2. Create child zio write operations to each column's vdev, for both
1719 *      data and parity.
1720 *   3. If the column skips any sectors for padding, create optional dummy
1721 *      write zio children for those areas to improve aggregation continuity.
1722 * - For read operations:
1723 *   1. Create child zio read operations to each data column's vdev to read
1724 *      the range of data required for zio.
1725 *   2. If this is a scrub or resilver operation, or if any of the data
1726 *      vdevs have had errors, then create zio read operations to the parity
1727 *      columns' VDevs as well.
1728 */
static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/*
	 * Map the logical I/O range onto the columns (parity and data) of
	 * the RAID-Z group and stash the map on the zio for later stages.
	 */
	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
	    zio->io_type == ZIO_TYPE_FREE,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	/*
	 * A FREE is simply fanned out to every column; there is no parity
	 * to generate or verify.
	 */
	if (zio->io_type == ZIO_TYPE_FREE) {
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		/* Issue one child write per column, parity included. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.  The skipped sectors start at
		 * column rm_skipstart and wrap around to column 0; each is
		 * a single optional, data-less sector write.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * A child missing this txg's data is treated as stale
		 * rather than absent; it may still be retried later.
		 */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Read all data columns; read parity columns only when data
		 * is already known missing or for scrub/resilver, which
		 * verify everything.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
1832
1833
1834/*
1835 * Report a checksum error for a child of a RAID-Z device.
1836 */
1837static void
1838raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1839{
1840	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1841
1842	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1843		zio_bad_cksum_t zbc;
1844		raidz_map_t *rm = zio->io_vsd;
1845
1846		mutex_enter(&vd->vdev_stat_lock);
1847		vd->vdev_stat.vs_checksum_errors++;
1848		mutex_exit(&vd->vdev_stat_lock);
1849
1850		zbc.zbc_has_cksum = 0;
1851		zbc.zbc_injected = rm->rm_ecksuminjected;
1852
1853		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1854		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1855		    &zbc);
1856	}
1857}
1858
1859/*
1860 * We keep track of whether or not there were any injected errors, so that
1861 * any ereports we generate can note it.
1862 */
1863static int
1864raidz_checksum_verify(zio_t *zio)
1865{
1866	zio_bad_cksum_t zbc;
1867	raidz_map_t *rm = zio->io_vsd;
1868
1869	int ret = zio_checksum_error(zio, &zbc);
1870	if (ret != 0 && zbc.zbc_injected != 0)
1871		rm->rm_ecksuminjected = 1;
1872
1873	return (ret);
1874}
1875
1876/*
1877 * Generate the parity from the data columns. If we tried and were able to
1878 * read the parity without error, verify that the generated parity matches the
1879 * data we read. If it doesn't, we fire off a checksum error. Return the
1880 * number such failures.
1881 */
1882static int
1883raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1884{
1885	void *orig[VDEV_RAIDZ_MAXPARITY];
1886	int c, ret = 0;
1887	raidz_col_t *rc;
1888
1889	blkptr_t *bp = zio->io_bp;
1890	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1891	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1892
1893	if (checksum == ZIO_CHECKSUM_NOPARITY)
1894		return (ret);
1895
1896	for (c = 0; c < rm->rm_firstdatacol; c++) {
1897		rc = &rm->rm_col[c];
1898		if (!rc->rc_tried || rc->rc_error != 0)
1899			continue;
1900		orig[c] = zio_buf_alloc(rc->rc_size);
1901		bcopy(rc->rc_data, orig[c], rc->rc_size);
1902	}
1903
1904	vdev_raidz_generate_parity(rm);
1905
1906	for (c = 0; c < rm->rm_firstdatacol; c++) {
1907		rc = &rm->rm_col[c];
1908		if (!rc->rc_tried || rc->rc_error != 0)
1909			continue;
1910		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1911			raidz_checksum_error(zio, rc, orig[c]);
1912			rc->rc_error = SET_ERROR(ECKSUM);
1913			ret++;
1914		}
1915		zio_buf_free(orig[c], rc->rc_size);
1916	}
1917
1918	return (ret);
1919}
1920
1921/*
1922 * Keep statistics on all the ways that we used parity to correct data.
1923 */
1924static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1925
1926static int
1927vdev_raidz_worst_error(raidz_map_t *rm)
1928{
1929	int error = 0;
1930
1931	for (int c = 0; c < rm->rm_cols; c++)
1932		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1933
1934	return (error);
1935}
1936
1937/*
1938 * Iterate over all combinations of bad data and attempt a reconstruction.
1939 * Note that the algorithm below is non-optimal because it doesn't take into
1940 * account how reconstruction is actually performed. For example, with
1941 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1942 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1943 * cases we'd only use parity information in column 0.
1944 */
1945static int
1946vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1947{
1948	raidz_map_t *rm = zio->io_vsd;
1949	raidz_col_t *rc;
1950	void *orig[VDEV_RAIDZ_MAXPARITY];
1951	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1952	int *tgts = &tstore[1];
1953	int current, next, i, c, n;
1954	int code, ret = 0;
1955
1956	ASSERT(total_errors < rm->rm_firstdatacol);
1957
1958	/*
1959	 * This simplifies one edge condition.
1960	 */
1961	tgts[-1] = -1;
1962
1963	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1964		/*
1965		 * Initialize the targets array by finding the first n columns
1966		 * that contain no error.
1967		 *
1968		 * If there were no data errors, we need to ensure that we're
1969		 * always explicitly attempting to reconstruct at least one
1970		 * data column. To do this, we simply push the highest target
1971		 * up into the data columns.
1972		 */
1973		for (c = 0, i = 0; i < n; i++) {
1974			if (i == n - 1 && data_errors == 0 &&
1975			    c < rm->rm_firstdatacol) {
1976				c = rm->rm_firstdatacol;
1977			}
1978
1979			while (rm->rm_col[c].rc_error != 0) {
1980				c++;
1981				ASSERT3S(c, <, rm->rm_cols);
1982			}
1983
1984			tgts[i] = c++;
1985		}
1986
1987		/*
1988		 * Setting tgts[n] simplifies the other edge condition.
1989		 */
1990		tgts[n] = rm->rm_cols;
1991
1992		/*
1993		 * These buffers were allocated in previous iterations.
1994		 */
1995		for (i = 0; i < n - 1; i++) {
1996			ASSERT(orig[i] != NULL);
1997		}
1998
1999		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2000
2001		current = 0;
2002		next = tgts[current];
2003
2004		while (current != n) {
2005			tgts[current] = next;
2006			current = 0;
2007
2008			/*
2009			 * Save off the original data that we're going to
2010			 * attempt to reconstruct.
2011			 */
2012			for (i = 0; i < n; i++) {
2013				ASSERT(orig[i] != NULL);
2014				c = tgts[i];
2015				ASSERT3S(c, >=, 0);
2016				ASSERT3S(c, <, rm->rm_cols);
2017				rc = &rm->rm_col[c];
2018				bcopy(rc->rc_data, orig[i], rc->rc_size);
2019			}
2020
2021			/*
2022			 * Attempt a reconstruction and exit the outer loop on
2023			 * success.
2024			 */
2025			code = vdev_raidz_reconstruct(rm, tgts, n);
2026			if (raidz_checksum_verify(zio) == 0) {
2027				atomic_inc_64(&raidz_corrected[code]);
2028
2029				for (i = 0; i < n; i++) {
2030					c = tgts[i];
2031					rc = &rm->rm_col[c];
2032					ASSERT(rc->rc_error == 0);
2033					if (rc->rc_tried)
2034						raidz_checksum_error(zio, rc,
2035						    orig[i]);
2036					rc->rc_error = SET_ERROR(ECKSUM);
2037				}
2038
2039				ret = code;
2040				goto done;
2041			}
2042
2043			/*
2044			 * Restore the original data.
2045			 */
2046			for (i = 0; i < n; i++) {
2047				c = tgts[i];
2048				rc = &rm->rm_col[c];
2049				bcopy(orig[i], rc->rc_data, rc->rc_size);
2050			}
2051
2052			do {
2053				/*
2054				 * Find the next valid column after the current
2055				 * position..
2056				 */
2057				for (next = tgts[current] + 1;
2058				    next < rm->rm_cols &&
2059				    rm->rm_col[next].rc_error != 0; next++)
2060					continue;
2061
2062				ASSERT(next <= tgts[current + 1]);
2063
2064				/*
2065				 * If that spot is available, we're done here.
2066				 */
2067				if (next != tgts[current + 1])
2068					break;
2069
2070				/*
2071				 * Otherwise, find the next valid column after
2072				 * the previous position.
2073				 */
2074				for (c = tgts[current - 1] + 1;
2075				    rm->rm_col[c].rc_error != 0; c++)
2076					continue;
2077
2078				tgts[current] = c;
2079				current++;
2080
2081			} while (current != n);
2082		}
2083	}
2084	n--;
2085done:
2086	for (i = 0; i < n; i++) {
2087		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2088	}
2089
2090	return (ret);
2091}
2092
2093/*
2094 * Complete an IO operation on a RAIDZ VDev
2095 *
2096 * Outline:
2097 * - For write operations:
2098 *   1. Check for errors on the child IOs.
2099 *   2. Return, setting an error code if too few child VDevs were written
2100 *      to reconstruct the data later.  Note that partial writes are
2101 *      considered successful if they can be reconstructed at all.
2102 * - For read operations:
2103 *   1. Check for errors on the child IOs.
2104 *   2. If data errors occurred:
2105 *      a. Try to reassemble the data from the parity available.
2106 *      b. If we haven't yet read the parity drives, read them now.
2107 *      c. If all parity drives have been read but the data still doesn't
2108 *         reassemble with a correct checksum, then try combinatorial
2109 *         reconstruction.
2110 *      d. If that doesn't work, return an error.
2111 *   3. If there were unexpected errors or this is a resilver operation,
2112 *      rewrite the vdevs that had errors.
2113 */
2114static void
2115vdev_raidz_io_done(zio_t *zio)
2116{
2117	vdev_t *vd = zio->io_vd;
2118	vdev_t *cvd;
2119	raidz_map_t *rm = zio->io_vsd;
2120	raidz_col_t *rc;
2121	int unexpected_errors = 0;
2122	int parity_errors = 0;
2123	int parity_untried = 0;
2124	int data_errors = 0;
2125	int total_errors = 0;
2126	int n, c;
2127	int tgts[VDEV_RAIDZ_MAXPARITY];
2128	int code;
2129
2130	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2131
2132	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2133	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2134
2135	for (c = 0; c < rm->rm_cols; c++) {
2136		rc = &rm->rm_col[c];
2137
2138		if (rc->rc_error) {
2139			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2140
2141			if (c < rm->rm_firstdatacol)
2142				parity_errors++;
2143			else
2144				data_errors++;
2145
2146			if (!rc->rc_skipped)
2147				unexpected_errors++;
2148
2149			total_errors++;
2150		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2151			parity_untried++;
2152		}
2153	}
2154
2155	if (zio->io_type == ZIO_TYPE_WRITE) {
2156		/*
2157		 * XXX -- for now, treat partial writes as a success.
2158		 * (If we couldn't write enough columns to reconstruct
2159		 * the data, the I/O failed.  Otherwise, good enough.)
2160		 *
2161		 * Now that we support write reallocation, it would be better
2162		 * to treat partial failure as real failure unless there are
2163		 * no non-degraded top-level vdevs left, and not update DTLs
2164		 * if we intend to reallocate.
2165		 */
2166		/* XXPOLICY */
2167		if (total_errors > rm->rm_firstdatacol)
2168			zio->io_error = vdev_raidz_worst_error(rm);
2169
2170		return;
2171	} else if (zio->io_type == ZIO_TYPE_FREE) {
2172		return;
2173	}
2174
2175	ASSERT(zio->io_type == ZIO_TYPE_READ);
2176	/*
2177	 * There are three potential phases for a read:
2178	 *	1. produce valid data from the columns read
2179	 *	2. read all disks and try again
2180	 *	3. perform combinatorial reconstruction
2181	 *
2182	 * Each phase is progressively both more expensive and less likely to
2183	 * occur. If we encounter more errors than we can repair or all phases
2184	 * fail, we have no choice but to return an error.
2185	 */
2186
2187	/*
2188	 * If the number of errors we saw was correctable -- less than or equal
2189	 * to the number of parity disks read -- attempt to produce data that
2190	 * has a valid checksum. Naturally, this case applies in the absence of
2191	 * any errors.
2192	 */
2193	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2194		if (data_errors == 0) {
2195			if (raidz_checksum_verify(zio) == 0) {
2196				/*
2197				 * If we read parity information (unnecessarily
2198				 * as it happens since no reconstruction was
2199				 * needed) regenerate and verify the parity.
2200				 * We also regenerate parity when resilvering
2201				 * so we can write it out to the failed device
2202				 * later.
2203				 */
2204				if (parity_errors + parity_untried <
2205				    rm->rm_firstdatacol ||
2206				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2207					n = raidz_parity_verify(zio, rm);
2208					unexpected_errors += n;
2209					ASSERT(parity_errors + n <=
2210					    rm->rm_firstdatacol);
2211				}
2212				goto done;
2213			}
2214		} else {
2215			/*
2216			 * We either attempt to read all the parity columns or
2217			 * none of them. If we didn't try to read parity, we
2218			 * wouldn't be here in the correctable case. There must
2219			 * also have been fewer parity errors than parity
2220			 * columns or, again, we wouldn't be in this code path.
2221			 */
2222			ASSERT(parity_untried == 0);
2223			ASSERT(parity_errors < rm->rm_firstdatacol);
2224
2225			/*
2226			 * Identify the data columns that reported an error.
2227			 */
2228			n = 0;
2229			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2230				rc = &rm->rm_col[c];
2231				if (rc->rc_error != 0) {
2232					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2233					tgts[n++] = c;
2234				}
2235			}
2236
2237			ASSERT(rm->rm_firstdatacol >= n);
2238
2239			code = vdev_raidz_reconstruct(rm, tgts, n);
2240
2241			if (raidz_checksum_verify(zio) == 0) {
2242				atomic_inc_64(&raidz_corrected[code]);
2243
2244				/*
2245				 * If we read more parity disks than were used
2246				 * for reconstruction, confirm that the other
2247				 * parity disks produced correct data. This
2248				 * routine is suboptimal in that it regenerates
2249				 * the parity that we already used in addition
2250				 * to the parity that we're attempting to
2251				 * verify, but this should be a relatively
2252				 * uncommon case, and can be optimized if it
2253				 * becomes a problem. Note that we regenerate
2254				 * parity when resilvering so we can write it
2255				 * out to failed devices later.
2256				 */
2257				if (parity_errors < rm->rm_firstdatacol - n ||
2258				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2259					n = raidz_parity_verify(zio, rm);
2260					unexpected_errors += n;
2261					ASSERT(parity_errors + n <=
2262					    rm->rm_firstdatacol);
2263				}
2264
2265				goto done;
2266			}
2267		}
2268	}
2269
2270	/*
2271	 * This isn't a typical situation -- either we got a read error or
2272	 * a child silently returned bad data. Read every block so we can
2273	 * try again with as much data and parity as we can track down. If
2274	 * we've already been through once before, all children will be marked
2275	 * as tried so we'll proceed to combinatorial reconstruction.
2276	 */
2277	unexpected_errors = 1;
2278	rm->rm_missingdata = 0;
2279	rm->rm_missingparity = 0;
2280
2281	for (c = 0; c < rm->rm_cols; c++) {
2282		if (rm->rm_col[c].rc_tried)
2283			continue;
2284
2285		zio_vdev_io_redone(zio);
2286		do {
2287			rc = &rm->rm_col[c];
2288			if (rc->rc_tried)
2289				continue;
2290			zio_nowait(zio_vdev_child_io(zio, NULL,
2291			    vd->vdev_child[rc->rc_devidx],
2292			    rc->rc_offset, rc->rc_data, rc->rc_size,
2293			    zio->io_type, zio->io_priority, 0,
2294			    vdev_raidz_child_done, rc));
2295		} while (++c < rm->rm_cols);
2296
2297		return;
2298	}
2299
2300	/*
2301	 * At this point we've attempted to reconstruct the data given the
2302	 * errors we detected, and we've attempted to read all columns. There
2303	 * must, therefore, be one or more additional problems -- silent errors
2304	 * resulting in invalid data rather than explicit I/O errors resulting
2305	 * in absent data. We check if there is enough additional data to
2306	 * possibly reconstruct the data and then perform combinatorial
2307	 * reconstruction over all possible combinations. If that fails,
2308	 * we're cooked.
2309	 */
2310	if (total_errors > rm->rm_firstdatacol) {
2311		zio->io_error = vdev_raidz_worst_error(rm);
2312
2313	} else if (total_errors < rm->rm_firstdatacol &&
2314	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2315		/*
2316		 * If we didn't use all the available parity for the
2317		 * combinatorial reconstruction, verify that the remaining
2318		 * parity is correct.
2319		 */
2320		if (code != (1 << rm->rm_firstdatacol) - 1)
2321			(void) raidz_parity_verify(zio, rm);
2322	} else {
2323		/*
2324		 * We're here because either:
2325		 *
2326		 *	total_errors == rm_first_datacol, or
2327		 *	vdev_raidz_combrec() failed
2328		 *
2329		 * In either case, there is enough bad data to prevent
2330		 * reconstruction.
2331		 *
2332		 * Start checksum ereports for all children which haven't
2333		 * failed, and the IO wasn't speculative.
2334		 */
2335		zio->io_error = SET_ERROR(ECKSUM);
2336
2337		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2338			for (c = 0; c < rm->rm_cols; c++) {
2339				rc = &rm->rm_col[c];
2340				if (rc->rc_error == 0) {
2341					zio_bad_cksum_t zbc;
2342					zbc.zbc_has_cksum = 0;
2343					zbc.zbc_injected =
2344					    rm->rm_ecksuminjected;
2345
2346					zfs_ereport_start_checksum(
2347					    zio->io_spa,
2348					    vd->vdev_child[rc->rc_devidx],
2349					    zio, rc->rc_offset, rc->rc_size,
2350					    (void *)(uintptr_t)c, &zbc);
2351				}
2352			}
2353		}
2354	}
2355
2356done:
2357	zio_checksum_verified(zio);
2358
2359	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2360	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2361		/*
2362		 * Use the good data we have in hand to repair damaged children.
2363		 */
2364		for (c = 0; c < rm->rm_cols; c++) {
2365			rc = &rm->rm_col[c];
2366			cvd = vd->vdev_child[rc->rc_devidx];
2367
2368			if (rc->rc_error == 0)
2369				continue;
2370
2371			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2372			    rc->rc_offset, rc->rc_data, rc->rc_size,
2373			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2374			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2375			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2376		}
2377	}
2378}
2379
2380static void
2381vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2382{
2383	if (faulted > vd->vdev_nparity)
2384		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2385		    VDEV_AUX_NO_REPLICAS);
2386	else if (degraded + faulted != 0)
2387		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2388	else
2389		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2390}
2391
/*
 * Operations vector exported for the RAID-Z vdev type. Entries are
 * positional; the two NULLs are presumably the hold/rele callbacks,
 * which RAID-Z does not need — confirm against the vdev_ops_t
 * declaration in vdev_impl.h.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
2404