/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
 * can be expressed with a single byte. Briefly, the operations on the
 * field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
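 *
 * For example, A = 0x02 simply shifts to give A * 2 = 0x04, while A = 0x80
 * overflows and is reduced: A * 2 = (0x80 << 1) ^ 0x1d = 0x1d.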
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
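 * For example, 0x03 * 0x05 = 2^(25 + 50) = 2^75 = 0x0f, using the
 * vdev_raidz_log2[] table below for the two logs and vdev_raidz_pow2[] for
 * the final exponentiation.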
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}
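
/*
 * For example, consider a byte lane holding 0x80: its mask byte is 0x80, and
 * ((mask) << 1) - ((mask) >> 7) turns it into 0xff (0x100 - 0x01), so that
 * lane alone is XORed with 0x1d; the lane value 0x80 thus becomes 0x1d,
 * matching the byte-wide definition above.
 */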

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
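
/*
 * For reference, tables like the two above can be regenerated with a sketch
 * along these lines (illustrative only, not part of this file):
 *
 *	uint8_t pow2[256], log2[256] = { 0 };
 *	uint8_t p = 1;
 *	for (int i = 0; i < 255; i++) {
 *		pow2[i] = p;
 *		log2[p] = i;
 *		p = (p << 1) ^ ((p & 0x80) ? 0x1d : 0);
 *	}
 *	pow2[255] = pow2[0];	(the multiplicative group has order 255)
 */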

static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
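
/*
 * Note that vdev_raidz_exp2(a, 1) is equivalent to VDEV_RAIDZ_MUL_2(a) for
 * any a, and that a general product A * B (for B != 0) can be computed as
 * vdev_raidz_exp2(A, vdev_raidz_log2[B]).
 */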

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	if (rm->rm_datacopy != NULL)
		zio_buf_free(rm->rm_datacopy, size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT3U(rm->rm_freed, ==, 0);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = zio->io_offset >> unit_shift;
	uint64_t s = zio->io_size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));
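
	/*
	 * Worked example (illustrative): with dcols = 5, nparity = 1 and
	 * s = 5 sectors, q = 1, r = 1, bc = 2 and tot = 7; every column
	 * gets one sector, the first bc columns get an extra one, and one
	 * sector (rm_nskip, below) pads the allocation out to a multiple
	 * of nparity + 1.
	 */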

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	return (rm);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
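
/*
 * For a single byte of each column, the Q updates above amount to the Horner
 * evaluation sketched here (illustrative only, not compiled):
 *
 *	uint8_t q = 0;
 *	for (c = 0; c < ndata; c++)
 *		q = VDEV_RAIDZ_MUL_2(q) ^ data[c];
 *
 * which computes Q = 2^(n-1) * D_0 + ... + 2^0 * D_n-1 without table lookups.
 */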

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int x = tgts[0];
	int c;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	ASSERT(xcount > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	dst = rm->rm_col[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			continue;

		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}

	return (1 << VDEV_RAIDZ_P);
}

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

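	/*
	 * At this point dst holds the Q parity of all the surviving columns,
	 * i.e. Q + 2^(ndevs - 1 - x) * D_x. XORing in Q below leaves
	 * 2^(ndevs - 1 - x) * D_x, so multiplying by the inverse,
	 * 2^(255 - (ndevs - 1 - x)), recovers D_x.
	 */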
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
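	 *
	 * (To check: substitute D_y = (P + Pxy) + D_x into the expression
	 * for Q + Qxy and divide through by 2^(ndevs - 1 - x); the result
	 * is (2^(x - y) + 1) * D_x = 2^(x - y) * (P + Pxy) +
	 * 2^-(ndevs - 1 - x) * (Q + Qxy), from which A and B follow.)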
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

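		/*
		 * Each row is the Vandermonde row for the generator
		 * 2^map[i] (1, 2, or 4): rows[i][j] = (2^map[i])^(n-1-j).
		 */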
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data left in the rows forms part of an identity
	 * matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

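/*
 * Map a logical block size to its allocated size: one parity sector is added
 * for each stripe of (cols - nparity) data sectors, and the total is padded
 * to a multiple of (nparity + 1) sectors. As a worked example (illustrative):
 * with ashift = 9, cols = 5 and nparity = 1, a 3-sector block allocates
 * 3 + 1 = 4 sectors, which is already a multiple of nparity + 1 = 2.
 */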
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}

static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
		    &zbc);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
1654 */
1655static int
1656raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1657{
1658	void *orig[VDEV_RAIDZ_MAXPARITY];
1659	int c, ret = 0;
1660	raidz_col_t *rc;
1661
1662	for (c = 0; c < rm->rm_firstdatacol; c++) {
1663		rc = &rm->rm_col[c];
1664		if (!rc->rc_tried || rc->rc_error != 0)
1665			continue;
1666		orig[c] = zio_buf_alloc(rc->rc_size);
1667		bcopy(rc->rc_data, orig[c], rc->rc_size);
1668	}
1669
1670	vdev_raidz_generate_parity(rm);
1671
1672	for (c = 0; c < rm->rm_firstdatacol; c++) {
1673		rc = &rm->rm_col[c];
1674		if (!rc->rc_tried || rc->rc_error != 0)
1675			continue;
1676		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1677			raidz_checksum_error(zio, rc, orig[c]);
1678			rc->rc_error = ECKSUM;
1679			ret++;
1680		}
1681		zio_buf_free(orig[c], rc->rc_size);
1682	}
1683
1684	return (ret);
1685}
1686
1687/*
1688 * Keep statistics on all the ways that we used parity to correct data.
1689 */
1690static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1691
1692static int
1693vdev_raidz_worst_error(raidz_map_t *rm)
1694{
1695	int error = 0;
1696
1697	for (int c = 0; c < rm->rm_cols; c++)
1698		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1699
1700	return (error);
1701}
1702
1703/*
1704 * Iterate over all combinations of bad data and attempt a reconstruction.
1705 * Note that the algorithm below is non-optimal because it doesn't take into
1706 * account how reconstruction is actually performed. For example, with
1707 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1708 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1709 * cases we'd only use parity information in column 0.
1710 */
1711static int
1712vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1713{
1714	raidz_map_t *rm = zio->io_vsd;
1715	raidz_col_t *rc;
1716	void *orig[VDEV_RAIDZ_MAXPARITY];
1717	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1718	int *tgts = &tstore[1];
1719	int current, next, i, c, n;
1720	int code, ret = 0;
1721
1722	ASSERT(total_errors < rm->rm_firstdatacol);
1723
1724	/*
1725	 * This simplifies one edge condition.
1726	 */
1727	tgts[-1] = -1;
1728
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition: the
		 * comparisons against tgts[current + 1] below remain valid
		 * when current == n - 1.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;	/* compensate for the for loop's final increment */
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
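
/*
 * To illustrate the enumeration above, consider (purely as a sketch) a map
 * with rm_cols = 5 and rm_firstdatacol = 2 (i.e. raidz2), no read errors,
 * and n = 2. The initial target pair is (0, 2) -- the second target is
 * pushed up into the data columns since data_errors == 0 -- and the pairs
 * are then visited in the order
 *
 *	(0,2) (1,2) (0,3) (1,3) (2,3) (0,4) (1,4) (2,4) (3,4)
 *
 * i.e. every pair containing at least one data column, each tried at most
 * once. The pair (0,1) is never tried since reconstructing only parity
 * cannot produce new data.
 */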

static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */
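
	/*
	 * Phase 1 corresponds to the block immediately below; phase 2 to the
	 * re-read loop further down; phase 3 to the call to
	 * vdev_raidz_combrec().
	 */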

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors. For example, on a raidz3 map with one parity column
	 * untried and two errors, 2 <= 3 - 1 and the errors are correctable.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens, since no reconstruction was
				 * needed), regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction -- that is, if the returned
		 * code isn't the full mask (1 << rm->rm_firstdatacol) - 1 --
		 * verify that the remaining parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, provided the I/O wasn't speculative.
		 */
		zio->io_error = ECKSUM;

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
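
/*
 * By way of illustration (a sketch, not an exhaustive policy statement):
 * a raidz2 vdev (vdev_nparity == 2) with three faulted children cannot be
 * opened, since more children have failed than the parity can cover; with
 * two faulted children, or one faulted and one degraded child, it remains
 * usable but is marked DEGRADED.
 */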

vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
