1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/cdefs.h>
27#include <lz4.h>
28
/* CRC-64 lookup table for the ZAP hash; filled lazily by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

#ifndef ASSERT3S	/* Proxy for all the assert defines */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)
#endif

/*
 * Minimal panic for the boot environment: print the message and spin
 * forever, since there is no kernel to hand control back to.
 */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)
43
44static void
45zfs_init_crc(void)
46{
47	int i, j;
48	uint64_t *ct;
49
50	/*
51	 * Calculate the crc64 table (used for the zap hash
52	 * function).
53	 */
54	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
55		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
56		for (i = 0; i < 256; i++)
57			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
58				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
59	}
60}
61
/*
 * "off" checksum: unconditionally store an all-zero checksum in *zcp.
 * The data buffer and context template are ignored.
 */
static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
68
69/*
70 * Signature for checksum functions.
71 */
72typedef void zio_checksum_t(const void *data, uint64_t size,
73    const void *ctx_template, zio_cksum_t *zcp);
74typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
75typedef void zio_checksum_tmpl_free_t(void *ctx_template);
76
77typedef enum zio_checksum_flags {
78	/* Strong enough for metadata? */
79	ZCHECKSUM_FLAG_METADATA = (1 << 1),
80	/* ZIO embedded checksum */
81	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
82	/* Strong enough for dedup (without verification)? */
83	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
84	/* Uses salt value */
85	ZCHECKSUM_FLAG_SALTED = (1 << 4),
86	/* Strong enough for nopwrite? */
87	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
88} zio_checksum_flags_t;
89
90/*
91 * Information about each checksum function.
92 */
93typedef struct zio_checksum_info {
94	/* checksum function for each byteorder */
95	zio_checksum_t			*ci_func[2];
96	zio_checksum_tmpl_init_t	*ci_tmpl_init;
97	zio_checksum_tmpl_free_t	*ci_tmpl_free;
98	zio_checksum_flags_t		ci_flags;
99	const char			*ci_name;	/* descriptive name */
100} zio_checksum_info_t;
101
102#include "blkptr.c"
103
104#include "fletcher.c"
105#include "blake3_zfs.c"
106#include "sha256.c"
107#include "skein_zfs.c"
108
109#ifdef HAS_ZSTD_ZFS
110extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
111    size_t d_len, int n);
112#endif
113
/*
 * Checksum vectors, indexed by the on-disk ZIO_CHECKSUM_* value.  The
 * order must match that enum.  Entries whose ci_func slots are NULL
 * ("inherit", "on", "edonr") are rejected by zio_checksum_verify().
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	/* no edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
	{{zio_checksum_blake3_native,	zio_checksum_blake3_byteswap},
	    zio_checksum_blake3_tmpl_init, zio_checksum_blake3_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"}
};
150
151/*
152 * Common signature for all zio compress/decompress functions.
153 */
154typedef size_t zio_compress_func_t(void *src, void *dst,
155    size_t s_len, size_t d_len, int);
156typedef int zio_decompress_func_t(void *src, void *dst,
157    size_t s_len, size_t d_len, int);
158
159/*
160 * Information about each compression function.
161 */
162typedef struct zio_compress_info {
163	zio_compress_func_t	*ci_compress;	/* compression function */
164	zio_decompress_func_t	*ci_decompress;	/* decompression function */
165	int			ci_level;	/* level parameter */
166	const char		*ci_name;	/* algorithm name */
167} zio_compress_info_t;
168
169#include "lzjb.c"
170#include "zle.c"
171#include "gzip.c"
172
173/*
174 * Compression vectors.
175 */
176static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
177	{NULL,			NULL,			0,	"inherit"},
178	{NULL,			NULL,			0,	"on"},
179	{NULL,			NULL,			0,	"uncompressed"},
180	{NULL,			lzjb_decompress,	0,	"lzjb"},
181	{NULL,			NULL,			0,	"empty"},
182	{NULL,			gzip_decompress,	1,	"gzip-1"},
183	{NULL,			gzip_decompress,	2,	"gzip-2"},
184	{NULL,			gzip_decompress,	3,	"gzip-3"},
185	{NULL,			gzip_decompress,	4,	"gzip-4"},
186	{NULL,			gzip_decompress,	5,	"gzip-5"},
187	{NULL,			gzip_decompress,	6,	"gzip-6"},
188	{NULL,			gzip_decompress,	7,	"gzip-7"},
189	{NULL,			gzip_decompress,	8,	"gzip-8"},
190	{NULL,			gzip_decompress,	9,	"gzip-9"},
191	{NULL,			zle_decompress,		64,	"zle"},
192	{NULL,			lz4_decompress,		0,	"lz4"},
193#ifdef HAS_ZSTD_ZFS
194	{NULL,			zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
195#endif
196};
197
/*
 * Byte-swap an array of 64-bit words in place.  `size` is in bytes and
 * must be a multiple of 8.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;	/* size_t: avoid signed/unsigned compare and truncation */

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
210
211/*
212 * Set the external verifier for a gang block based on <vdev, offset, txg>,
213 * a tuple which is guaranteed to be unique for the life of the pool.
214 */
215static void
216zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
217{
218	const dva_t *dva = BP_IDENTITY(bp);
219	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
220
221	ASSERT(BP_IS_GANG(bp));
222
223	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
224}
225
226/*
227 * Set the external verifier for a label block based on its offset.
228 * The vdev is implicit, and the txg is unknowable at pool open time --
229 * hence the logic in vdev_uberblock_load() to find the most recent copy.
230 */
231static void
232zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
233{
234	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
235}
236
237/*
238 * Calls the template init function of a checksum which supports context
239 * templates and installs the template into the spa_t.
240 */
241static void
242zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
243{
244	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
245
246	if (ci->ci_tmpl_init == NULL)
247		return;
248
249	if (spa->spa_cksum_tmpls[checksum] != NULL)
250		return;
251
252	if (spa->spa_cksum_tmpls[checksum] == NULL) {
253		spa->spa_cksum_tmpls[checksum] =
254		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
255	}
256}
257
258/*
259 * Called by a spa_t that's about to be deallocated. This steps through
260 * all of the checksum context templates and deallocates any that were
261 * initialized using the algorithm-specific template init function.
262 */
263static void __unused
264zio_checksum_templates_free(spa_t *spa)
265{
266	for (enum zio_checksum checksum = 0;
267	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
268		if (spa->spa_cksum_tmpls[checksum] != NULL) {
269			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
270
271			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
272			spa->spa_cksum_tmpls[checksum] = NULL;
273		}
274	}
275}
276
/*
 * Verify the checksum stored in (or alongside) bp against the checksum
 * of the data buffer.  Returns 0 on match, EINVAL for an unknown or
 * unsupported checksum algorithm, and EIO on mismatch.
 */
static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		/* Salted algorithms may need a per-pool context template. */
		zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		/* The embedded checksum trailer sits at the end of the buffer. */
		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		/* Detect the writer's endianness from the trailer magic. */
		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/*
		 * Temporarily substitute the external verifier for the
		 * stored checksum while hashing, then restore the stored
		 * value afterwards.  Order matters here.
		 */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
		return (EIO);
	}

	return (0);
}
343
344static int
345zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
346	void *dest, uint64_t destsize)
347{
348	zio_compress_info_t *ci;
349
350	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
351		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
352		return (EIO);
353	}
354
355	ci = &zio_compress_table[cpfunc];
356	if (!ci->ci_decompress) {
357		printf("ZFS: unsupported compression algorithm %s\n",
358		    ci->ci_name);
359		return (EIO);
360	}
361
362	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
363}
364
365static uint64_t
366zap_hash(uint64_t salt, const char *name)
367{
368	const uint8_t *cp;
369	uint8_t c;
370	uint64_t crc = salt;
371
372	ASSERT(crc != 0);
373	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
374	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
375		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
376
377	/*
378	 * Only use 28 bits, since we need 4 bits in the cookie for the
379	 * collision differentiator.  We MUST use the high bits, since
380	 * those are the onces that we first pay attention to when
381	 * chosing the bucket.
382	 */
383	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
384
385	return (crc);
386}
387
/*
 * Per-column state for a single RAID-Z I/O.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Geometry of a RAID-Z I/O: rm_firstdatacol parity columns come first
 * in rm_col[], followed by the data columns, rm_cols in total.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
413
/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single byte by 2 (resp. 4) in GF(2^8): shift left and,
 * if the top bit was set, XOR in the reduction polynomial 0x1d.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
440
441/*
442 * These two tables represent powers and logs of 2 in the Galois field defined
443 * above. These values were computed by repeatedly multiplying by 2 as above.
444 */
445static const uint8_t vdev_raidz_pow2[256] = {
446	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
447	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
448	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
449	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
450	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
451	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
452	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
453	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
454	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
455	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
456	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
457	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
458	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
459	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
460	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
461	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
462	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
463	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
464	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
465	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
466	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
467	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
468	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
469	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
470	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
471	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
472	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
473	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
474	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
475	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
476	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
477	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
478};
479static const uint8_t vdev_raidz_log2[256] = {
480	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
481	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
482	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
483	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
484	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
485	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
486	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
487	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
488	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
489	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
490	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
491	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
492	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
493	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
494	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
495	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
496	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
497	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
498	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
499	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
500	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
501	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
502	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
503	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
504	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
505	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
506	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
507	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
508	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
509	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
510	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
511	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
512};
513
514/*
515 * Multiply a given number by 2 raised to the given power.
516 */
517static uint8_t
518vdev_raidz_exp2(uint8_t a, int exp)
519{
520	if (a == 0)
521		return (0);
522
523	ASSERT(exp >= 0);
524	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
525
526	exp += vdev_raidz_log2[a];
527	if (exp > 255)
528		exp -= 255;
529
530	return (vdev_raidz_pow2[exp]);
531}
532
/*
 * Compute the P (plain XOR) parity column over all data columns.
 */
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, ccount, i;
	uint64_t pcount __unused;	/* only referenced by the ASSERTs */
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First data column: initialize P with a copy. */
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			/* Remaining columns: accumulate with XOR. */
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}
560
/*
 * Compute the P (XOR) and Q (GF multiply-accumulate by 2) parity
 * columns over all data columns.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds both parities with a copy. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			/* Zero-fill any remainder if the column is short. */
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
612
/*
 * Compute the P (XOR), Q (multiply-accumulate by 2) and R
 * (multiply-accumulate by 4) parity columns over all data columns.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds all three parities with a copy. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			/* Zero-fill any remainder if the column is short. */
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
673
674/*
675 * Generate RAID parity in the first virtual columns according to the number of
676 * parity columns available.
677 */
678static void
679vdev_raidz_generate_parity(raidz_map_t *rm)
680{
681	switch (rm->rm_firstdatacol) {
682	case 1:
683		vdev_raidz_generate_parity_p(rm);
684		break;
685	case 2:
686		vdev_raidz_generate_parity_pq(rm);
687		break;
688	case 3:
689		vdev_raidz_generate_parity_pqr(rm);
690		break;
691	default:
692		panic("invalid RAID-Z configuration");
693	}
694}
695
696/* BEGIN CSTYLED */
697/*
698 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
700 * the contents of the data and parity disks. This can be expressed with
701 * vectors for the original data (D) and the actual data (d) and parity (p)
702 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
703 *
704 *            __   __                     __     __
705 *            |     |         __     __   |  p_0  |
706 *            |  V  |         |  D_0  |   | p_m-1 |
707 *            |     |    x    |   :   | = |  d_0  |
708 *            |  I  |         | D_n-1 |   |   :   |
709 *            |     |         ~~     ~~   | d_n-1 |
710 *            ~~   ~~                     ~~     ~~
711 *
712 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
714 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
715 * computation as well as linear separability.
716 *
717 *      __               __               __     __
718 *      |   1   ..  1 1 1 |               |  p_0  |
719 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
720 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
721 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
722 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
723 *      |   :       : : : |   |   :   |   |  d_2  |
724 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
725 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
726 *      |   0   ..  0 0 1 |               | d_n-1 |
727 *      ~~               ~~               ~~     ~~
728 *
729 * Note that I, V, d, and p are known. To compute D, we must invert the
730 * matrix and use the known data and parity values to reconstruct the unknown
731 * data values. We begin by removing the rows in V|I and d|p that correspond
732 * to failed or missing columns; we then make V|I square (n x n) and d|p
733 * sized n by removing rows corresponding to unused parity from the bottom up
734 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
735 * using Gauss-Jordan elimination. In the example below we use m=3 parity
736 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
737 *           __                               __
738 *           |  1   1   1   1   1   1   1   1  |
739 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
740 *           |  19 205 116  29  64  16  4   1  |      / /
741 *           |  1   0   0   0   0   0   0   0  |     / /
742 *           |  0   1   0   0   0   0   0   0  | <--' /
743 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
744 *           |  0   0   0   1   0   0   0   0  |
745 *           |  0   0   0   0   1   0   0   0  |
746 *           |  0   0   0   0   0   1   0   0  |
747 *           |  0   0   0   0   0   0   1   0  |
748 *           |  0   0   0   0   0   0   0   1  |
749 *           ~~                               ~~
750 *           __                               __
751 *           |  1   1   1   1   1   1   1   1  |
752 *           | 128  64  32  16  8   4   2   1  |
753 *           |  19 205 116  29  64  16  4   1  |
754 *           |  1   0   0   0   0   0   0   0  |
755 *           |  0   1   0   0   0   0   0   0  |
756 *  (V|I)' = |  0   0   1   0   0   0   0   0  |
757 *           |  0   0   0   1   0   0   0   0  |
758 *           |  0   0   0   0   1   0   0   0  |
759 *           |  0   0   0   0   0   1   0   0  |
760 *           |  0   0   0   0   0   0   1   0  |
761 *           |  0   0   0   0   0   0   0   1  |
762 *           ~~                               ~~
763 *
764 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
765 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
766 * matrix is not singular.
767 * __                                                                 __
768 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
769 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
770 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
771 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
772 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
773 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
774 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
775 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
776 * ~~                                                                 ~~
777 * __                                                                 __
778 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
779 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
780 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
781 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
782 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
783 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
784 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
785 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
786 * ~~                                                                 ~~
787 * __                                                                 __
788 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
789 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
790 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
791 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
792 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
793 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
794 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
795 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
796 * ~~                                                                 ~~
797 * __                                                                 __
798 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
799 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
800 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
801 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
802 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
803 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
804 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
805 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
806 * ~~                                                                 ~~
807 * __                                                                 __
808 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
809 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
810 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
811 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
812 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
813 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
814 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
815 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
816 * ~~                                                                 ~~
817 * __                                                                 __
818 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
819 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
820 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
821 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
822 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
823 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
824 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
825 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
826 * ~~                                                                 ~~
827 *                   __                               __
828 *                   |  0   0   1   0   0   0   0   0  |
829 *                   | 167 100  5   41 159 169 217 208 |
830 *                   | 166 100  4   40 158 168 216 209 |
831 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
832 *                   |  0   0   0   0   1   0   0   0  |
833 *                   |  0   0   0   0   0   1   0   0  |
834 *                   |  0   0   0   0   0   0   1   0  |
835 *                   |  0   0   0   0   0   0   0   1  |
836 *                   ~~                               ~~
837 *
838 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
839 * of the missing data.
840 *
841 * As is apparent from the example above, the only non-trivial rows in the
842 * inverse matrix correspond to the data disks that we're trying to
843 * reconstruct. Indeed, those are the only rows we need as the others would
844 * only be useful for reconstructing data known or assumed to be valid. For
845 * that reason, we only build the coefficients in the rows that correspond to
846 * targeted columns.
847 */
848/* END CSTYLED */
849
850static void
851vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
852    uint8_t **rows)
853{
854	int i, j;
855	int pow;
856
857	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
858
859	/*
860	 * Fill in the missing rows of interest.
861	 */
862	for (i = 0; i < nmap; i++) {
863		ASSERT3S(0, <=, map[i]);
864		ASSERT3S(map[i], <=, 2);
865
866		pow = map[i] * n;
867		if (pow > 255)
868			pow -= 255;
869		ASSERT(pow <= 255);
870
871		for (j = 0; j < n; j++) {
872			pow -= map[i];
873			if (pow < 0)
874				pow += 255;
875			rows[i][j] = vdev_raidz_pow2[pow];
876		}
877	}
878}
879
/*
 * Gauss-Jordan elimination over GF(2^8): reduce the nmissing x n matrix
 * "rows" to the identity on the columns listed in "missing", mirroring
 * every row operation into "invrows" (initialized here to the identity).
 * On return invrows holds exactly the rows of the inverse matrix needed
 * to regenerate the missing data columns.  "used" lists the input
 * columns: parity first (nmissing entries), then surviving data columns.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		/* In GF(2^8) the multiplicative inverse of 2^e is 2^(255-e). */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			/* Addition/subtraction in GF(2^8) is XOR. */
			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
973
/*
 * Multiply the surviving columns by the relevant rows of the inverted
 * matrix (invrows) to regenerate the missing data columns in place.
 * GF(2^8) multiplications are done via the log/exp tables; the logs of
 * invrows are precomputed into a scratch buffer so the inner loop only
 * does an addition mod 255 plus a table lookup.  On malloc failure this
 * prints a message and returns without reconstructing; the caller only
 * detects that through the subsequent checksum failure.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* One n-byte row of logarithms per missing column. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return;
	}

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute log2 of every inverse-matrix coefficient. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate each used source column into every target column. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Short (non-"big") columns end early. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* val = src * invrows[cc][i] in GF(2^8). */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* First source column initializes; the rest XOR in. */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	free(p);
}
1053
/*
 * General reconstruction for an arbitrary set of failed columns: decide
 * which parity columns to use, build the corresponding rows of the
 * (V|I) matrix, invert them, and multiply through the surviving columns
 * to regenerate the targets.  Returns a bitmask ("code") of the parity
 * columns used (bit 0 = P, bit 1 = Q, bit 2 = R).  On malloc failure
 * the code is returned without any reconstruction having been done.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * One n-byte row per missing column for both "rows" and "invrows",
	 * plus the n-entry "used" column list, in a single allocation.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return (code);
	}

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * Note: i carries over from the loop above — parity entries already
	 * occupy used[0 .. nmissing_rows - 1]; now append the surviving
	 * (non-targeted) data columns.
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	free(p);

	return (code);
}
1163
1164static int
1165vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1166{
1167	int tgts[VDEV_RAIDZ_MAXPARITY];
1168	int ntgts;
1169	int i, c;
1170	int code;
1171	int nbadparity, nbaddata;
1172
1173	/*
1174	 * The tgts list must already be sorted.
1175	 */
1176	for (i = 1; i < nt; i++) {
1177		ASSERT(t[i] > t[i - 1]);
1178	}
1179
1180	nbadparity = rm->rm_firstdatacol;
1181	nbaddata = rm->rm_cols - nbadparity;
1182	ntgts = 0;
1183	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1184		if (i < nt && c == t[i]) {
1185			tgts[ntgts++] = c;
1186			i++;
1187		} else if (rm->rm_col[c].rc_error != 0) {
1188			tgts[ntgts++] = c;
1189		} else if (c >= rm->rm_firstdatacol) {
1190			nbaddata--;
1191		} else {
1192			nbadparity--;
1193		}
1194	}
1195
1196	ASSERT(ntgts >= nt);
1197	ASSERT(nbaddata >= 0);
1198	ASSERT(nbaddata + nbadparity == ntgts);
1199
1200	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1201	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1202	ASSERT(code > 0);
1203	return (code);
1204}
1205
/*
 * Map a logical I/O (data buffer, byte offset, byte size) onto the
 * columns of a RAID-Z group.  unit_shift is the device ashift, dcols the
 * number of children, nparity the parity count (1-3).  Parity columns
 * get freshly malloc'd buffers; data columns point directly into the
 * caller's buffer.  Returns NULL on allocation failure.
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;	/* starting sector */
	uint64_t s = size >> unit_shift;	/* sector count */
	uint64_t f = b % dcols;			/* first column */
	uint64_t o = (b / dcols) << unit_shift;	/* byte offset on each child */
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/* q = full rows per column; r = leftover sectors; bc = "big" cols. */
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	/* Total sectors including parity. */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
	if (rm == NULL)
		return (rm);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* Columns past the last child wrap to the next row. */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		/* Big columns hold one extra sector from the remainder. */
		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Allocate private buffers for the parity columns only. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
		if (rm->rm_col[c].rc_data == NULL) {
			/* Unwind every buffer allocated so far. */
			c++;
			while (c != 0)
				free(rm->rm_col[--c].rc_data);
			free(rm);
			return (NULL);
		}
	}

	/* c == rm_firstdatacol here: data columns slice the caller's buffer. */
	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
1333
1334static void
1335vdev_raidz_map_free(raidz_map_t *rm)
1336{
1337	int c;
1338
1339	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1340		free(rm->rm_col[c].rc_data);
1341
1342	free(rm);
1343}
1344
1345static vdev_t *
1346vdev_child(vdev_t *pvd, uint64_t devidx)
1347{
1348	vdev_t *cvd;
1349
1350	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1351		if (cvd->v_id == devidx)
1352			break;
1353	}
1354
1355	return (cvd);
1356}
1357
1358/*
1359 * We keep track of whether or not there were any injected errors, so that
1360 * any ereports we generate can note it.
1361 */
/*
 * Thin wrapper around the generic checksum check.  The size argument is
 * unused here; presumably retained for interface symmetry with the
 * full kernel implementation — TODO confirm.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	return (zio_checksum_verify(spa, bp, data));
}
1368
1369/*
1370 * Generate the parity from the data columns. If we tried and were able to
1371 * read the parity without error, verify that the generated parity matches the
1372 * data we read. If it doesn't, we fire off a checksum error. Return the
1373 * number such failures.
1374 */
1375static int
1376raidz_parity_verify(raidz_map_t *rm)
1377{
1378	void *orig[VDEV_RAIDZ_MAXPARITY];
1379	int c, ret = 0;
1380	raidz_col_t *rc;
1381
1382	for (c = 0; c < rm->rm_firstdatacol; c++) {
1383		rc = &rm->rm_col[c];
1384		if (!rc->rc_tried || rc->rc_error != 0)
1385			continue;
1386		orig[c] = malloc(rc->rc_size);
1387		if (orig[c] != NULL) {
1388			bcopy(rc->rc_data, orig[c], rc->rc_size);
1389		} else {
1390			printf("Out of memory\n");
1391		}
1392	}
1393
1394	vdev_raidz_generate_parity(rm);
1395
1396	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1397		rc = &rm->rm_col[c];
1398		if (!rc->rc_tried || rc->rc_error != 0)
1399			continue;
1400		if (orig[c] == NULL ||
1401		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1402			rc->rc_error = ECKSUM;
1403			ret++;
1404		}
1405		free(orig[c]);
1406	}
1407
1408	return (ret);
1409}
1410
1411/*
1412 * Iterate over all combinations of bad data and attempt a reconstruction.
1413 * Note that the algorithm below is non-optimal because it doesn't take into
1414 * account how reconstruction is actually performed. For example, with
1415 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1416 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1417 * cases we'd only use parity information in column 0.
1418 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tgts points one past tstore[0] so tgts[-1] is valid storage. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/* Try every target-set size n we have spare parity for. */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
		if (orig[n - 1] == NULL) {
			ret = ENOMEM;
			goto done;
		}

		current = 0;
		next = tgts[current];

		/* Enumerate every n-combination of error-free columns. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				/* Mark the columns we repaired as bad on disk. */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/*
	 * The for loop exits with n one past the last size attempted; back
	 * it off so the cleanup below frees exactly orig[0 .. n-1].  The
	 * goto paths arrive with n already correct for that invariant.
	 */
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		free(orig[i]);
	}

	return (ret);
}
1565
/*
 * Read a block from a RAID-Z vdev: map the logical range onto child
 * columns, issue the reads, and then work through up to three repair
 * phases — verify as-read, re-read everything and retry, and finally
 * combinatorial reconstruction.  Returns 0 on success, EIO/ECKSUM/ENOMEM
 * on failure.
 */
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors __unused;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);
	if (rm == NULL)
		return (ENOMEM);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/* Read data columns always; parity only if data is missing. */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	/* Tally the error state afresh on every (re)try. */
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
1830