1185029Spjd/*
2185029Spjd * CDDL HEADER START
3185029Spjd *
4185029Spjd * The contents of this file are subject to the terms of the
5185029Spjd * Common Development and Distribution License (the "License").
6185029Spjd * You may not use this file except in compliance with the License.
7185029Spjd *
8185029Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9185029Spjd * or http://www.opensolaris.org/os/licensing.
10185029Spjd * See the License for the specific language governing permissions
11185029Spjd * and limitations under the License.
12185029Spjd *
13185029Spjd * When distributing Covered Code, include this CDDL HEADER in each
14185029Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15185029Spjd * If applicable, add the following below this CDDL HEADER, with the
16185029Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17185029Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18185029Spjd *
19185029Spjd * CDDL HEADER END
20185029Spjd */
21185029Spjd/*
22185029Spjd * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23185029Spjd * Use is subject to license terms.
24185029Spjd */
25185029Spjd
26185029Spjd#include <sys/cdefs.h>
27192640Sdes__FBSDID("$FreeBSD$");
28185029Spjd
29185029Spjdstatic uint64_t zfs_crc64_table[256];
30185029Spjd
31219089Spjd#define	ECKSUM	666
32219089Spjd
33219089Spjd#define	ASSERT(...)	do { } while (0)
34219089Spjd#define	ASSERT3U(...)	do { } while (0)
35219089Spjd#define	ASSERT3S(...)	do { } while (0)
36219089Spjd
37219089Spjd#define	panic(...)	do {						\
38219089Spjd	printf(__VA_ARGS__);						\
39219089Spjd	for (;;) ;							\
40219089Spjd} while (0)
41219089Spjd
42219089Spjd#define	kmem_alloc(size, flag)	zfs_alloc((size))
43219089Spjd#define	kmem_free(ptr, size)	zfs_free((ptr), (size))
44219089Spjd
45185029Spjdstatic void
46185029Spjdzfs_init_crc(void)
47185029Spjd{
48185029Spjd	int i, j;
49185029Spjd	uint64_t *ct;
50185029Spjd
51185029Spjd	/*
52185029Spjd	 * Calculate the crc64 table (used for the zap hash
53185029Spjd	 * function).
54185029Spjd	 */
55185029Spjd	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
56185029Spjd		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
57185029Spjd		for (i = 0; i < 256; i++)
58185029Spjd			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
59185029Spjd				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
60185029Spjd	}
61185029Spjd}
62185029Spjd
63185029Spjdstatic void
64185029Spjdzio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
65185029Spjd{
66185029Spjd	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
67185029Spjd}
68185029Spjd
69185029Spjd/*
70185029Spjd * Signature for checksum functions.
71185029Spjd */
72185029Spjdtypedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
73185029Spjd
74185029Spjd/*
75185029Spjd * Information about each checksum function.
76185029Spjd */
77185029Spjdtypedef struct zio_checksum_info {
78185029Spjd	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
79185029Spjd	int		ci_correctable;	/* number of correctable bits	*/
80219089Spjd	int		ci_eck;		/* uses zio embedded checksum? */
81219089Spjd	int		ci_dedup;	/* strong enough for dedup? */
82185029Spjd	const char	*ci_name;	/* descriptive name */
83185029Spjd} zio_checksum_info_t;
84185029Spjd
85185029Spjd#include "fletcher.c"
86185029Spjd#include "sha256.c"
87185029Spjd
88185029Spjdstatic zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
89219089Spjd	{{NULL,			NULL},			0, 0, 0, "inherit"},
90219089Spjd	{{NULL,			NULL},			0, 0, 0, "on"},
91219089Spjd	{{zio_checksum_off,	zio_checksum_off},	0, 0, 0, "off"},
92219089Spjd	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "label"},
93219089Spjd	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "gang_header"},
94219089Spjd	{{fletcher_2_native,	fletcher_2_byteswap},	0, 1, 0, "zilog"},
95219089Spjd	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0, 0, "fletcher2"},
96219089Spjd	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0, 0, "fletcher4"},
97219089Spjd	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0, 1, "SHA256"},
98219089Spjd	{{fletcher_4_native,	fletcher_4_byteswap},	0, 1, 0, "zillog2"},
99185029Spjd};
100185029Spjd
101219089Spjd
102185029Spjd/*
103185029Spjd * Common signature for all zio compress/decompress functions.
104185029Spjd */
105185029Spjdtypedef size_t zio_compress_func_t(void *src, void *dst,
106185029Spjd    size_t s_len, size_t d_len, int);
107185029Spjdtypedef int zio_decompress_func_t(void *src, void *dst,
108185029Spjd    size_t s_len, size_t d_len, int);
109185029Spjd
110185029Spjd/*
111185029Spjd * Information about each compression function.
112185029Spjd */
113185029Spjdtypedef struct zio_compress_info {
114185029Spjd	zio_compress_func_t	*ci_compress;	/* compression function */
115185029Spjd	zio_decompress_func_t	*ci_decompress;	/* decompression function */
116185029Spjd	int			ci_level;	/* level parameter */
117185029Spjd	const char		*ci_name;	/* algorithm name */
118185029Spjd} zio_compress_info_t;
119185029Spjd
120185029Spjd#include "lzjb.c"
121219089Spjd#include "zle.c"
122246586Sdelphij#include "lz4.c"
123185029Spjd
124185029Spjd/*
125185029Spjd * Compression vectors.
126185029Spjd */
127185029Spjdstatic zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
128185029Spjd	{NULL,			NULL,			0,	"inherit"},
129185029Spjd	{NULL,			NULL,			0,	"on"},
130185029Spjd	{NULL,			NULL,			0,	"uncompressed"},
131185029Spjd	{NULL,			lzjb_decompress,	0,	"lzjb"},
132185029Spjd	{NULL,			NULL,			0,	"empty"},
133185029Spjd	{NULL,			NULL,			1,	"gzip-1"},
134185029Spjd	{NULL,			NULL,			2,	"gzip-2"},
135185029Spjd	{NULL,			NULL,			3,	"gzip-3"},
136185029Spjd	{NULL,			NULL,			4,	"gzip-4"},
137185029Spjd	{NULL,			NULL,			5,	"gzip-5"},
138185029Spjd	{NULL,			NULL,			6,	"gzip-6"},
139185029Spjd	{NULL,			NULL,			7,	"gzip-7"},
140185029Spjd	{NULL,			NULL,			8,	"gzip-8"},
141185029Spjd	{NULL,			NULL,			9,	"gzip-9"},
142219089Spjd	{NULL,			zle_decompress,		64,	"zle"},
143246586Sdelphij	{NULL,			lz4_decompress,		0,	"lz4"},
144185029Spjd};
145185029Spjd
146219089Spjdstatic void
147219089Spjdbyteswap_uint64_array(void *vbuf, size_t size)
148219089Spjd{
149219089Spjd	uint64_t *buf = vbuf;
150219089Spjd	size_t count = size >> 3;
151219089Spjd	int i;
152219089Spjd
153219089Spjd	ASSERT((size & 7) == 0);
154219089Spjd
155219089Spjd	for (i = 0; i < count; i++)
156219089Spjd		buf[i] = BSWAP_64(buf[i]);
157219089Spjd}
158219089Spjd
159219089Spjd/*
160219089Spjd * Set the external verifier for a gang block based on <vdev, offset, txg>,
161219089Spjd * a tuple which is guaranteed to be unique for the life of the pool.
162219089Spjd */
163219089Spjdstatic void
164219089Spjdzio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
165219089Spjd{
166219089Spjd	const dva_t *dva = BP_IDENTITY(bp);
167219089Spjd	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
168219089Spjd
169219089Spjd	ASSERT(BP_IS_GANG(bp));
170219089Spjd
171219089Spjd	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
172219089Spjd}
173219089Spjd
174219089Spjd/*
175219089Spjd * Set the external verifier for a label block based on its offset.
176219089Spjd * The vdev is implicit, and the txg is unknowable at pool open time --
177219089Spjd * hence the logic in vdev_uberblock_load() to find the most recent copy.
178219089Spjd */
179219089Spjdstatic void
180219089Spjdzio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
181219089Spjd{
182219089Spjd	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
183219089Spjd}
184219089Spjd
185185029Spjdstatic int
186226568Spjdzio_checksum_verify(const blkptr_t *bp, void *data)
187185029Spjd{
188226568Spjd	uint64_t size;
189226568Spjd	unsigned int checksum;
190219089Spjd	zio_checksum_info_t *ci;
191219089Spjd	zio_cksum_t actual_cksum, expected_cksum, verifier;
192219089Spjd	int byteswap;
193185029Spjd
194226568Spjd	checksum = BP_GET_CHECKSUM(bp);
195226568Spjd	size = BP_GET_PSIZE(bp);
196226568Spjd
197219089Spjd	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
198185029Spjd		return (EINVAL);
199219089Spjd	ci = &zio_checksum_table[checksum];
200219089Spjd	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
201219089Spjd		return (EINVAL);
202185029Spjd
203219089Spjd	if (ci->ci_eck) {
204219089Spjd		zio_eck_t *eck;
205219089Spjd
206219089Spjd		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
207219089Spjd		    checksum == ZIO_CHECKSUM_LABEL);
208219089Spjd
209219089Spjd		eck = (zio_eck_t *)((char *)data + size) - 1;
210219089Spjd
211219089Spjd		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
212219089Spjd			zio_checksum_gang_verifier(&verifier, bp);
213219089Spjd		else if (checksum == ZIO_CHECKSUM_LABEL)
214226568Spjd			zio_checksum_label_verifier(&verifier,
215226568Spjd			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
216219089Spjd		else
217219089Spjd			verifier = bp->blk_cksum;
218219089Spjd
219219089Spjd		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
220219089Spjd
221219089Spjd		if (byteswap)
222219089Spjd			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
223219089Spjd
224219089Spjd		expected_cksum = eck->zec_cksum;
225219089Spjd		eck->zec_cksum = verifier;
226219089Spjd		ci->ci_func[byteswap](data, size, &actual_cksum);
227219089Spjd		eck->zec_cksum = expected_cksum;
228219089Spjd
229219089Spjd		if (byteswap)
230219089Spjd			byteswap_uint64_array(&expected_cksum,
231219089Spjd			    sizeof (zio_cksum_t));
232185029Spjd	} else {
233219089Spjd		expected_cksum = bp->blk_cksum;
234185029Spjd		ci->ci_func[0](data, size, &actual_cksum);
235185029Spjd	}
236185029Spjd
237219089Spjd	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
238185029Spjd		/*printf("ZFS: read checksum failed\n");*/
239185029Spjd		return (EIO);
240185029Spjd	}
241185029Spjd
242185029Spjd	return (0);
243185029Spjd}
244185029Spjd
245185029Spjdstatic int
246185029Spjdzio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
247185029Spjd	void *dest, uint64_t destsize)
248185029Spjd{
249219089Spjd	zio_compress_info_t *ci;
250185029Spjd
251219089Spjd	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
252185097Sdfr		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
253185029Spjd		return (EIO);
254185029Spjd	}
255185029Spjd
256219089Spjd	ci = &zio_compress_table[cpfunc];
257219089Spjd	if (!ci->ci_decompress) {
258219089Spjd		printf("ZFS: unsupported compression algorithm %s\n",
259219089Spjd		    ci->ci_name);
260219089Spjd		return (EIO);
261219089Spjd	}
262219089Spjd
263185029Spjd	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
264185029Spjd}
265185029Spjd
266185029Spjdstatic uint64_t
267185029Spjdzap_hash(uint64_t salt, const char *name)
268185029Spjd{
269185029Spjd	const uint8_t *cp;
270185029Spjd	uint8_t c;
271185029Spjd	uint64_t crc = salt;
272185029Spjd
273219089Spjd	ASSERT(crc != 0);
274219089Spjd	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
275185029Spjd	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
276185029Spjd		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
277185029Spjd
278185029Spjd	/*
279185029Spjd	 * Only use 28 bits, since we need 4 bits in the cookie for the
280185029Spjd	 * collision differentiator.  We MUST use the high bits, since
281185029Spjd	 * those are the onces that we first pay attention to when
282185029Spjd	 * chosing the bucket.
283185029Spjd	 */
284185029Spjd	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
285185029Spjd
286185029Spjd	return (crc);
287185029Spjd}
288192194Sdfr
289219089Spjdstatic void *zfs_alloc(size_t size);
290219089Spjdstatic void zfs_free(void *ptr, size_t size);
291192194Sdfr
292192194Sdfrtypedef struct raidz_col {
293192194Sdfr	uint64_t rc_devidx;		/* child device index for I/O */
294192194Sdfr	uint64_t rc_offset;		/* device offset */
295192194Sdfr	uint64_t rc_size;		/* I/O size */
296192194Sdfr	void *rc_data;			/* I/O data */
297192194Sdfr	int rc_error;			/* I/O error for this device */
298192194Sdfr	uint8_t rc_tried;		/* Did we attempt this I/O column? */
299192194Sdfr	uint8_t rc_skipped;		/* Did we skip this I/O column? */
300192194Sdfr} raidz_col_t;
301192194Sdfr
302219089Spjdtypedef struct raidz_map {
303219089Spjd	uint64_t rm_cols;		/* Regular column count */
304219089Spjd	uint64_t rm_scols;		/* Count including skipped columns */
305219089Spjd	uint64_t rm_bigcols;		/* Number of oversized columns */
306219089Spjd	uint64_t rm_asize;		/* Actual total I/O size */
307219089Spjd	uint64_t rm_missingdata;	/* Count of missing data devices */
308219089Spjd	uint64_t rm_missingparity;	/* Count of missing parity devices */
309219089Spjd	uint64_t rm_firstdatacol;	/* First data column/parity count */
310219089Spjd	uint64_t rm_nskip;		/* Skipped sectors for padding */
311219089Spjd	uint64_t rm_skipstart;		/* Column index of padding start */
312219089Spjd	uintptr_t rm_reports;		/* # of referencing checksum reports */
313219089Spjd	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
314219089Spjd	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
315219089Spjd	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
316219089Spjd} raidz_map_t;
317219089Spjd
318192194Sdfr#define	VDEV_RAIDZ_P		0
319192194Sdfr#define	VDEV_RAIDZ_Q		1
320219089Spjd#define	VDEV_RAIDZ_R		2
321192194Sdfr
322219089Spjd#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
323219089Spjd#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
324192194Sdfr
325219089Spjd/*
326219089Spjd * We provide a mechanism to perform the field multiplication operation on a
327219089Spjd * 64-bit value all at once rather than a byte at a time. This works by
328219089Spjd * creating a mask from the top bit in each byte and using that to
329219089Spjd * conditionally apply the XOR of 0x1d.
330219089Spjd */
331219089Spjd#define	VDEV_RAIDZ_64MUL_2(x, mask) \
332219089Spjd{ \
333219089Spjd	(mask) = (x) & 0x8080808080808080ULL; \
334219089Spjd	(mask) = ((mask) << 1) - ((mask) >> 7); \
335219089Spjd	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
336225531Savg	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
337219089Spjd}
338192194Sdfr
339219089Spjd#define	VDEV_RAIDZ_64MUL_4(x, mask) \
340219089Spjd{ \
341219089Spjd	VDEV_RAIDZ_64MUL_2((x), mask); \
342219089Spjd	VDEV_RAIDZ_64MUL_2((x), mask); \
343192194Sdfr}
344192194Sdfr
345192194Sdfr/*
346192194Sdfr * These two tables represent powers and logs of 2 in the Galois field defined
347192194Sdfr * above. These values were computed by repeatedly multiplying by 2 as above.
348192194Sdfr */
349192194Sdfrstatic const uint8_t vdev_raidz_pow2[256] = {
350192194Sdfr	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
351192194Sdfr	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
352192194Sdfr	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
353192194Sdfr	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
354192194Sdfr	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
355192194Sdfr	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
356192194Sdfr	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
357192194Sdfr	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
358192194Sdfr	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
359192194Sdfr	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
360192194Sdfr	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
361192194Sdfr	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
362192194Sdfr	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
363192194Sdfr	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
364192194Sdfr	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
365192194Sdfr	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
366192194Sdfr	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
367192194Sdfr	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
368192194Sdfr	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
369192194Sdfr	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
370192194Sdfr	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
371192194Sdfr	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
372192194Sdfr	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
373192194Sdfr	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
374192194Sdfr	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
375192194Sdfr	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
376192194Sdfr	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
377192194Sdfr	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
378192194Sdfr	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
379192194Sdfr	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
380192194Sdfr	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
381192194Sdfr	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
382192194Sdfr};
383192194Sdfrstatic const uint8_t vdev_raidz_log2[256] = {
384192194Sdfr	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
385192194Sdfr	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
386192194Sdfr	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
387192194Sdfr	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
388192194Sdfr	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
389192194Sdfr	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
390192194Sdfr	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
391192194Sdfr	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
392192194Sdfr	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
393192194Sdfr	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
394192194Sdfr	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
395192194Sdfr	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
396192194Sdfr	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
397192194Sdfr	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
398192194Sdfr	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
399192194Sdfr	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
400192194Sdfr	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
401192194Sdfr	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
402192194Sdfr	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
403192194Sdfr	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
404192194Sdfr	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
405192194Sdfr	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
406192194Sdfr	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
407192194Sdfr	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
408192194Sdfr	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
409192194Sdfr	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
410192194Sdfr	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
411192194Sdfr	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
412192194Sdfr	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
413192194Sdfr	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
414192194Sdfr	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
415192194Sdfr	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
416192194Sdfr};
417192194Sdfr
418192194Sdfr/*
419192194Sdfr * Multiply a given number by 2 raised to the given power.
420192194Sdfr */
421192194Sdfrstatic uint8_t
422192194Sdfrvdev_raidz_exp2(uint8_t a, int exp)
423192194Sdfr{
424192194Sdfr	if (a == 0)
425192194Sdfr		return (0);
426192194Sdfr
427219089Spjd	ASSERT(exp >= 0);
428219089Spjd	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
429192194Sdfr
430192194Sdfr	exp += vdev_raidz_log2[a];
431192194Sdfr	if (exp > 255)
432192194Sdfr		exp -= 255;
433192194Sdfr
434192194Sdfr	return (vdev_raidz_pow2[exp]);
435192194Sdfr}
436192194Sdfr
437192194Sdfrstatic void
438219089Spjdvdev_raidz_generate_parity_p(raidz_map_t *rm)
439192194Sdfr{
440219089Spjd	uint64_t *p, *src, pcount, ccount, i;
441192194Sdfr	int c;
442192194Sdfr
443219089Spjd	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
444192194Sdfr
445219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
446219089Spjd		src = rm->rm_col[c].rc_data;
447219089Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
448219089Spjd		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
449192194Sdfr
450219089Spjd		if (c == rm->rm_firstdatacol) {
451219089Spjd			ASSERT(ccount == pcount);
452219089Spjd			for (i = 0; i < ccount; i++, src++, p++) {
453192194Sdfr				*p = *src;
454192194Sdfr			}
455219089Spjd		} else {
456219089Spjd			ASSERT(ccount <= pcount);
457219089Spjd			for (i = 0; i < ccount; i++, src++, p++) {
458219089Spjd				*p ^= *src;
459219089Spjd			}
460219089Spjd		}
461219089Spjd	}
462219089Spjd}
463219089Spjd
464219089Spjdstatic void
465219089Spjdvdev_raidz_generate_parity_pq(raidz_map_t *rm)
466219089Spjd{
467219089Spjd	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
468219089Spjd	int c;
469219089Spjd
470219089Spjd	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
471219089Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
472219089Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
473219089Spjd
474219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
475219089Spjd		src = rm->rm_col[c].rc_data;
476219089Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
477219089Spjd		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
478219089Spjd
479219089Spjd		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
480219089Spjd
481219089Spjd		if (c == rm->rm_firstdatacol) {
482219089Spjd			ASSERT(ccnt == pcnt || ccnt == 0);
483219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++) {
484219089Spjd				*p = *src;
485219089Spjd				*q = *src;
486219089Spjd			}
487219089Spjd			for (; i < pcnt; i++, src++, p++, q++) {
488219089Spjd				*p = 0;
489192194Sdfr				*q = 0;
490192194Sdfr			}
491192194Sdfr		} else {
492219089Spjd			ASSERT(ccnt <= pcnt);
493192194Sdfr
494192194Sdfr			/*
495219089Spjd			 * Apply the algorithm described above by multiplying
496219089Spjd			 * the previous result and adding in the new value.
497192194Sdfr			 */
498219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++) {
499219089Spjd				*p ^= *src;
500219089Spjd
501219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
502192194Sdfr				*q ^= *src;
503192194Sdfr			}
504192194Sdfr
505192194Sdfr			/*
506192194Sdfr			 * Treat short columns as though they are full of 0s.
507219089Spjd			 * Note that there's therefore nothing needed for P.
508192194Sdfr			 */
509219089Spjd			for (; i < pcnt; i++, q++) {
510219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
511192194Sdfr			}
512192194Sdfr		}
513192194Sdfr	}
514192194Sdfr}
515192194Sdfr
516192194Sdfrstatic void
517219089Spjdvdev_raidz_generate_parity_pqr(raidz_map_t *rm)
518192194Sdfr{
519219089Spjd	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
520219089Spjd	int c;
521192194Sdfr
522219089Spjd	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
523219089Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
524219089Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
525219089Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
526219089Spjd	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
527192194Sdfr
528219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
529219089Spjd		src = rm->rm_col[c].rc_data;
530219089Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
531219089Spjd		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
532219089Spjd		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
533192194Sdfr
534219089Spjd		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
535192194Sdfr
536219089Spjd		if (c == rm->rm_firstdatacol) {
537219089Spjd			ASSERT(ccnt == pcnt || ccnt == 0);
538219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
539219089Spjd				*p = *src;
540219089Spjd				*q = *src;
541219089Spjd				*r = *src;
542192194Sdfr			}
543219089Spjd			for (; i < pcnt; i++, src++, p++, q++, r++) {
544219089Spjd				*p = 0;
545219089Spjd				*q = 0;
546219089Spjd				*r = 0;
547192194Sdfr			}
548219089Spjd		} else {
549219089Spjd			ASSERT(ccnt <= pcnt);
550192194Sdfr
551192194Sdfr			/*
552219089Spjd			 * Apply the algorithm described above by multiplying
553219089Spjd			 * the previous result and adding in the new value.
554192194Sdfr			 */
555219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
556219089Spjd				*p ^= *src;
557219089Spjd
558219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
559219089Spjd				*q ^= *src;
560219089Spjd
561219089Spjd				VDEV_RAIDZ_64MUL_4(*r, mask);
562219089Spjd				*r ^= *src;
563192194Sdfr			}
564192194Sdfr
565219089Spjd			/*
566219089Spjd			 * Treat short columns as though they are full of 0s.
567219089Spjd			 * Note that there's therefore nothing needed for P.
568219089Spjd			 */
569219089Spjd			for (; i < pcnt; i++, q++, r++) {
570219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
571219089Spjd				VDEV_RAIDZ_64MUL_4(*r, mask);
572192194Sdfr			}
573192194Sdfr		}
574192194Sdfr	}
575219089Spjd}
576192194Sdfr
577219089Spjd/*
578219089Spjd * Generate RAID parity in the first virtual columns according to the number of
579219089Spjd * parity columns available.
580219089Spjd */
581219089Spjdstatic void
582219089Spjdvdev_raidz_generate_parity(raidz_map_t *rm)
583219089Spjd{
584219089Spjd	switch (rm->rm_firstdatacol) {
585219089Spjd	case 1:
586219089Spjd		vdev_raidz_generate_parity_p(rm);
587219089Spjd		break;
588219089Spjd	case 2:
589219089Spjd		vdev_raidz_generate_parity_pq(rm);
590219089Spjd		break;
591219089Spjd	case 3:
592219089Spjd		vdev_raidz_generate_parity_pqr(rm);
593219089Spjd		break;
594219089Spjd	default:
595219089Spjd		panic("invalid RAID-Z configuration");
596219089Spjd	}
597219089Spjd}
598192194Sdfr
599219089Spjd/* BEGIN CSTYLED */
600219089Spjd/*
601219089Spjd * In the general case of reconstruction, we must solve the system of linear
602219089Spjd * equations defined by the coeffecients used to generate parity as well as
603219089Spjd * the contents of the data and parity disks. This can be expressed with
604219089Spjd * vectors for the original data (D) and the actual data (d) and parity (p)
605219089Spjd * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
606219089Spjd *
607219089Spjd *            __   __                     __     __
608219089Spjd *            |     |         __     __   |  p_0  |
609219089Spjd *            |  V  |         |  D_0  |   | p_m-1 |
610219089Spjd *            |     |    x    |   :   | = |  d_0  |
611219089Spjd *            |  I  |         | D_n-1 |   |   :   |
612219089Spjd *            |     |         ~~     ~~   | d_n-1 |
613219089Spjd *            ~~   ~~                     ~~     ~~
614219089Spjd *
615219089Spjd * I is simply a square identity matrix of size n, and V is a vandermonde
616219089Spjd * matrix defined by the coeffecients we chose for the various parity columns
617219089Spjd * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
618219089Spjd * computation as well as linear separability.
619219089Spjd *
620219089Spjd *      __               __               __     __
621219089Spjd *      |   1   ..  1 1 1 |               |  p_0  |
622219089Spjd *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
623219089Spjd *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
624219089Spjd *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
625219089Spjd *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
626219089Spjd *      |   :       : : : |   |   :   |   |  d_2  |
627219089Spjd *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
628219089Spjd *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
629219089Spjd *      |   0   ..  0 0 1 |               | d_n-1 |
630219089Spjd *      ~~               ~~               ~~     ~~
631219089Spjd *
632219089Spjd * Note that I, V, d, and p are known. To compute D, we must invert the
633219089Spjd * matrix and use the known data and parity values to reconstruct the unknown
634219089Spjd * data values. We begin by removing the rows in V|I and d|p that correspond
635219089Spjd * to failed or missing columns; we then make V|I square (n x n) and d|p
636219089Spjd * sized n by removing rows corresponding to unused parity from the bottom up
637219089Spjd * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
638219089Spjd * using Gauss-Jordan elimination. In the example below we use m=3 parity
639219089Spjd * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
640219089Spjd *           __                               __
641219089Spjd *           |  1   1   1   1   1   1   1   1  |
642219089Spjd *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
643219089Spjd *           |  19 205 116  29  64  16  4   1  |      / /
644219089Spjd *           |  1   0   0   0   0   0   0   0  |     / /
645219089Spjd *           |  0   1   0   0   0   0   0   0  | <--' /
646219089Spjd *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
647219089Spjd *           |  0   0   0   1   0   0   0   0  |
648219089Spjd *           |  0   0   0   0   1   0   0   0  |
649219089Spjd *           |  0   0   0   0   0   1   0   0  |
650219089Spjd *           |  0   0   0   0   0   0   1   0  |
651219089Spjd *           |  0   0   0   0   0   0   0   1  |
652219089Spjd *           ~~                               ~~
653219089Spjd *           __                               __
654219089Spjd *           |  1   1   1   1   1   1   1   1  |
655219089Spjd *           | 128  64  32  16  8   4   2   1  |
656219089Spjd *           |  19 205 116  29  64  16  4   1  |
657219089Spjd *           |  1   0   0   0   0   0   0   0  |
658219089Spjd *           |  0   1   0   0   0   0   0   0  |
659219089Spjd *  (V|I)' = |  0   0   1   0   0   0   0   0  |
660219089Spjd *           |  0   0   0   1   0   0   0   0  |
661219089Spjd *           |  0   0   0   0   1   0   0   0  |
662219089Spjd *           |  0   0   0   0   0   1   0   0  |
663219089Spjd *           |  0   0   0   0   0   0   1   0  |
664219089Spjd *           |  0   0   0   0   0   0   0   1  |
665219089Spjd *           ~~                               ~~
666219089Spjd *
667219089Spjd * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
668219089Spjd * have carefully chosen the seed values 1, 2, and 4 to ensure that this
669219089Spjd * matrix is not singular.
670219089Spjd * __                                                                 __
671219089Spjd * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
672219089Spjd * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
673219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
674219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
675219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
676219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
677219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
678219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
679219089Spjd * ~~                                                                 ~~
680219089Spjd * __                                                                 __
681219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
682219089Spjd * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
683219089Spjd * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
684219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
685219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
686219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
687219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
688219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
689219089Spjd * ~~                                                                 ~~
690219089Spjd * __                                                                 __
691219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
692219089Spjd * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
693219089Spjd * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
694219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
695219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
696219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
697219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
698219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
699219089Spjd * ~~                                                                 ~~
700219089Spjd * __                                                                 __
701219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
702219089Spjd * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
703219089Spjd * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
704219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
705219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
706219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
707219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
708219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
709219089Spjd * ~~                                                                 ~~
710219089Spjd * __                                                                 __
711219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
712219089Spjd * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
713219089Spjd * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
714219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
715219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
716219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
717219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
718219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
719219089Spjd * ~~                                                                 ~~
720219089Spjd * __                                                                 __
721219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
722219089Spjd * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
723219089Spjd * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
724219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
725219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
726219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
727219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
728219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
729219089Spjd * ~~                                                                 ~~
730219089Spjd *                   __                               __
731219089Spjd *                   |  0   0   1   0   0   0   0   0  |
732219089Spjd *                   | 167 100  5   41 159 169 217 208 |
733219089Spjd *                   | 166 100  4   40 158 168 216 209 |
734219089Spjd *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
735219089Spjd *                   |  0   0   0   0   1   0   0   0  |
736219089Spjd *                   |  0   0   0   0   0   1   0   0  |
737219089Spjd *                   |  0   0   0   0   0   0   1   0  |
738219089Spjd *                   |  0   0   0   0   0   0   0   1  |
739219089Spjd *                   ~~                               ~~
740219089Spjd *
741219089Spjd * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
742219089Spjd * of the missing data.
743219089Spjd *
744219089Spjd * As is apparent from the example above, the only non-trivial rows in the
745219089Spjd * inverse matrix correspond to the data disks that we're trying to
746219089Spjd * reconstruct. Indeed, those are the only rows we need as the others would
747219089Spjd * only be useful for reconstructing data known or assumed to be valid. For
748219089Spjd * that reason, we only build the coefficients in the rows that correspond to
749219089Spjd * targeted columns.
750219089Spjd */
751219089Spjd/* END CSTYLED */
752219089Spjd
753219089Spjdstatic void
754219089Spjdvdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
755219089Spjd    uint8_t **rows)
756219089Spjd{
757219089Spjd	int i, j;
758219089Spjd	int pow;
759219089Spjd
760219089Spjd	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
761219089Spjd
762219089Spjd	/*
763219089Spjd	 * Fill in the missing rows of interest.
764219089Spjd	 */
765219089Spjd	for (i = 0; i < nmap; i++) {
766219089Spjd		ASSERT3S(0, <=, map[i]);
767219089Spjd		ASSERT3S(map[i], <=, 2);
768219089Spjd
769219089Spjd		pow = map[i] * n;
770219089Spjd		if (pow > 255)
771219089Spjd			pow -= 255;
772219089Spjd		ASSERT(pow <= 255);
773219089Spjd
774219089Spjd		for (j = 0; j < n; j++) {
775219089Spjd			pow -= map[i];
776219089Spjd			if (pow < 0)
777219089Spjd				pow += 255;
778219089Spjd			rows[i][j] = vdev_raidz_pow2[pow];
779192194Sdfr		}
780192194Sdfr	}
781192194Sdfr}
782192194Sdfr
783192194Sdfrstatic void
784219089Spjdvdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
785219089Spjd    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
786192194Sdfr{
787219089Spjd	int i, j, ii, jj;
788219089Spjd	uint8_t log;
789192194Sdfr
790219089Spjd	/*
791219089Spjd	 * Assert that the first nmissing entries from the array of used
792219089Spjd	 * columns correspond to parity columns and that subsequent entries
793219089Spjd	 * correspond to data columns.
794219089Spjd	 */
795219089Spjd	for (i = 0; i < nmissing; i++) {
796219089Spjd		ASSERT3S(used[i], <, rm->rm_firstdatacol);
797219089Spjd	}
798219089Spjd	for (; i < n; i++) {
799219089Spjd		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
800219089Spjd	}
801192194Sdfr
802219089Spjd	/*
803219089Spjd	 * First initialize the storage where we'll compute the inverse rows.
804219089Spjd	 */
805219089Spjd	for (i = 0; i < nmissing; i++) {
806219089Spjd		for (j = 0; j < n; j++) {
807219089Spjd			invrows[i][j] = (i == j) ? 1 : 0;
808219089Spjd		}
809219089Spjd	}
810192194Sdfr
811192194Sdfr	/*
812219089Spjd	 * Subtract all trivial rows from the rows of consequence.
813192194Sdfr	 */
814219089Spjd	for (i = 0; i < nmissing; i++) {
815219089Spjd		for (j = nmissing; j < n; j++) {
816219089Spjd			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
817219089Spjd			jj = used[j] - rm->rm_firstdatacol;
818219089Spjd			ASSERT3S(jj, <, n);
819219089Spjd			invrows[i][j] = rows[i][jj];
820219089Spjd			rows[i][jj] = 0;
821219089Spjd		}
822219089Spjd	}
823192194Sdfr
824219089Spjd	/*
825219089Spjd	 * For each of the rows of interest, we must normalize it and subtract
826219089Spjd	 * a multiple of it from the other rows.
827219089Spjd	 */
828219089Spjd	for (i = 0; i < nmissing; i++) {
829219089Spjd		for (j = 0; j < missing[i]; j++) {
830219089Spjd			ASSERT3U(rows[i][j], ==, 0);
831219089Spjd		}
832219089Spjd		ASSERT3U(rows[i][missing[i]], !=, 0);
833192194Sdfr
834219089Spjd		/*
835219089Spjd		 * Compute the inverse of the first element and multiply each
836219089Spjd		 * element in the row by that value.
837219089Spjd		 */
838219089Spjd		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
839192194Sdfr
840219089Spjd		for (j = 0; j < n; j++) {
841219089Spjd			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
842219089Spjd			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
843219089Spjd		}
844192194Sdfr
845219089Spjd		for (ii = 0; ii < nmissing; ii++) {
846219089Spjd			if (i == ii)
847219089Spjd				continue;
848192194Sdfr
849219089Spjd			ASSERT3U(rows[ii][missing[i]], !=, 0);
850219089Spjd
851219089Spjd			log = vdev_raidz_log2[rows[ii][missing[i]]];
852219089Spjd
853219089Spjd			for (j = 0; j < n; j++) {
854219089Spjd				rows[ii][j] ^=
855219089Spjd				    vdev_raidz_exp2(rows[i][j], log);
856219089Spjd				invrows[ii][j] ^=
857219089Spjd				    vdev_raidz_exp2(invrows[i][j], log);
858219089Spjd			}
859219089Spjd		}
860219089Spjd	}
861219089Spjd
862192194Sdfr	/*
863219089Spjd	 * Verify that the data that is left in the rows are properly part of
864219089Spjd	 * an identity matrix.
865192194Sdfr	 */
866219089Spjd	for (i = 0; i < nmissing; i++) {
867219089Spjd		for (j = 0; j < n; j++) {
868219089Spjd			if (j == missing[i]) {
869219089Spjd				ASSERT3U(rows[i][j], ==, 1);
870219089Spjd			} else {
871219089Spjd				ASSERT3U(rows[i][j], ==, 0);
872219089Spjd			}
873219089Spjd		}
874219089Spjd	}
875219089Spjd}
876192194Sdfr
877219089Spjdstatic void
878219089Spjdvdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
879219089Spjd    int *missing, uint8_t **invrows, const uint8_t *used)
880219089Spjd{
881219089Spjd	int i, j, x, cc, c;
882219089Spjd	uint8_t *src;
883219089Spjd	uint64_t ccount;
884219089Spjd	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
885219089Spjd	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
886219089Spjd	uint8_t log, val;
887219089Spjd	int ll;
888219089Spjd	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
889219089Spjd	uint8_t *p, *pp;
890219089Spjd	size_t psize;
891192194Sdfr
892219089Spjd	log = 0;	/* gcc */
893219089Spjd	psize = sizeof (invlog[0][0]) * n * nmissing;
894219089Spjd	p = zfs_alloc(psize);
895192194Sdfr
896219089Spjd	for (pp = p, i = 0; i < nmissing; i++) {
897219089Spjd		invlog[i] = pp;
898219089Spjd		pp += n;
899219089Spjd	}
900192194Sdfr
901219089Spjd	for (i = 0; i < nmissing; i++) {
902219089Spjd		for (j = 0; j < n; j++) {
903219089Spjd			ASSERT3U(invrows[i][j], !=, 0);
904219089Spjd			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
905219089Spjd		}
906192194Sdfr	}
907192194Sdfr
908219089Spjd	for (i = 0; i < n; i++) {
909219089Spjd		c = used[i];
910219089Spjd		ASSERT3U(c, <, rm->rm_cols);
911219089Spjd
912219089Spjd		src = rm->rm_col[c].rc_data;
913219089Spjd		ccount = rm->rm_col[c].rc_size;
914219089Spjd		for (j = 0; j < nmissing; j++) {
915219089Spjd			cc = missing[j] + rm->rm_firstdatacol;
916219089Spjd			ASSERT3U(cc, >=, rm->rm_firstdatacol);
917219089Spjd			ASSERT3U(cc, <, rm->rm_cols);
918219089Spjd			ASSERT3U(cc, !=, c);
919219089Spjd
920219089Spjd			dst[j] = rm->rm_col[cc].rc_data;
921219089Spjd			dcount[j] = rm->rm_col[cc].rc_size;
922219089Spjd		}
923219089Spjd
924219089Spjd		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
925219089Spjd
926219089Spjd		for (x = 0; x < ccount; x++, src++) {
927219089Spjd			if (*src != 0)
928219089Spjd				log = vdev_raidz_log2[*src];
929219089Spjd
930219089Spjd			for (cc = 0; cc < nmissing; cc++) {
931219089Spjd				if (x >= dcount[cc])
932219089Spjd					continue;
933219089Spjd
934219089Spjd				if (*src == 0) {
935219089Spjd					val = 0;
936219089Spjd				} else {
937219089Spjd					if ((ll = log + invlog[cc][i]) >= 255)
938219089Spjd						ll -= 255;
939219089Spjd					val = vdev_raidz_pow2[ll];
940219089Spjd				}
941219089Spjd
942219089Spjd				if (i == 0)
943219089Spjd					dst[cc][x] = val;
944219089Spjd				else
945219089Spjd					dst[cc][x] ^= val;
946219089Spjd			}
947219089Spjd		}
948219089Spjd	}
949219089Spjd
950219089Spjd	zfs_free(p, psize);
951219089Spjd}
952219089Spjd
953219089Spjdstatic int
954219089Spjdvdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
955219089Spjd{
956219089Spjd	int n, i, c, t, tt;
957219089Spjd	int nmissing_rows;
958219089Spjd	int missing_rows[VDEV_RAIDZ_MAXPARITY];
959219089Spjd	int parity_map[VDEV_RAIDZ_MAXPARITY];
960219089Spjd
961219089Spjd	uint8_t *p, *pp;
962219089Spjd	size_t psize;
963219089Spjd
964219089Spjd	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
965219089Spjd	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
966219089Spjd	uint8_t *used;
967219089Spjd
968219089Spjd	int code = 0;
969219089Spjd
970219089Spjd
971219089Spjd	n = rm->rm_cols - rm->rm_firstdatacol;
972219089Spjd
973192194Sdfr	/*
974219089Spjd	 * Figure out which data columns are missing.
975192194Sdfr	 */
976219089Spjd	nmissing_rows = 0;
977219089Spjd	for (t = 0; t < ntgts; t++) {
978219089Spjd		if (tgts[t] >= rm->rm_firstdatacol) {
979219089Spjd			missing_rows[nmissing_rows++] =
980219089Spjd			    tgts[t] - rm->rm_firstdatacol;
981219089Spjd		}
982219089Spjd	}
983219089Spjd
984219089Spjd	/*
985219089Spjd	 * Figure out which parity columns to use to help generate the missing
986219089Spjd	 * data columns.
987219089Spjd	 */
988219089Spjd	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
989219089Spjd		ASSERT(tt < ntgts);
990219089Spjd		ASSERT(c < rm->rm_firstdatacol);
991219089Spjd
992219089Spjd		/*
993219089Spjd		 * Skip any targeted parity columns.
994219089Spjd		 */
995219089Spjd		if (c == tgts[tt]) {
996219089Spjd			tt++;
997219089Spjd			continue;
998219089Spjd		}
999219089Spjd
1000219089Spjd		code |= 1 << c;
1001219089Spjd
1002219089Spjd		parity_map[i] = c;
1003219089Spjd		i++;
1004219089Spjd	}
1005219089Spjd
1006219089Spjd	ASSERT(code != 0);
1007219089Spjd	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1008219089Spjd
1009219089Spjd	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1010219089Spjd	    nmissing_rows * n + sizeof (used[0]) * n;
1011219089Spjd	p = kmem_alloc(psize, KM_SLEEP);
1012219089Spjd
1013219089Spjd	for (pp = p, i = 0; i < nmissing_rows; i++) {
1014219089Spjd		rows[i] = pp;
1015219089Spjd		pp += n;
1016219089Spjd		invrows[i] = pp;
1017219089Spjd		pp += n;
1018219089Spjd	}
1019219089Spjd	used = pp;
1020219089Spjd
1021219089Spjd	for (i = 0; i < nmissing_rows; i++) {
1022219089Spjd		used[i] = parity_map[i];
1023219089Spjd	}
1024219089Spjd
1025219089Spjd	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1026219089Spjd		if (tt < nmissing_rows &&
1027219089Spjd		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1028219089Spjd			tt++;
1029219089Spjd			continue;
1030219089Spjd		}
1031219089Spjd
1032219089Spjd		ASSERT3S(i, <, n);
1033219089Spjd		used[i] = c;
1034219089Spjd		i++;
1035219089Spjd	}
1036219089Spjd
1037219089Spjd	/*
1038219089Spjd	 * Initialize the interesting rows of the matrix.
1039219089Spjd	 */
1040219089Spjd	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1041219089Spjd
1042219089Spjd	/*
1043219089Spjd	 * Invert the matrix.
1044219089Spjd	 */
1045219089Spjd	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1046219089Spjd	    invrows, used);
1047219089Spjd
1048219089Spjd	/*
1049219089Spjd	 * Reconstruct the missing data using the generated matrix.
1050219089Spjd	 */
1051219089Spjd	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1052219089Spjd	    invrows, used);
1053219089Spjd
1054219089Spjd	kmem_free(p, psize);
1055219089Spjd
1056219089Spjd	return (code);
1057192194Sdfr}
1058192194Sdfr
1059192194Sdfrstatic int
1060219089Spjdvdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1061192194Sdfr{
1062219089Spjd	int tgts[VDEV_RAIDZ_MAXPARITY];
1063219089Spjd	int ntgts;
1064219089Spjd	int i, c;
1065219089Spjd	int code;
1066219089Spjd	int nbadparity, nbaddata;
1067219089Spjd
1068219089Spjd	/*
1069219089Spjd	 * The tgts list must already be sorted.
1070219089Spjd	 */
1071219089Spjd	for (i = 1; i < nt; i++) {
1072219089Spjd		ASSERT(t[i] > t[i - 1]);
1073219089Spjd	}
1074219089Spjd
1075219089Spjd	nbadparity = rm->rm_firstdatacol;
1076219089Spjd	nbaddata = rm->rm_cols - nbadparity;
1077219089Spjd	ntgts = 0;
1078219089Spjd	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1079219089Spjd		if (i < nt && c == t[i]) {
1080219089Spjd			tgts[ntgts++] = c;
1081219089Spjd			i++;
1082219089Spjd		} else if (rm->rm_col[c].rc_error != 0) {
1083219089Spjd			tgts[ntgts++] = c;
1084219089Spjd		} else if (c >= rm->rm_firstdatacol) {
1085219089Spjd			nbaddata--;
1086219089Spjd		} else {
1087219089Spjd			nbadparity--;
1088219089Spjd		}
1089219089Spjd	}
1090219089Spjd
1091219089Spjd	ASSERT(ntgts >= nt);
1092219089Spjd	ASSERT(nbaddata >= 0);
1093219089Spjd	ASSERT(nbaddata + nbadparity == ntgts);
1094219089Spjd
1095219089Spjd	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1096219089Spjd	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1097219089Spjd	ASSERT(code > 0);
1098219089Spjd	return (code);
1099219089Spjd}
1100219089Spjd
1101219089Spjdstatic raidz_map_t *
1102219089Spjdvdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1103219089Spjd    uint64_t dcols, uint64_t nparity)
1104219089Spjd{
1105219089Spjd	raidz_map_t *rm;
1106192194Sdfr	uint64_t b = offset >> unit_shift;
1107219089Spjd	uint64_t s = size >> unit_shift;
1108192194Sdfr	uint64_t f = b % dcols;
1109192194Sdfr	uint64_t o = (b / dcols) << unit_shift;
1110219089Spjd	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1111192194Sdfr
1112192194Sdfr	q = s / (dcols - nparity);
1113192194Sdfr	r = s - q * (dcols - nparity);
1114192194Sdfr	bc = (r == 0 ? 0 : r + nparity);
1115219089Spjd	tot = s + nparity * (q + (r == 0 ? 0 : 1));
1116192194Sdfr
1117219089Spjd	if (q == 0) {
1118219089Spjd		acols = bc;
1119219089Spjd		scols = MIN(dcols, roundup(bc, nparity + 1));
1120219089Spjd	} else {
1121219089Spjd		acols = dcols;
1122219089Spjd		scols = dcols;
1123219089Spjd	}
1124219089Spjd
1125219089Spjd	ASSERT3U(acols, <=, scols);
1126219089Spjd
1127219089Spjd	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
1128219089Spjd
1129219089Spjd	rm->rm_cols = acols;
1130219089Spjd	rm->rm_scols = scols;
1131219089Spjd	rm->rm_bigcols = bc;
1132219089Spjd	rm->rm_skipstart = bc;
1133219089Spjd	rm->rm_missingdata = 0;
1134219089Spjd	rm->rm_missingparity = 0;
1135219089Spjd	rm->rm_firstdatacol = nparity;
1136219089Spjd	rm->rm_reports = 0;
1137219089Spjd	rm->rm_freed = 0;
1138219089Spjd	rm->rm_ecksuminjected = 0;
1139219089Spjd
1140192194Sdfr	asize = 0;
1141219089Spjd
1142219089Spjd	for (c = 0; c < scols; c++) {
1143192194Sdfr		col = f + c;
1144192194Sdfr		coff = o;
1145192194Sdfr		if (col >= dcols) {
1146192194Sdfr			col -= dcols;
1147192194Sdfr			coff += 1ULL << unit_shift;
1148192194Sdfr		}
1149219089Spjd		rm->rm_col[c].rc_devidx = col;
1150219089Spjd		rm->rm_col[c].rc_offset = coff;
1151219089Spjd		rm->rm_col[c].rc_data = NULL;
1152219089Spjd		rm->rm_col[c].rc_error = 0;
1153219089Spjd		rm->rm_col[c].rc_tried = 0;
1154219089Spjd		rm->rm_col[c].rc_skipped = 0;
1155192194Sdfr
1156219089Spjd		if (c >= acols)
1157219089Spjd			rm->rm_col[c].rc_size = 0;
1158219089Spjd		else if (c < bc)
1159219089Spjd			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1160219089Spjd		else
1161219089Spjd			rm->rm_col[c].rc_size = q << unit_shift;
1162192194Sdfr
1163219089Spjd		asize += rm->rm_col[c].rc_size;
1164192194Sdfr	}
1165192194Sdfr
1166219089Spjd	ASSERT3U(asize, ==, tot << unit_shift);
1167219089Spjd	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1168219089Spjd	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1169219089Spjd	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1170219089Spjd	ASSERT3U(rm->rm_nskip, <=, nparity);
1171192194Sdfr
1172219089Spjd	for (c = 0; c < rm->rm_firstdatacol; c++)
1173219089Spjd		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);
1174219089Spjd
1175219089Spjd	rm->rm_col[c].rc_data = data;
1176219089Spjd
1177192194Sdfr	for (c = c + 1; c < acols; c++)
1178219089Spjd		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1179219089Spjd		    rm->rm_col[c - 1].rc_size;
1180192194Sdfr
1181192194Sdfr	/*
1182219089Spjd	 * If all data stored spans all columns, there's a danger that parity
1183219089Spjd	 * will always be on the same device and, since parity isn't read
1184219089Spjd	 * during normal operation, that that device's I/O bandwidth won't be
1185219089Spjd	 * used effectively. We therefore switch the parity every 1MB.
1186192194Sdfr	 *
1187219089Spjd	 * ... at least that was, ostensibly, the theory. As a practical
1188219089Spjd	 * matter unless we juggle the parity between all devices evenly, we
1189219089Spjd	 * won't see any benefit. Further, occasional writes that aren't a
1190219089Spjd	 * multiple of the LCM of the number of children and the minimum
1191219089Spjd	 * stripe width are sufficient to avoid pessimal behavior.
1192219089Spjd	 * Unfortunately, this decision created an implicit on-disk format
1193219089Spjd	 * requirement that we need to support for all eternity, but only
1194219089Spjd	 * for single-parity RAID-Z.
1195219089Spjd	 *
1196219089Spjd	 * If we intend to skip a sector in the zeroth column for padding
1197219089Spjd	 * we must make sure to note this swap. We will never intend to
1198219089Spjd	 * skip the first column since at least one data and one parity
1199219089Spjd	 * column must appear in each row.
1200192194Sdfr	 */
1201219089Spjd	ASSERT(rm->rm_cols >= 2);
1202219089Spjd	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1203192194Sdfr
1204219089Spjd	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1205219089Spjd		devidx = rm->rm_col[0].rc_devidx;
1206219089Spjd		o = rm->rm_col[0].rc_offset;
1207219089Spjd		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1208219089Spjd		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1209219089Spjd		rm->rm_col[1].rc_devidx = devidx;
1210219089Spjd		rm->rm_col[1].rc_offset = o;
1211219089Spjd
1212219089Spjd		if (rm->rm_skipstart == 0)
1213219089Spjd			rm->rm_skipstart = 1;
1214192194Sdfr	}
1215192194Sdfr
1216219089Spjd	return (rm);
1217219089Spjd}
1218219089Spjd
1219219089Spjdstatic void
1220219089Spjdvdev_raidz_map_free(raidz_map_t *rm)
1221219089Spjd{
1222219089Spjd	int c;
1223219089Spjd
1224219089Spjd	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1225219089Spjd		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1226219089Spjd
1227219089Spjd	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1228219089Spjd}
1229219089Spjd
1230219089Spjdstatic vdev_t *
1231219089Spjdvdev_child(vdev_t *pvd, uint64_t devidx)
1232219089Spjd{
1233219089Spjd	vdev_t *cvd;
1234219089Spjd
1235219089Spjd	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1236219089Spjd		if (cvd->v_id == devidx)
1237219089Spjd			break;
1238219089Spjd	}
1239219089Spjd
1240219089Spjd	return (cvd);
1241219089Spjd}
1242219089Spjd
1243219089Spjd/*
1244219089Spjd * We keep track of whether or not there were any injected errors, so that
1245219089Spjd * any ereports we generate can note it.
1246219089Spjd */
1247219089Spjdstatic int
1248226553Spjdraidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
1249219089Spjd{
1250219089Spjd
1251226568Spjd	return (zio_checksum_verify(bp, data));
1252219089Spjd}
1253219089Spjd
1254219089Spjd/*
1255219089Spjd * Generate the parity from the data columns. If we tried and were able to
1256219089Spjd * read the parity without error, verify that the generated parity matches the
1257219089Spjd * data we read. If it doesn't, we fire off a checksum error. Return the
1258219089Spjd * number such failures.
1259219089Spjd */
1260219089Spjdstatic int
1261219089Spjdraidz_parity_verify(raidz_map_t *rm)
1262219089Spjd{
1263219089Spjd	void *orig[VDEV_RAIDZ_MAXPARITY];
1264219089Spjd	int c, ret = 0;
1265219089Spjd	raidz_col_t *rc;
1266219089Spjd
1267219089Spjd	for (c = 0; c < rm->rm_firstdatacol; c++) {
1268219089Spjd		rc = &rm->rm_col[c];
1269219089Spjd		if (!rc->rc_tried || rc->rc_error != 0)
1270219089Spjd			continue;
1271219089Spjd		orig[c] = zfs_alloc(rc->rc_size);
1272219089Spjd		bcopy(rc->rc_data, orig[c], rc->rc_size);
1273219089Spjd	}
1274219089Spjd
1275219089Spjd	vdev_raidz_generate_parity(rm);
1276219089Spjd
1277219089Spjd	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1278219089Spjd		rc = &rm->rm_col[c];
1279219089Spjd		if (!rc->rc_tried || rc->rc_error != 0)
1280219089Spjd			continue;
1281219089Spjd		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1282219089Spjd			rc->rc_error = ECKSUM;
1283219089Spjd			ret++;
1284219089Spjd		}
1285219089Spjd		zfs_free(orig[c], rc->rc_size);
1286219089Spjd	}
1287219089Spjd
1288219089Spjd	return (ret);
1289219089Spjd}
1290219089Spjd
1291219089Spjd/*
1292219089Spjd * Iterate over all combinations of bad data and attempt a reconstruction.
1293219089Spjd * Note that the algorithm below is non-optimal because it doesn't take into
1294219089Spjd * account how reconstruction is actually performed. For example, with
1295219089Spjd * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1296219089Spjd * is targeted as invalid as if columns 1 and 4 are targeted since in both
1297219089Spjd * cases we'd only use parity information in column 0.
1298219089Spjd */
1299219089Spjdstatic int
1300219089Spjdvdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
1301226553Spjd    off_t offset, uint64_t bytes, int total_errors, int data_errors)
1302219089Spjd{
1303219089Spjd	raidz_col_t *rc;
1304219089Spjd	void *orig[VDEV_RAIDZ_MAXPARITY];
1305219089Spjd	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1306219089Spjd	int *tgts = &tstore[1];
1307219089Spjd	int current, next, i, c, n;
1308219089Spjd	int code, ret = 0;
1309219089Spjd
1310219089Spjd	ASSERT(total_errors < rm->rm_firstdatacol);
1311219089Spjd
1312192194Sdfr	/*
1313219089Spjd	 * This simplifies one edge condition.
1314192194Sdfr	 */
1315219089Spjd	tgts[-1] = -1;
1316219089Spjd
1317219089Spjd	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1318219089Spjd		/*
1319219089Spjd		 * Initialize the targets array by finding the first n columns
1320219089Spjd		 * that contain no error.
1321219089Spjd		 *
1322219089Spjd		 * If there were no data errors, we need to ensure that we're
1323219089Spjd		 * always explicitly attempting to reconstruct at least one
1324219089Spjd		 * data column. To do this, we simply push the highest target
1325219089Spjd		 * up into the data columns.
1326219089Spjd		 */
1327219089Spjd		for (c = 0, i = 0; i < n; i++) {
1328219089Spjd			if (i == n - 1 && data_errors == 0 &&
1329219089Spjd			    c < rm->rm_firstdatacol) {
1330219089Spjd				c = rm->rm_firstdatacol;
1331219089Spjd			}
1332219089Spjd
1333219089Spjd			while (rm->rm_col[c].rc_error != 0) {
1334219089Spjd				c++;
1335219089Spjd				ASSERT3S(c, <, rm->rm_cols);
1336219089Spjd			}
1337219089Spjd
1338219089Spjd			tgts[i] = c++;
1339219089Spjd		}
1340219089Spjd
1341219089Spjd		/*
1342219089Spjd		 * Setting tgts[n] simplifies the other edge condition.
1343219089Spjd		 */
1344219089Spjd		tgts[n] = rm->rm_cols;
1345219089Spjd
1346219089Spjd		/*
1347219089Spjd		 * These buffers were allocated in previous iterations.
1348219089Spjd		 */
1349219089Spjd		for (i = 0; i < n - 1; i++) {
1350219089Spjd			ASSERT(orig[i] != NULL);
1351219089Spjd		}
1352219089Spjd
1353219089Spjd		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1354219089Spjd
1355219089Spjd		current = 0;
1356219089Spjd		next = tgts[current];
1357219089Spjd
1358219089Spjd		while (current != n) {
1359219089Spjd			tgts[current] = next;
1360219089Spjd			current = 0;
1361219089Spjd
1362219089Spjd			/*
1363219089Spjd			 * Save off the original data that we're going to
1364219089Spjd			 * attempt to reconstruct.
1365219089Spjd			 */
1366219089Spjd			for (i = 0; i < n; i++) {
1367219089Spjd				ASSERT(orig[i] != NULL);
1368219089Spjd				c = tgts[i];
1369219089Spjd				ASSERT3S(c, >=, 0);
1370219089Spjd				ASSERT3S(c, <, rm->rm_cols);
1371219089Spjd				rc = &rm->rm_col[c];
1372219089Spjd				bcopy(rc->rc_data, orig[i], rc->rc_size);
1373219089Spjd			}
1374219089Spjd
1375219089Spjd			/*
1376219089Spjd			 * Attempt a reconstruction and exit the outer loop on
1377219089Spjd			 * success.
1378219089Spjd			 */
1379219089Spjd			code = vdev_raidz_reconstruct(rm, tgts, n);
1380226553Spjd			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1381219089Spjd				for (i = 0; i < n; i++) {
1382219089Spjd					c = tgts[i];
1383219089Spjd					rc = &rm->rm_col[c];
1384219089Spjd					ASSERT(rc->rc_error == 0);
1385219089Spjd					rc->rc_error = ECKSUM;
1386219089Spjd				}
1387219089Spjd
1388219089Spjd				ret = code;
1389219089Spjd				goto done;
1390219089Spjd			}
1391219089Spjd
1392219089Spjd			/*
1393219089Spjd			 * Restore the original data.
1394219089Spjd			 */
1395219089Spjd			for (i = 0; i < n; i++) {
1396219089Spjd				c = tgts[i];
1397219089Spjd				rc = &rm->rm_col[c];
1398219089Spjd				bcopy(orig[i], rc->rc_data, rc->rc_size);
1399219089Spjd			}
1400219089Spjd
1401219089Spjd			do {
1402219089Spjd				/*
1403219089Spjd				 * Find the next valid column after the current
1404219089Spjd				 * position..
1405219089Spjd				 */
1406219089Spjd				for (next = tgts[current] + 1;
1407219089Spjd				    next < rm->rm_cols &&
1408219089Spjd				    rm->rm_col[next].rc_error != 0; next++)
1409219089Spjd					continue;
1410219089Spjd
1411219089Spjd				ASSERT(next <= tgts[current + 1]);
1412219089Spjd
1413219089Spjd				/*
1414219089Spjd				 * If that spot is available, we're done here.
1415219089Spjd				 */
1416219089Spjd				if (next != tgts[current + 1])
1417219089Spjd					break;
1418219089Spjd
1419219089Spjd				/*
1420219089Spjd				 * Otherwise, find the next valid column after
1421219089Spjd				 * the previous position.
1422219089Spjd				 */
1423219089Spjd				for (c = tgts[current - 1] + 1;
1424219089Spjd				    rm->rm_col[c].rc_error != 0; c++)
1425219089Spjd					continue;
1426219089Spjd
1427219089Spjd				tgts[current] = c;
1428219089Spjd				current++;
1429219089Spjd
1430219089Spjd			} while (current != n);
1431219089Spjd		}
1432219089Spjd	}
1433219089Spjd	n--;
1434219089Spjddone:
1435219089Spjd	for (i = n - 1; i >= 0; i--) {
1436219089Spjd		zfs_free(orig[i], rm->rm_col[0].rc_size);
1437219089Spjd	}
1438219089Spjd
1439219089Spjd	return (ret);
1440219089Spjd}
1441219089Spjd
1442219089Spjdstatic int
1443219089Spjdvdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1444219089Spjd    off_t offset, size_t bytes)
1445219089Spjd{
1446219089Spjd	vdev_t *tvd = vd->v_top;
1447219089Spjd	vdev_t *cvd;
1448219089Spjd	raidz_map_t *rm;
1449219089Spjd	raidz_col_t *rc;
1450219089Spjd	int c, error;
1451219089Spjd	int unexpected_errors;
1452219089Spjd	int parity_errors;
1453219089Spjd	int parity_untried;
1454219089Spjd	int data_errors;
1455219089Spjd	int total_errors;
1456219089Spjd	int n;
1457219089Spjd	int tgts[VDEV_RAIDZ_MAXPARITY];
1458219089Spjd	int code;
1459219089Spjd
1460219089Spjd	rc = NULL;	/* gcc */
1461219089Spjd	error = 0;
1462219089Spjd
1463219089Spjd	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1464219089Spjd	    vd->v_nchildren, vd->v_nparity);
1465219089Spjd
1466219089Spjd	/*
1467219089Spjd	 * Iterate over the columns in reverse order so that we hit the parity
1468219089Spjd	 * last -- any errors along the way will force us to read the parity.
1469219089Spjd	 */
1470219089Spjd	for (c = rm->rm_cols - 1; c >= 0; c--) {
1471219089Spjd		rc = &rm->rm_col[c];
1472219089Spjd		cvd = vdev_child(vd, rc->rc_devidx);
1473219089Spjd		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1474219089Spjd			if (c >= rm->rm_firstdatacol)
1475219089Spjd				rm->rm_missingdata++;
1476192194Sdfr			else
1477219089Spjd				rm->rm_missingparity++;
1478192194Sdfr			rc->rc_error = ENXIO;
1479192194Sdfr			rc->rc_tried = 1;	/* don't even try */
1480192194Sdfr			rc->rc_skipped = 1;
1481192194Sdfr			continue;
1482192194Sdfr		}
1483219089Spjd#if 0		/* XXX: Too hard for the boot code. */
1484219089Spjd		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1485219089Spjd			if (c >= rm->rm_firstdatacol)
1486192194Sdfr				rm->rm_missingdata++;
1487192194Sdfr			else
1488192194Sdfr				rm->rm_missingparity++;
1489192194Sdfr			rc->rc_error = ESTALE;
1490192194Sdfr			rc->rc_skipped = 1;
1491192194Sdfr			continue;
1492192194Sdfr		}
1493192194Sdfr#endif
1494219089Spjd		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1495219089Spjd			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1496219089Spjd			    rc->rc_offset, rc->rc_size);
1497192194Sdfr			rc->rc_tried = 1;
1498192194Sdfr			rc->rc_skipped = 0;
1499192194Sdfr		}
1500192194Sdfr	}
1501192194Sdfr
1502192194Sdfrreconstruct:
1503219089Spjd	unexpected_errors = 0;
1504192194Sdfr	parity_errors = 0;
1505219089Spjd	parity_untried = 0;
1506192194Sdfr	data_errors = 0;
1507192194Sdfr	total_errors = 0;
1508192194Sdfr
1509219089Spjd	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1510219089Spjd	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1511219089Spjd
1512219089Spjd	for (c = 0; c < rm->rm_cols; c++) {
1513219089Spjd		rc = &rm->rm_col[c];
1514219089Spjd
1515192194Sdfr		if (rc->rc_error) {
1516219089Spjd			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1517219089Spjd
1518219089Spjd			if (c < rm->rm_firstdatacol)
1519192194Sdfr				parity_errors++;
1520192194Sdfr			else
1521192194Sdfr				data_errors++;
1522192194Sdfr
1523192194Sdfr			if (!rc->rc_skipped)
1524192194Sdfr				unexpected_errors++;
1525192194Sdfr
1526192194Sdfr			total_errors++;
1527219089Spjd		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1528192194Sdfr			parity_untried++;
1529192194Sdfr		}
1530192194Sdfr	}
1531192194Sdfr
1532192194Sdfr	/*
1533192194Sdfr	 * There are three potential phases for a read:
1534192194Sdfr	 *	1. produce valid data from the columns read
1535192194Sdfr	 *	2. read all disks and try again
1536192194Sdfr	 *	3. perform combinatorial reconstruction
1537192194Sdfr	 *
1538219089Spjd	 * Each phase is progressively both more expensive and less likely to
1539219089Spjd	 * occur. If we encounter more errors than we can repair or all phases
1540219089Spjd	 * fail, we have no choice but to return an error.
1541192194Sdfr	 */
1542192194Sdfr
1543192194Sdfr	/*
1544219089Spjd	 * If the number of errors we saw was correctable -- less than or equal
1545219089Spjd	 * to the number of parity disks read -- attempt to produce data that
1546219089Spjd	 * has a valid checksum. Naturally, this case applies in the absence of
1547219089Spjd	 * any errors.
1548192194Sdfr	 */
1549219089Spjd	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1550219089Spjd		if (data_errors == 0) {
1551226553Spjd			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1552219089Spjd				/*
1553219089Spjd				 * If we read parity information (unnecessarily
1554219089Spjd				 * as it happens since no reconstruction was
1555219089Spjd				 * needed) regenerate and verify the parity.
1556219089Spjd				 * We also regenerate parity when resilvering
1557219089Spjd				 * so we can write it out to the failed device
1558219089Spjd				 * later.
1559219089Spjd				 */
1560219089Spjd				if (parity_errors + parity_untried <
1561219089Spjd				    rm->rm_firstdatacol) {
1562219089Spjd					n = raidz_parity_verify(rm);
1563219089Spjd					unexpected_errors += n;
1564219089Spjd					ASSERT(parity_errors + n <=
1565219089Spjd					    rm->rm_firstdatacol);
1566219089Spjd				}
1567219089Spjd				goto done;
1568219089Spjd			}
1569219089Spjd		} else {
1570192194Sdfr			/*
1571192194Sdfr			 * We either attempt to read all the parity columns or
1572192194Sdfr			 * none of them. If we didn't try to read parity, we
1573192194Sdfr			 * wouldn't be here in the correctable case. There must
1574192194Sdfr			 * also have been fewer parity errors than parity
1575192194Sdfr			 * columns or, again, we wouldn't be in this code path.
1576192194Sdfr			 */
1577219089Spjd			ASSERT(parity_untried == 0);
1578219089Spjd			ASSERT(parity_errors < rm->rm_firstdatacol);
1579192194Sdfr
1580192194Sdfr			/*
1581219089Spjd			 * Identify the data columns that reported an error.
1582192194Sdfr			 */
1583219089Spjd			n = 0;
1584219089Spjd			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1585219089Spjd				rc = &rm->rm_col[c];
1586219089Spjd				if (rc->rc_error != 0) {
1587219089Spjd					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1588219089Spjd					tgts[n++] = c;
1589219089Spjd				}
1590192194Sdfr			}
1591192194Sdfr
1592219089Spjd			ASSERT(rm->rm_firstdatacol >= n);
1593192194Sdfr
1594219089Spjd			code = vdev_raidz_reconstruct(rm, tgts, n);
1595192194Sdfr
1596226553Spjd			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1597219089Spjd				/*
1598219089Spjd				 * If we read more parity disks than were used
1599219089Spjd				 * for reconstruction, confirm that the other
1600219089Spjd				 * parity disks produced correct data. This
1601219089Spjd				 * routine is suboptimal in that it regenerates
1602219089Spjd				 * the parity that we already used in addition
1603219089Spjd				 * to the parity that we're attempting to
1604219089Spjd				 * verify, but this should be a relatively
1605219089Spjd				 * uncommon case, and can be optimized if it
1606219089Spjd				 * becomes a problem. Note that we regenerate
1607219089Spjd				 * parity when resilvering so we can write it
1608219089Spjd				 * out to failed devices later.
1609219089Spjd				 */
1610219089Spjd				if (parity_errors < rm->rm_firstdatacol - n) {
1611219089Spjd					n = raidz_parity_verify(rm);
1612219089Spjd					unexpected_errors += n;
1613219089Spjd					ASSERT(parity_errors + n <=
1614219089Spjd					    rm->rm_firstdatacol);
1615219089Spjd				}
1616192194Sdfr
1617219089Spjd				goto done;
1618192194Sdfr			}
1619192194Sdfr		}
1620192194Sdfr	}
1621192194Sdfr
1622192194Sdfr	/*
1623192194Sdfr	 * This isn't a typical situation -- either we got a read
1624192194Sdfr	 * error or a child silently returned bad data. Read every
1625192194Sdfr	 * block so we can try again with as much data and parity as
1626192194Sdfr	 * we can track down. If we've already been through once
1627192194Sdfr	 * before, all children will be marked as tried so we'll
1628192194Sdfr	 * proceed to combinatorial reconstruction.
1629192194Sdfr	 */
1630219089Spjd	unexpected_errors = 1;
1631219089Spjd	rm->rm_missingdata = 0;
1632219089Spjd	rm->rm_missingparity = 0;
1633219089Spjd
1634192194Sdfr	n = 0;
1635219089Spjd	for (c = 0; c < rm->rm_cols; c++) {
1636226550Spjd		rc = &rm->rm_col[c];
1637226550Spjd
1638226550Spjd		if (rc->rc_tried)
1639192194Sdfr			continue;
1640192194Sdfr
1641219089Spjd		cvd = vdev_child(vd, rc->rc_devidx);
1642219089Spjd		ASSERT(cvd != NULL);
1643219089Spjd		rc->rc_error = cvd->v_read(cvd, NULL,
1644219089Spjd		    rc->rc_data, rc->rc_offset, rc->rc_size);
1645192194Sdfr		if (rc->rc_error == 0)
1646192194Sdfr			n++;
1647192194Sdfr		rc->rc_tried = 1;
1648192194Sdfr		rc->rc_skipped = 0;
1649192194Sdfr	}
1650192194Sdfr	/*
1651192194Sdfr	 * If we managed to read anything more, retry the
1652192194Sdfr	 * reconstruction.
1653192194Sdfr	 */
1654219089Spjd	if (n > 0)
1655192194Sdfr		goto reconstruct;
1656192194Sdfr
1657192194Sdfr	/*
1658192194Sdfr	 * At this point we've attempted to reconstruct the data given the
1659192194Sdfr	 * errors we detected, and we've attempted to read all columns. There
1660192194Sdfr	 * must, therefore, be one or more additional problems -- silent errors
1661192194Sdfr	 * resulting in invalid data rather than explicit I/O errors resulting
1662219089Spjd	 * in absent data. We check if there is enough additional data to
1663219089Spjd	 * possibly reconstruct the data and then perform combinatorial
1664219089Spjd	 * reconstruction over all possible combinations. If that fails,
1665219089Spjd	 * we're cooked.
1666192194Sdfr	 */
1667219089Spjd	if (total_errors > rm->rm_firstdatacol) {
1668219089Spjd		error = EIO;
1669219089Spjd	} else if (total_errors < rm->rm_firstdatacol &&
1670226553Spjd	    (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
1671226553Spjd	     total_errors, data_errors)) != 0) {
1672192194Sdfr		/*
1673219089Spjd		 * If we didn't use all the available parity for the
1674219089Spjd		 * combinatorial reconstruction, verify that the remaining
1675219089Spjd		 * parity is correct.
1676192194Sdfr		 */
1677219089Spjd		if (code != (1 << rm->rm_firstdatacol) - 1)
1678219089Spjd			(void) raidz_parity_verify(rm);
1679219089Spjd	} else {
1680192194Sdfr		/*
1681219089Spjd		 * We're here because either:
1682219089Spjd		 *
1683219089Spjd		 *	total_errors == rm_first_datacol, or
1684219089Spjd		 *	vdev_raidz_combrec() failed
1685219089Spjd		 *
1686219089Spjd		 * In either case, there is enough bad data to prevent
1687219089Spjd		 * reconstruction.
1688219089Spjd		 *
1689219089Spjd		 * Start checksum ereports for all children which haven't
1690219089Spjd		 * failed, and the IO wasn't speculative.
1691192194Sdfr		 */
1692219089Spjd		error = ECKSUM;
1693192194Sdfr	}
1694192194Sdfr
1695219089Spjddone:
1696219089Spjd	vdev_raidz_map_free(rm);
1697192194Sdfr
1698219089Spjd	return (error);
1699192194Sdfr}
1700