/* zfssubr.c — FreeBSD stable/10, revision 268649 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
25
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/cddl/boot/zfs/zfssubr.c 268649 2014-07-15 04:53:34Z delphij $");

/* CRC64 lookup table for the ZAP hash; lazily filled by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

/* Local stand-in for the kernel's ECKSUM errno (checksum failure). */
#define	ECKSUM	666

/* The boot code compiles all assertions down to nothing. */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

/* In the loader, panic() just reports the message and spins forever. */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

/* Map the kmem_* calls used by ported code onto the loader's allocator. */
#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))
46
47static void
48zfs_init_crc(void)
49{
50	int i, j;
51	uint64_t *ct;
52
53	/*
54	 * Calculate the crc64 table (used for the zap hash
55	 * function).
56	 */
57	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
58		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
59		for (i = 0; i < 256; i++)
60			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
61				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
62	}
63}
64
/* "off" checksum algorithm: always produce an all-zero checksum. */
static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
70
/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
	int		ci_correctable;	/* number of correctable bits	*/
	int		ci_eck;		/* uses zio embedded checksum? */
	int		ci_dedup;	/* strong enough for dedup? */
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;

/* Checksum implementations are pulled in as source in the boot environment. */
#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"
91
/*
 * Checksum vectors, indexed by the checksum value taken from the block
 * pointer (see zio_checksum_verify()); the order of the entries must
 * therefore not be changed.  Entries with NULL functions are rejected
 * as EINVAL by the verifier.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL,			NULL},			0, 0, 0, "inherit"},
	{{NULL,			NULL},			0, 0, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off},	0, 0, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap},	0, 1, 0, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0, 0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0, 0, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0, 1, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap},	0, 1, 0, "zillog2"},
};
104
105
/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

/* Decompression implementations, included as source. */
#include "lzjb.c"
#include "zle.c"
#include "lz4.c"
127
/*
 * Compression vectors.
 *
 * Indexed by the compression value from the block pointer (see
 * zio_decompress_data()).  The boot code only ever decompresses, so
 * ci_compress is always NULL; algorithms it cannot read (e.g. gzip)
 * have a NULL ci_decompress and are rejected at run time.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			NULL,			1,	"gzip-1"},
	{NULL,			NULL,			2,	"gzip-2"},
	{NULL,			NULL,			3,	"gzip-3"},
	{NULL,			NULL,			4,	"gzip-4"},
	{NULL,			NULL,			5,	"gzip-5"},
	{NULL,			NULL,			6,	"gzip-6"},
	{NULL,			NULL,			7,	"gzip-7"},
	{NULL,			NULL,			8,	"gzip-8"},
	{NULL,			NULL,			9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
	{NULL,			lz4_decompress,		0,	"lz4"},
};
149
/*
 * Byte-swap an array of 64-bit words in place.
 *
 * vbuf points to the buffer and size is its length in bytes; size must
 * be a multiple of 8 (asserted, but assertions are compiled out here).
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;

	ASSERT((size & 7) == 0);

	/*
	 * Use size_t for the index: the original int index compared
	 * signed against unsigned and could not cover very large buffers.
	 */
	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
162
/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	/* Verifier words: vdev, offset, physical birth txg, 0. */
	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}
177
/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	/* Only the first verifier word is meaningful for labels. */
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
188
/*
 * Verify the checksum of `data` (BP_GET_PSIZE(bp) bytes) against the
 * checksum recorded for block pointer `bp`.
 *
 * Returns 0 on success, EINVAL if the checksum algorithm is unknown or
 * unsupported, and EIO if the computed checksum does not match.
 */
static int
zio_checksum_verify(const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (ci->ci_eck) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		/* Embedded checksums live in a zio_eck_t trailer at the
		 * end of the buffer. */
		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		/* A byteswapped magic means the block was written on a
		 * machine of the opposite endianness. */
		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/*
		 * The checksum was computed with the verifier in place of
		 * the stored checksum, so temporarily swap it in, compute,
		 * and restore the stored value.
		 */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		/* Non-embedded: the expected checksum is in the bp itself. */
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum failed\n");*/
		return (EIO);
	}

	return (0);
}
248
249static int
250zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
251	void *dest, uint64_t destsize)
252{
253	zio_compress_info_t *ci;
254
255	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
256		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
257		return (EIO);
258	}
259
260	ci = &zio_compress_table[cpfunc];
261	if (!ci->ci_decompress) {
262		printf("ZFS: unsupported compression algorithm %s\n",
263		    ci->ci_name);
264		return (EIO);
265	}
266
267	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
268}
269
270static uint64_t
271zap_hash(uint64_t salt, const char *name)
272{
273	const uint8_t *cp;
274	uint8_t c;
275	uint64_t crc = salt;
276
277	ASSERT(crc != 0);
278	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
279	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
280		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
281
282	/*
283	 * Only use 28 bits, since we need 4 bits in the cookie for the
284	 * collision differentiator.  We MUST use the high bits, since
285	 * those are the onces that we first pay attention to when
286	 * chosing the bucket.
287	 */
288	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
289
290	return (crc);
291}
292
/* Allocator hooks provided elsewhere in the loader. */
static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

/* One column (one child vdev's slice) of a RAID-Z I/O. */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
305
/*
 * Geometry and state of one RAID-Z I/O: the parity columns occupy
 * rm_col[0 .. rm_firstdatacol-1], the data columns follow.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
321
/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply one byte by 2 (respectively 4) in GF(2^8): shift left and
 * XOR in 0x1d when the high bit overflows out.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
348
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 *
 * vdev_raidz_pow2[i] is 2^i; the sequence of 255 non-zero field elements
 * wraps, so the final entry (index 255) equals the first (0x01).
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * vdev_raidz_log2[x] is the discrete logarithm (base 2) of x in the
 * field; entry 0 is unused since 0 has no logarithm.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
421
422/*
423 * Multiply a given number by 2 raised to the given power.
424 */
425static uint8_t
426vdev_raidz_exp2(uint8_t a, int exp)
427{
428	if (a == 0)
429		return (0);
430
431	ASSERT(exp >= 0);
432	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
433
434	exp += vdev_raidz_log2[a];
435	if (exp > 255)
436		exp -= 255;
437
438	return (vdev_raidz_pow2[exp]);
439}
440
441static void
442vdev_raidz_generate_parity_p(raidz_map_t *rm)
443{
444	uint64_t *p, *src, pcount, ccount, i;
445	int c;
446
447	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
448
449	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
450		src = rm->rm_col[c].rc_data;
451		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
452		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
453
454		if (c == rm->rm_firstdatacol) {
455			ASSERT(ccount == pcount);
456			for (i = 0; i < ccount; i++, src++, p++) {
457				*p = *src;
458			}
459		} else {
460			ASSERT(ccount <= pcount);
461			for (i = 0; i < ccount; i++, src++, p++) {
462				*p ^= *src;
463			}
464		}
465	}
466}
467
/*
 * Compute double (P + Q) parity.  P is the plain XOR of the data
 * columns; Q accumulates each column after multiplying the running
 * value by 2 in GF(2^8) (via VDEV_RAIDZ_64MUL_2, eight bytes at a time).
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds both parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			/*
			 * NOTE(review): src is advanced here too, past the
			 * end of the column, but never dereferenced.
			 */
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
519
/*
 * Compute triple (P + Q + R) parity.  Like the PQ case, but R
 * accumulates with a running multiply-by-4 (VDEV_RAIDZ_64MUL_4)
 * instead of the multiply-by-2 used for Q.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds all three parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			/*
			 * NOTE(review): src is advanced here too, past the
			 * end of the column, but never dereferenced.
			 */
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
580
581/*
582 * Generate RAID parity in the first virtual columns according to the number of
583 * parity columns available.
584 */
585static void
586vdev_raidz_generate_parity(raidz_map_t *rm)
587{
588	switch (rm->rm_firstdatacol) {
589	case 1:
590		vdev_raidz_generate_parity_p(rm);
591		break;
592	case 2:
593		vdev_raidz_generate_parity_pq(rm);
594		break;
595	case 3:
596		vdev_raidz_generate_parity_pqr(rm);
597		break;
598	default:
599		panic("invalid RAID-Z configuration");
600	}
601}
602
603/* BEGIN CSTYLED */
604/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
607 * the contents of the data and parity disks. This can be expressed with
608 * vectors for the original data (D) and the actual data (d) and parity (p)
609 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
610 *
611 *            __   __                     __     __
612 *            |     |         __     __   |  p_0  |
613 *            |  V  |         |  D_0  |   | p_m-1 |
614 *            |     |    x    |   :   | = |  d_0  |
615 *            |  I  |         | D_n-1 |   |   :   |
616 *            |     |         ~~     ~~   | d_n-1 |
617 *            ~~   ~~                     ~~     ~~
618 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
621 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
622 * computation as well as linear separability.
623 *
624 *      __               __               __     __
625 *      |   1   ..  1 1 1 |               |  p_0  |
626 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
627 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
628 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
629 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
630 *      |   :       : : : |   |   :   |   |  d_2  |
631 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
632 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
633 *      |   0   ..  0 0 1 |               | d_n-1 |
634 *      ~~               ~~               ~~     ~~
635 *
636 * Note that I, V, d, and p are known. To compute D, we must invert the
637 * matrix and use the known data and parity values to reconstruct the unknown
638 * data values. We begin by removing the rows in V|I and d|p that correspond
639 * to failed or missing columns; we then make V|I square (n x n) and d|p
640 * sized n by removing rows corresponding to unused parity from the bottom up
641 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
642 * using Gauss-Jordan elimination. In the example below we use m=3 parity
643 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
644 *           __                               __
645 *           |  1   1   1   1   1   1   1   1  |
646 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
647 *           |  19 205 116  29  64  16  4   1  |      / /
648 *           |  1   0   0   0   0   0   0   0  |     / /
649 *           |  0   1   0   0   0   0   0   0  | <--' /
650 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
651 *           |  0   0   0   1   0   0   0   0  |
652 *           |  0   0   0   0   1   0   0   0  |
653 *           |  0   0   0   0   0   1   0   0  |
654 *           |  0   0   0   0   0   0   1   0  |
655 *           |  0   0   0   0   0   0   0   1  |
656 *           ~~                               ~~
657 *           __                               __
658 *           |  1   1   1   1   1   1   1   1  |
659 *           | 128  64  32  16  8   4   2   1  |
660 *           |  19 205 116  29  64  16  4   1  |
661 *           |  1   0   0   0   0   0   0   0  |
662 *           |  0   1   0   0   0   0   0   0  |
663 *  (V|I)' = |  0   0   1   0   0   0   0   0  |
664 *           |  0   0   0   1   0   0   0   0  |
665 *           |  0   0   0   0   1   0   0   0  |
666 *           |  0   0   0   0   0   1   0   0  |
667 *           |  0   0   0   0   0   0   1   0  |
668 *           |  0   0   0   0   0   0   0   1  |
669 *           ~~                               ~~
670 *
671 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
672 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
673 * matrix is not singular.
674 * __                                                                 __
675 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
676 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
677 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
678 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
679 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
680 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
681 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
682 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
683 * ~~                                                                 ~~
684 * __                                                                 __
685 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
686 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
687 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
688 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
689 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
690 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
691 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
692 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
693 * ~~                                                                 ~~
694 * __                                                                 __
695 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
696 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
697 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
698 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
699 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
700 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
701 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
702 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
703 * ~~                                                                 ~~
704 * __                                                                 __
705 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
706 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
707 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
708 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
709 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
710 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
711 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
712 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
713 * ~~                                                                 ~~
714 * __                                                                 __
715 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
716 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
717 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
718 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
719 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
720 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
721 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
722 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
723 * ~~                                                                 ~~
724 * __                                                                 __
725 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
726 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
727 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
728 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
729 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
730 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
731 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
732 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
733 * ~~                                                                 ~~
734 *                   __                               __
735 *                   |  0   0   1   0   0   0   0   0  |
736 *                   | 167 100  5   41 159 169 217 208 |
737 *                   | 166 100  4   40 158 168 216 209 |
738 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
739 *                   |  0   0   0   0   1   0   0   0  |
740 *                   |  0   0   0   0   0   1   0   0  |
741 *                   |  0   0   0   0   0   0   1   0  |
742 *                   |  0   0   0   0   0   0   0   1  |
743 *                   ~~                               ~~
744 *
745 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
746 * of the missing data.
747 *
748 * As is apparent from the example above, the only non-trivial rows in the
749 * inverse matrix correspond to the data disks that we're trying to
750 * reconstruct. Indeed, those are the only rows we need as the others would
751 * only be useful for reconstructing data known or assumed to be valid. For
752 * that reason, we only build the coefficients in the rows that correspond to
753 * targeted columns.
754 */
755/* END CSTYLED */
756
/*
 * Fill in the rows of the reconstruction matrix that correspond to the
 * parity columns in use.  map[i] identifies the parity flavor for row i
 * (0 = P, 1 = Q, 2 = R); entry j of that row is 2^(map[i] * (n - 1 - j))
 * in GF(2^8), with exponents kept in the range [0, 255) as the code
 * steps them down.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/* Start from exponent map[i] * n and count down by map[i]. */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
786
/*
 * Invert the reconstruction submatrix via Gauss-Jordan elimination over
 * GF(2^8).  rows[] holds the nmissing non-trivial rows (the parity rows
 * selected by used[0..nmissing-1]); the identity rows for the surviving
 * data columns are handled implicitly.  On return, invrows[] holds the
 * rows of the inverse that are needed to rebuild the missing columns.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
880
/*
 * Rebuild the missing columns: for each surviving column c = used[i],
 * accumulate invrows[j][i] * (column c's data) into the j-th missing
 * column, performing the GF(2^8) multiplies with the log/antilog tables.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* Scratch space: one row of logs per missing column. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute the logs of the inverse-matrix coefficients. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				/* val = *src * invrows[cc][i] in GF(2^8). */
				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* First column initializes; the rest XOR in. */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}
956
/*
 * Reconstruct the columns listed in tgts (ntgts entries, sorted) using
 * matrix inversion over GF(2^8): pick one parity column per missing
 * data column, build and invert the corresponding sub-matrix, then
 * apply it to the surviving columns.  Returns a bitmask ("code") of
 * the parity columns that were used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	/* Number of data (non-parity) columns. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		/* Record the chosen parity column in the returned code. */
		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * One scratch allocation holds the matrix rows, the inverse rows,
	 * and the used-column map, carved up below.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The first entries of used[] are the parity columns chosen above. */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * The remaining entries are the surviving data columns; note that
	 * i continues from the loop above.
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1062
1063static int
1064vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1065{
1066	int tgts[VDEV_RAIDZ_MAXPARITY];
1067	int ntgts;
1068	int i, c;
1069	int code;
1070	int nbadparity, nbaddata;
1071
1072	/*
1073	 * The tgts list must already be sorted.
1074	 */
1075	for (i = 1; i < nt; i++) {
1076		ASSERT(t[i] > t[i - 1]);
1077	}
1078
1079	nbadparity = rm->rm_firstdatacol;
1080	nbaddata = rm->rm_cols - nbadparity;
1081	ntgts = 0;
1082	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1083		if (i < nt && c == t[i]) {
1084			tgts[ntgts++] = c;
1085			i++;
1086		} else if (rm->rm_col[c].rc_error != 0) {
1087			tgts[ntgts++] = c;
1088		} else if (c >= rm->rm_firstdatacol) {
1089			nbaddata--;
1090		} else {
1091			nbadparity--;
1092		}
1093	}
1094
1095	ASSERT(ntgts >= nt);
1096	ASSERT(nbaddata >= 0);
1097	ASSERT(nbaddata + nbadparity == ntgts);
1098
1099	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1100	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1101	ASSERT(code > 0);
1102	return (code);
1103}
1104
/*
 * Divide the I/O described by (data, offset, size) across the children
 * of a RAID-Z vdev with "dcols" children and "nparity" parity columns,
 * producing a raidz_map_t that records each column's device index,
 * device offset and size.  Parity-column buffers are allocated here;
 * data columns point into the caller-owned "data" buffer.
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The I/O size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this I/O. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": the number of data sectors for this stripe on every
	 * child except the "big columns" that also hold remainder data.
	 */
	q = s / (dcols - nparity);
	/*
	 * "Remainder": the number of partial-stripe data sectors; these
	 * add one sector to some, but not all, children.
	 */
	r = s - q * (dcols - nparity);
	/* The number of "big columns" - those holding remainder data. */
	bc = (r == 0 ? 0 : r + nparity);
	/* Total data and parity sectors for this I/O. */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/*
	 * acols: columns actually accessed.
	 * scols: columns accessed or skipped (for skip-sector accounting).
	 */
	if (q == 0) {
		/* The I/O doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/* Lay out every column: device index, device offset and size. */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* Wrapped to the next row: one sector further in. */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			/* Big columns carry one extra sector. */
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Allocate buffers for the parity columns only. */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	/* Data columns point successively into the caller's buffer. */
	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
1222
1223static void
1224vdev_raidz_map_free(raidz_map_t *rm)
1225{
1226	int c;
1227
1228	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1229		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1230
1231	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1232}
1233
1234static vdev_t *
1235vdev_child(vdev_t *pvd, uint64_t devidx)
1236{
1237	vdev_t *cvd;
1238
1239	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1240		if (cvd->v_id == devidx)
1241			break;
1242	}
1243
1244	return (cvd);
1245}
1246
1247/*
1248 * We keep track of whether or not there were any injected errors, so that
1249 * any ereports we generate can note it.
1250 */
1251static int
1252raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
1253{
1254
1255	return (zio_checksum_verify(bp, data));
1256}
1257
1258/*
1259 * Generate the parity from the data columns. If we tried and were able to
1260 * read the parity without error, verify that the generated parity matches the
1261 * data we read. If it doesn't, we fire off a checksum error. Return the
1262 * number such failures.
1263 */
1264static int
1265raidz_parity_verify(raidz_map_t *rm)
1266{
1267	void *orig[VDEV_RAIDZ_MAXPARITY];
1268	int c, ret = 0;
1269	raidz_col_t *rc;
1270
1271	for (c = 0; c < rm->rm_firstdatacol; c++) {
1272		rc = &rm->rm_col[c];
1273		if (!rc->rc_tried || rc->rc_error != 0)
1274			continue;
1275		orig[c] = zfs_alloc(rc->rc_size);
1276		bcopy(rc->rc_data, orig[c], rc->rc_size);
1277	}
1278
1279	vdev_raidz_generate_parity(rm);
1280
1281	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1282		rc = &rm->rm_col[c];
1283		if (!rc->rc_tried || rc->rc_error != 0)
1284			continue;
1285		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1286			rc->rc_error = ECKSUM;
1287			ret++;
1288		}
1289		zfs_free(orig[c], rc->rc_size);
1290	}
1291
1292	return (ret);
1293}
1294
1295/*
1296 * Iterate over all combinations of bad data and attempt a reconstruction.
1297 * Note that the algorithm below is non-optimal because it doesn't take into
1298 * account how reconstruction is actually performed. For example, with
1299 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1300 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1301 * cases we'd only use parity information in column 0.
1302 */
/*
 * Try every combination of n candidate-bad columns (n = 1 up to the
 * spare parity available), reconstructing each combination and testing
 * the result against the block checksum.  Returns the reconstruction
 * code on success or 0 if no combination produced valid data.
 */
static int
vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
    off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tstore[0] backs the tgts[-1] sentinel; tstore[n+1] backs tgts[n]. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/* One new save buffer per combination size. */
		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/* Enumerate every n-combination of non-failed columns. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				/* Mark the guessed-bad columns as such. */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			/* Advance to the next combination (odometer style). */
			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/* All combination sizes exhausted; undo the final increment. */
	n--;
done:
	/* Free the n save buffers allocated across the outer iterations. */
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
1445
/*
 * Read "bytes" at "offset" from a RAID-Z vdev into "data" and verify
 * the result against the block pointer's checksum.  On failure,
 * escalate through the three phases described below (re-read all
 * columns, then combinatorial reconstruction).  Returns 0 on success
 * or an errno value.
 */
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			/* Child missing or unhealthy: record, don't read. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/*
		 * Read data columns always; read parity only if some data
		 * is already known missing.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	/* Re-tally error state each time we come back with fresh reads. */
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
1704