1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/cdefs.h>
27__FBSDID("$FreeBSD$");
28
/* crc64 lookup table, lazily filled in by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

/* Local stand-in for the kernel's checksum-error errno value. */
#define	ECKSUM	666

/* The loader build compiles all assertions away to no-ops. */
#define	ASSERT(...)	do { } while (0)
#define	ASSERT3U(...)	do { } while (0)
#define	ASSERT3S(...)	do { } while (0)

/* Fatal error: print the message and spin; the loader cannot recover. */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

/* Map kernel allocator calls onto the loader's zfs_alloc()/zfs_free(). */
#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))
44
45static void
46zfs_init_crc(void)
47{
48	int i, j;
49	uint64_t *ct;
50
51	/*
52	 * Calculate the crc64 table (used for the zap hash
53	 * function).
54	 */
55	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
56		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
57		for (i = 0; i < 256; i++)
58			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
59				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
60	}
61}
62
/*
 * "Checksum" vector for blocks written with checksumming disabled:
 * always produces the all-zero checksum; buf and size are ignored.
 */
static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
68
69/*
70 * Signature for checksum functions.
71 */
72typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
73
74/*
75 * Information about each checksum function.
76 */
77typedef struct zio_checksum_info {
78	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
79	int		ci_correctable;	/* number of correctable bits	*/
80	int		ci_eck;		/* uses zio embedded checksum? */
81	int		ci_dedup;	/* strong enough for dedup? */
82	const char	*ci_name;	/* descriptive name */
83} zio_checksum_info_t;
84
85#include "fletcher.c"
86#include "sha256.c"
87
/*
 * Checksum vectors, indexed by the on-disk ZIO_CHECKSUM_* value -- the
 * row order is part of the on-disk format and must not change.
 * NOTE(review): "zillog2" looks like a typo for "zilog2" -- the string
 * is only a descriptive name, but confirm against upstream before
 * changing it.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL,			NULL},			0, 0, 0, "inherit"},
	{{NULL,			NULL},			0, 0, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off},	0, 0, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap},	0, 1, 0, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0, 0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0, 0, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0, 1, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap},	0, 1, 0, "zillog2"},
};
100
101
102/*
103 * Common signature for all zio compress/decompress functions.
104 */
105typedef size_t zio_compress_func_t(void *src, void *dst,
106    size_t s_len, size_t d_len, int);
107typedef int zio_decompress_func_t(void *src, void *dst,
108    size_t s_len, size_t d_len, int);
109
110/*
111 * Information about each compression function.
112 */
113typedef struct zio_compress_info {
114	zio_compress_func_t	*ci_compress;	/* compression function */
115	zio_decompress_func_t	*ci_decompress;	/* decompression function */
116	int			ci_level;	/* level parameter */
117	const char		*ci_name;	/* algorithm name */
118} zio_compress_info_t;
119
120#include "lzjb.c"
121#include "zle.c"
122#include "lz4.c"
123
124/*
125 * Compression vectors.
126 */
/*
 * Compression vectors, indexed by the on-disk ZIO_COMPRESS_* value --
 * the row order is part of the on-disk format and must not change.
 * Only decompressors are provided here; entries whose ci_decompress is
 * NULL (including all gzip levels) are rejected as unsupported by
 * zio_decompress_data().
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			NULL,			1,	"gzip-1"},
	{NULL,			NULL,			2,	"gzip-2"},
	{NULL,			NULL,			3,	"gzip-3"},
	{NULL,			NULL,			4,	"gzip-4"},
	{NULL,			NULL,			5,	"gzip-5"},
	{NULL,			NULL,			6,	"gzip-6"},
	{NULL,			NULL,			7,	"gzip-7"},
	{NULL,			NULL,			8,	"gzip-8"},
	{NULL,			NULL,			9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
	{NULL,			lz4_decompress,		0,	"lz4"},
};
145
/*
 * Byte-swap an array of 64-bit words in place.  size is in bytes and
 * must be a multiple of 8.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;	/* was int: signed/unsigned mismatch vs. count */

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
158
159/*
160 * Set the external verifier for a gang block based on <vdev, offset, txg>,
161 * a tuple which is guaranteed to be unique for the life of the pool.
162 */
163static void
164zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
165{
166	const dva_t *dva = BP_IDENTITY(bp);
167	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
168
169	ASSERT(BP_IS_GANG(bp));
170
171	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
172}
173
174/*
175 * Set the external verifier for a label block based on its offset.
176 * The vdev is implicit, and the txg is unknowable at pool open time --
177 * hence the logic in vdev_uberblock_load() to find the most recent copy.
178 */
179static void
180zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
181{
182	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
183}
184
/*
 * Verify the checksum recorded in bp against the freshly read data.
 * Returns 0 on match, EINVAL if the checksum type is out of range or
 * has no verification vector, and EIO on a mismatch.
 */
static int
zio_checksum_verify(const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	/* checksum comes from on-disk data: range-check before indexing. */
	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (ci->ci_eck) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		/* The embedded checksum trailer sits at the end of the buffer. */
		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			/* Unreachable given the ASSERT above; kept defensively. */
			verifier = bp->blk_cksum;

		/* A byteswapped trailer magic means an other-endian writer. */
		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/*
		 * Temporarily substitute the verifier for the stored
		 * checksum so the buffer checksums exactly as it did when
		 * written, then restore the original trailer contents.
		 */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		/* Checksum is stored in the block pointer itself. */
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum failed\n");*/
		return (EIO);
	}

	return (0);
}
244
245static int
246zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
247	void *dest, uint64_t destsize)
248{
249	zio_compress_info_t *ci;
250
251	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
252		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
253		return (EIO);
254	}
255
256	ci = &zio_compress_table[cpfunc];
257	if (!ci->ci_decompress) {
258		printf("ZFS: unsupported compression algorithm %s\n",
259		    ci->ci_name);
260		return (EIO);
261	}
262
263	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
264}
265
266static uint64_t
267zap_hash(uint64_t salt, const char *name)
268{
269	const uint8_t *cp;
270	uint8_t c;
271	uint64_t crc = salt;
272
273	ASSERT(crc != 0);
274	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
275	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
276		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
277
278	/*
279	 * Only use 28 bits, since we need 4 bits in the cookie for the
280	 * collision differentiator.  We MUST use the high bits, since
281	 * those are the onces that we first pay attention to when
282	 * chosing the bucket.
283	 */
284	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
285
286	return (crc);
287}
288
289static void *zfs_alloc(size_t size);
290static void zfs_free(void *ptr, size_t size);
291
/*
 * One column of a RAID-Z I/O: the slice of the logical I/O that lands
 * on a single child device.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Geometry and state of one logical RAID-Z I/O.  The first
 * rm_firstdatacol entries of rm_col[] are parity columns (P, Q, R);
 * the remainder hold data.  rm_col[1] is the pre-C99 flexible-array
 * idiom, so the struct must be allocated oversized to hold rm_scols
 * columns.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
317
/* Indices of the parity columns within a raidz map. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single byte by 2 (or 4) in the Galois field: shift left
 * and, on overflow of the top bit, reduce by XORing with 0x1d.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
344
345/*
346 * These two tables represent powers and logs of 2 in the Galois field defined
347 * above. These values were computed by repeatedly multiplying by 2 as above.
348 */
/* vdev_raidz_pow2[i] = 2^i in the field; note entry 255 wraps to 0x01. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* vdev_raidz_log2[i] = log base 2 of i in the field; log2[0] is unused. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
417
418/*
419 * Multiply a given number by 2 raised to the given power.
420 */
421static uint8_t
422vdev_raidz_exp2(uint8_t a, int exp)
423{
424	if (a == 0)
425		return (0);
426
427	ASSERT(exp >= 0);
428	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
429
430	exp += vdev_raidz_log2[a];
431	if (exp > 255)
432		exp -= 255;
433
434	return (vdev_raidz_pow2[exp]);
435}
436
437static void
438vdev_raidz_generate_parity_p(raidz_map_t *rm)
439{
440	uint64_t *p, *src, pcount, ccount, i;
441	int c;
442
443	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
444
445	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
446		src = rm->rm_col[c].rc_data;
447		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
448		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
449
450		if (c == rm->rm_firstdatacol) {
451			ASSERT(ccount == pcount);
452			for (i = 0; i < ccount; i++, src++, p++) {
453				*p = *src;
454			}
455		} else {
456			ASSERT(ccount <= pcount);
457			for (i = 0; i < ccount; i++, src++, p++) {
458				*p ^= *src;
459			}
460		}
461	}
462}
463
/*
 * Generate P (plain XOR) and Q (multiply-accumulate by 2 in the field)
 * parity for a double-parity map.  P and Q columns must be equal size.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds both parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			/* Zero-fill the remainder if the column is short. */
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
515
/*
 * Generate P, Q, and R parity for a triple-parity map: P is plain XOR,
 * Q multiply-accumulates by 2, R by 4.  All three parity columns must
 * be the same size.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds all three parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			/* Zero-fill the remainder if the column is short. */
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
576
577/*
578 * Generate RAID parity in the first virtual columns according to the number of
579 * parity columns available.
580 */
581static void
582vdev_raidz_generate_parity(raidz_map_t *rm)
583{
584	switch (rm->rm_firstdatacol) {
585	case 1:
586		vdev_raidz_generate_parity_p(rm);
587		break;
588	case 2:
589		vdev_raidz_generate_parity_pq(rm);
590		break;
591	case 3:
592		vdev_raidz_generate_parity_pqr(rm);
593		break;
594	default:
595		panic("invalid RAID-Z configuration");
596	}
597}
598
599/* BEGIN CSTYLED */
600/*
601 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
603 * the contents of the data and parity disks. This can be expressed with
604 * vectors for the original data (D) and the actual data (d) and parity (p)
605 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
606 *
607 *            __   __                     __     __
608 *            |     |         __     __   |  p_0  |
609 *            |  V  |         |  D_0  |   | p_m-1 |
610 *            |     |    x    |   :   | = |  d_0  |
611 *            |  I  |         | D_n-1 |   |   :   |
612 *            |     |         ~~     ~~   | d_n-1 |
613 *            ~~   ~~                     ~~     ~~
614 *
615 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
617 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
618 * computation as well as linear separability.
619 *
620 *      __               __               __     __
621 *      |   1   ..  1 1 1 |               |  p_0  |
622 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
623 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
624 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
625 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
626 *      |   :       : : : |   |   :   |   |  d_2  |
627 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
628 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
629 *      |   0   ..  0 0 1 |               | d_n-1 |
630 *      ~~               ~~               ~~     ~~
631 *
632 * Note that I, V, d, and p are known. To compute D, we must invert the
633 * matrix and use the known data and parity values to reconstruct the unknown
634 * data values. We begin by removing the rows in V|I and d|p that correspond
635 * to failed or missing columns; we then make V|I square (n x n) and d|p
636 * sized n by removing rows corresponding to unused parity from the bottom up
637 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
638 * using Gauss-Jordan elimination. In the example below we use m=3 parity
639 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
640 *           __                               __
641 *           |  1   1   1   1   1   1   1   1  |
642 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
643 *           |  19 205 116  29  64  16  4   1  |      / /
644 *           |  1   0   0   0   0   0   0   0  |     / /
645 *           |  0   1   0   0   0   0   0   0  | <--' /
646 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
647 *           |  0   0   0   1   0   0   0   0  |
648 *           |  0   0   0   0   1   0   0   0  |
649 *           |  0   0   0   0   0   1   0   0  |
650 *           |  0   0   0   0   0   0   1   0  |
651 *           |  0   0   0   0   0   0   0   1  |
652 *           ~~                               ~~
653 *           __                               __
654 *           |  1   1   1   1   1   1   1   1  |
655 *           | 128  64  32  16  8   4   2   1  |
656 *           |  19 205 116  29  64  16  4   1  |
657 *           |  1   0   0   0   0   0   0   0  |
658 *           |  0   1   0   0   0   0   0   0  |
659 *  (V|I)' = |  0   0   1   0   0   0   0   0  |
660 *           |  0   0   0   1   0   0   0   0  |
661 *           |  0   0   0   0   1   0   0   0  |
662 *           |  0   0   0   0   0   1   0   0  |
663 *           |  0   0   0   0   0   0   1   0  |
664 *           |  0   0   0   0   0   0   0   1  |
665 *           ~~                               ~~
666 *
667 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
668 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
669 * matrix is not singular.
670 * __                                                                 __
671 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
672 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
673 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
674 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
675 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
676 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
677 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
678 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
679 * ~~                                                                 ~~
680 * __                                                                 __
681 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
682 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
683 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
684 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
685 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
686 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
687 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
688 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
689 * ~~                                                                 ~~
690 * __                                                                 __
691 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
692 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
693 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
694 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
695 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
696 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
697 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
698 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
699 * ~~                                                                 ~~
700 * __                                                                 __
701 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
702 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
703 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
704 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
705 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
706 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
707 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
708 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
709 * ~~                                                                 ~~
710 * __                                                                 __
711 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
712 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
713 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
714 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
715 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
716 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
717 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
718 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
719 * ~~                                                                 ~~
720 * __                                                                 __
721 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
722 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
723 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
724 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
725 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
726 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
727 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
728 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
729 * ~~                                                                 ~~
730 *                   __                               __
731 *                   |  0   0   1   0   0   0   0   0  |
732 *                   | 167 100  5   41 159 169 217 208 |
733 *                   | 166 100  4   40 158 168 216 209 |
734 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
735 *                   |  0   0   0   0   1   0   0   0  |
736 *                   |  0   0   0   0   0   1   0   0  |
737 *                   |  0   0   0   0   0   0   1   0  |
738 *                   |  0   0   0   0   0   0   0   1  |
739 *                   ~~                               ~~
740 *
741 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
742 * of the missing data.
743 *
744 * As is apparent from the example above, the only non-trivial rows in the
745 * inverse matrix correspond to the data disks that we're trying to
746 * reconstruct. Indeed, those are the only rows we need as the others would
747 * only be useful for reconstructing data known or assumed to be valid. For
748 * that reason, we only build the coefficients in the rows that correspond to
749 * targeted columns.
750 */
751/* END CSTYLED */
752
/*
 * Build the rows of the dispersal matrix that correspond to the
 * missing columns.  map[i] identifies the parity flavor generating
 * row i (0 = P, 1 = Q, 2 = R); each row holds descending powers of
 * 2^map[i], looked up through vdev_raidz_pow2 with the exponent kept
 * reduced mod 255.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/* Start the exponent at map[i] * n, reduced into range. */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		/* Step the exponent down by map[i] for each column. */
		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
782
/*
 * Invert the rows of interest of the (V|I)' matrix in place using
 * Gauss-Jordan elimination, leaving the corresponding rows of the
 * inverse in invrows.  `used' lists the surviving columns: the first
 * nmissing entries are parity columns, the rest data columns.  See
 * the large comment above for the full derivation.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
876
/*
 * Multiply the inverted rows by the surviving data/parity columns to
 * regenerate the nmissing missing data columns.  The multiplications
 * are done in log space (log table of invrows precomputed into invlog)
 * so each byte costs one table lookup plus an add.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* One scratch buffer carved into nmissing rows of n logs each. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute the logs of all inverse-matrix coefficients. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate each surviving column into the missing columns. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Skip bytes beyond a short target column. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* val = src * invrow coeff, via logs. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* First column initializes, the rest XOR in. */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}
952
953static int
954vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
955{
956	int n, i, c, t, tt;
957	int nmissing_rows;
958	int missing_rows[VDEV_RAIDZ_MAXPARITY];
959	int parity_map[VDEV_RAIDZ_MAXPARITY];
960
961	uint8_t *p, *pp;
962	size_t psize;
963
964	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
965	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
966	uint8_t *used;
967
968	int code = 0;
969
970
971	n = rm->rm_cols - rm->rm_firstdatacol;
972
973	/*
974	 * Figure out which data columns are missing.
975	 */
976	nmissing_rows = 0;
977	for (t = 0; t < ntgts; t++) {
978		if (tgts[t] >= rm->rm_firstdatacol) {
979			missing_rows[nmissing_rows++] =
980			    tgts[t] - rm->rm_firstdatacol;
981		}
982	}
983
984	/*
985	 * Figure out which parity columns to use to help generate the missing
986	 * data columns.
987	 */
988	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
989		ASSERT(tt < ntgts);
990		ASSERT(c < rm->rm_firstdatacol);
991
992		/*
993		 * Skip any targeted parity columns.
994		 */
995		if (c == tgts[tt]) {
996			tt++;
997			continue;
998		}
999
1000		code |= 1 << c;
1001
1002		parity_map[i] = c;
1003		i++;
1004	}
1005
1006	ASSERT(code != 0);
1007	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1008
1009	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1010	    nmissing_rows * n + sizeof (used[0]) * n;
1011	p = kmem_alloc(psize, KM_SLEEP);
1012
1013	for (pp = p, i = 0; i < nmissing_rows; i++) {
1014		rows[i] = pp;
1015		pp += n;
1016		invrows[i] = pp;
1017		pp += n;
1018	}
1019	used = pp;
1020
1021	for (i = 0; i < nmissing_rows; i++) {
1022		used[i] = parity_map[i];
1023	}
1024
1025	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1026		if (tt < nmissing_rows &&
1027		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1028			tt++;
1029			continue;
1030		}
1031
1032		ASSERT3S(i, <, n);
1033		used[i] = c;
1034		i++;
1035	}
1036
1037	/*
1038	 * Initialize the interesting rows of the matrix.
1039	 */
1040	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1041
1042	/*
1043	 * Invert the matrix.
1044	 */
1045	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1046	    invrows, used);
1047
1048	/*
1049	 * Reconstruct the missing data using the generated matrix.
1050	 */
1051	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1052	    invrows, used);
1053
1054	kmem_free(p, psize);
1055
1056	return (code);
1057}
1058
1059static int
1060vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1061{
1062	int tgts[VDEV_RAIDZ_MAXPARITY];
1063	int ntgts;
1064	int i, c;
1065	int code;
1066	int nbadparity, nbaddata;
1067
1068	/*
1069	 * The tgts list must already be sorted.
1070	 */
1071	for (i = 1; i < nt; i++) {
1072		ASSERT(t[i] > t[i - 1]);
1073	}
1074
1075	nbadparity = rm->rm_firstdatacol;
1076	nbaddata = rm->rm_cols - nbadparity;
1077	ntgts = 0;
1078	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1079		if (i < nt && c == t[i]) {
1080			tgts[ntgts++] = c;
1081			i++;
1082		} else if (rm->rm_col[c].rc_error != 0) {
1083			tgts[ntgts++] = c;
1084		} else if (c >= rm->rm_firstdatacol) {
1085			nbaddata--;
1086		} else {
1087			nbadparity--;
1088		}
1089	}
1090
1091	ASSERT(ntgts >= nt);
1092	ASSERT(nbaddata >= 0);
1093	ASSERT(nbaddata + nbadparity == ntgts);
1094
1095	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1096	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1097	ASSERT(code > 0);
1098	return (code);
1099}
1100
1101static raidz_map_t *
1102vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1103    uint64_t dcols, uint64_t nparity)
1104{
1105	raidz_map_t *rm;
1106	uint64_t b = offset >> unit_shift;
1107	uint64_t s = size >> unit_shift;
1108	uint64_t f = b % dcols;
1109	uint64_t o = (b / dcols) << unit_shift;
1110	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1111
1112	q = s / (dcols - nparity);
1113	r = s - q * (dcols - nparity);
1114	bc = (r == 0 ? 0 : r + nparity);
1115	tot = s + nparity * (q + (r == 0 ? 0 : 1));
1116
1117	if (q == 0) {
1118		acols = bc;
1119		scols = MIN(dcols, roundup(bc, nparity + 1));
1120	} else {
1121		acols = dcols;
1122		scols = dcols;
1123	}
1124
1125	ASSERT3U(acols, <=, scols);
1126
1127	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
1128
1129	rm->rm_cols = acols;
1130	rm->rm_scols = scols;
1131	rm->rm_bigcols = bc;
1132	rm->rm_skipstart = bc;
1133	rm->rm_missingdata = 0;
1134	rm->rm_missingparity = 0;
1135	rm->rm_firstdatacol = nparity;
1136	rm->rm_reports = 0;
1137	rm->rm_freed = 0;
1138	rm->rm_ecksuminjected = 0;
1139
1140	asize = 0;
1141
1142	for (c = 0; c < scols; c++) {
1143		col = f + c;
1144		coff = o;
1145		if (col >= dcols) {
1146			col -= dcols;
1147			coff += 1ULL << unit_shift;
1148		}
1149		rm->rm_col[c].rc_devidx = col;
1150		rm->rm_col[c].rc_offset = coff;
1151		rm->rm_col[c].rc_data = NULL;
1152		rm->rm_col[c].rc_error = 0;
1153		rm->rm_col[c].rc_tried = 0;
1154		rm->rm_col[c].rc_skipped = 0;
1155
1156		if (c >= acols)
1157			rm->rm_col[c].rc_size = 0;
1158		else if (c < bc)
1159			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1160		else
1161			rm->rm_col[c].rc_size = q << unit_shift;
1162
1163		asize += rm->rm_col[c].rc_size;
1164	}
1165
1166	ASSERT3U(asize, ==, tot << unit_shift);
1167	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1168	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1169	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1170	ASSERT3U(rm->rm_nskip, <=, nparity);
1171
1172	for (c = 0; c < rm->rm_firstdatacol; c++)
1173		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);
1174
1175	rm->rm_col[c].rc_data = data;
1176
1177	for (c = c + 1; c < acols; c++)
1178		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1179		    rm->rm_col[c - 1].rc_size;
1180
1181	/*
1182	 * If all data stored spans all columns, there's a danger that parity
1183	 * will always be on the same device and, since parity isn't read
1184	 * during normal operation, that that device's I/O bandwidth won't be
1185	 * used effectively. We therefore switch the parity every 1MB.
1186	 *
1187	 * ... at least that was, ostensibly, the theory. As a practical
1188	 * matter unless we juggle the parity between all devices evenly, we
1189	 * won't see any benefit. Further, occasional writes that aren't a
1190	 * multiple of the LCM of the number of children and the minimum
1191	 * stripe width are sufficient to avoid pessimal behavior.
1192	 * Unfortunately, this decision created an implicit on-disk format
1193	 * requirement that we need to support for all eternity, but only
1194	 * for single-parity RAID-Z.
1195	 *
1196	 * If we intend to skip a sector in the zeroth column for padding
1197	 * we must make sure to note this swap. We will never intend to
1198	 * skip the first column since at least one data and one parity
1199	 * column must appear in each row.
1200	 */
1201	ASSERT(rm->rm_cols >= 2);
1202	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1203
1204	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1205		devidx = rm->rm_col[0].rc_devidx;
1206		o = rm->rm_col[0].rc_offset;
1207		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1208		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1209		rm->rm_col[1].rc_devidx = devidx;
1210		rm->rm_col[1].rc_offset = o;
1211
1212		if (rm->rm_skipstart == 0)
1213			rm->rm_skipstart = 1;
1214	}
1215
1216	return (rm);
1217}
1218
1219static void
1220vdev_raidz_map_free(raidz_map_t *rm)
1221{
1222	int c;
1223
1224	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1225		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1226
1227	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1228}
1229
1230static vdev_t *
1231vdev_child(vdev_t *pvd, uint64_t devidx)
1232{
1233	vdev_t *cvd;
1234
1235	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1236		if (cvd->v_id == devidx)
1237			break;
1238	}
1239
1240	return (cvd);
1241}
1242
1243/*
1244 * We keep track of whether or not there were any injected errors, so that
1245 * any ereports we generate can note it.
1246 */
1247static int
1248raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
1249{
1250
1251	return (zio_checksum_verify(bp, data));
1252}
1253
1254/*
1255 * Generate the parity from the data columns. If we tried and were able to
1256 * read the parity without error, verify that the generated parity matches the
1257 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
1259 */
1260static int
1261raidz_parity_verify(raidz_map_t *rm)
1262{
1263	void *orig[VDEV_RAIDZ_MAXPARITY];
1264	int c, ret = 0;
1265	raidz_col_t *rc;
1266
1267	for (c = 0; c < rm->rm_firstdatacol; c++) {
1268		rc = &rm->rm_col[c];
1269		if (!rc->rc_tried || rc->rc_error != 0)
1270			continue;
1271		orig[c] = zfs_alloc(rc->rc_size);
1272		bcopy(rc->rc_data, orig[c], rc->rc_size);
1273	}
1274
1275	vdev_raidz_generate_parity(rm);
1276
1277	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1278		rc = &rm->rm_col[c];
1279		if (!rc->rc_tried || rc->rc_error != 0)
1280			continue;
1281		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1282			rc->rc_error = ECKSUM;
1283			ret++;
1284		}
1285		zfs_free(orig[c], rc->rc_size);
1286	}
1287
1288	return (ret);
1289}
1290
1291/*
1292 * Iterate over all combinations of bad data and attempt a reconstruction.
1293 * Note that the algorithm below is non-optimal because it doesn't take into
1294 * account how reconstruction is actually performed. For example, with
1295 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1296 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1297 * cases we'd only use parity information in column 0.
1298 */
1299static int
1300vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
1301    off_t offset, uint64_t bytes, int total_errors, int data_errors)
1302{
1303	raidz_col_t *rc;
1304	void *orig[VDEV_RAIDZ_MAXPARITY];
1305	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1306	int *tgts = &tstore[1];
1307	int current, next, i, c, n;
1308	int code, ret = 0;
1309
1310	ASSERT(total_errors < rm->rm_firstdatacol);
1311
1312	/*
1313	 * This simplifies one edge condition.
1314	 */
1315	tgts[-1] = -1;
1316
1317	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1318		/*
1319		 * Initialize the targets array by finding the first n columns
1320		 * that contain no error.
1321		 *
1322		 * If there were no data errors, we need to ensure that we're
1323		 * always explicitly attempting to reconstruct at least one
1324		 * data column. To do this, we simply push the highest target
1325		 * up into the data columns.
1326		 */
1327		for (c = 0, i = 0; i < n; i++) {
1328			if (i == n - 1 && data_errors == 0 &&
1329			    c < rm->rm_firstdatacol) {
1330				c = rm->rm_firstdatacol;
1331			}
1332
1333			while (rm->rm_col[c].rc_error != 0) {
1334				c++;
1335				ASSERT3S(c, <, rm->rm_cols);
1336			}
1337
1338			tgts[i] = c++;
1339		}
1340
1341		/*
1342		 * Setting tgts[n] simplifies the other edge condition.
1343		 */
1344		tgts[n] = rm->rm_cols;
1345
1346		/*
1347		 * These buffers were allocated in previous iterations.
1348		 */
1349		for (i = 0; i < n - 1; i++) {
1350			ASSERT(orig[i] != NULL);
1351		}
1352
1353		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1354
1355		current = 0;
1356		next = tgts[current];
1357
1358		while (current != n) {
1359			tgts[current] = next;
1360			current = 0;
1361
1362			/*
1363			 * Save off the original data that we're going to
1364			 * attempt to reconstruct.
1365			 */
1366			for (i = 0; i < n; i++) {
1367				ASSERT(orig[i] != NULL);
1368				c = tgts[i];
1369				ASSERT3S(c, >=, 0);
1370				ASSERT3S(c, <, rm->rm_cols);
1371				rc = &rm->rm_col[c];
1372				bcopy(rc->rc_data, orig[i], rc->rc_size);
1373			}
1374
1375			/*
1376			 * Attempt a reconstruction and exit the outer loop on
1377			 * success.
1378			 */
1379			code = vdev_raidz_reconstruct(rm, tgts, n);
1380			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1381				for (i = 0; i < n; i++) {
1382					c = tgts[i];
1383					rc = &rm->rm_col[c];
1384					ASSERT(rc->rc_error == 0);
1385					rc->rc_error = ECKSUM;
1386				}
1387
1388				ret = code;
1389				goto done;
1390			}
1391
1392			/*
1393			 * Restore the original data.
1394			 */
1395			for (i = 0; i < n; i++) {
1396				c = tgts[i];
1397				rc = &rm->rm_col[c];
1398				bcopy(orig[i], rc->rc_data, rc->rc_size);
1399			}
1400
1401			do {
1402				/*
1403				 * Find the next valid column after the current
1404				 * position..
1405				 */
1406				for (next = tgts[current] + 1;
1407				    next < rm->rm_cols &&
1408				    rm->rm_col[next].rc_error != 0; next++)
1409					continue;
1410
1411				ASSERT(next <= tgts[current + 1]);
1412
1413				/*
1414				 * If that spot is available, we're done here.
1415				 */
1416				if (next != tgts[current + 1])
1417					break;
1418
1419				/*
1420				 * Otherwise, find the next valid column after
1421				 * the previous position.
1422				 */
1423				for (c = tgts[current - 1] + 1;
1424				    rm->rm_col[c].rc_error != 0; c++)
1425					continue;
1426
1427				tgts[current] = c;
1428				current++;
1429
1430			} while (current != n);
1431		}
1432	}
1433	n--;
1434done:
1435	for (i = n - 1; i >= 0; i--) {
1436		zfs_free(orig[i], rm->rm_col[0].rc_size);
1437	}
1438
1439	return (ret);
1440}
1441
1442static int
1443vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1444    off_t offset, size_t bytes)
1445{
1446	vdev_t *tvd = vd->v_top;
1447	vdev_t *cvd;
1448	raidz_map_t *rm;
1449	raidz_col_t *rc;
1450	int c, error;
1451	int unexpected_errors;
1452	int parity_errors;
1453	int parity_untried;
1454	int data_errors;
1455	int total_errors;
1456	int n;
1457	int tgts[VDEV_RAIDZ_MAXPARITY];
1458	int code;
1459
1460	rc = NULL;	/* gcc */
1461	error = 0;
1462
1463	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1464	    vd->v_nchildren, vd->v_nparity);
1465
1466	/*
1467	 * Iterate over the columns in reverse order so that we hit the parity
1468	 * last -- any errors along the way will force us to read the parity.
1469	 */
1470	for (c = rm->rm_cols - 1; c >= 0; c--) {
1471		rc = &rm->rm_col[c];
1472		cvd = vdev_child(vd, rc->rc_devidx);
1473		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1474			if (c >= rm->rm_firstdatacol)
1475				rm->rm_missingdata++;
1476			else
1477				rm->rm_missingparity++;
1478			rc->rc_error = ENXIO;
1479			rc->rc_tried = 1;	/* don't even try */
1480			rc->rc_skipped = 1;
1481			continue;
1482		}
1483#if 0		/* XXX: Too hard for the boot code. */
1484		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1485			if (c >= rm->rm_firstdatacol)
1486				rm->rm_missingdata++;
1487			else
1488				rm->rm_missingparity++;
1489			rc->rc_error = ESTALE;
1490			rc->rc_skipped = 1;
1491			continue;
1492		}
1493#endif
1494		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1495			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1496			    rc->rc_offset, rc->rc_size);
1497			rc->rc_tried = 1;
1498			rc->rc_skipped = 0;
1499		}
1500	}
1501
1502reconstruct:
1503	unexpected_errors = 0;
1504	parity_errors = 0;
1505	parity_untried = 0;
1506	data_errors = 0;
1507	total_errors = 0;
1508
1509	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1510	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1511
1512	for (c = 0; c < rm->rm_cols; c++) {
1513		rc = &rm->rm_col[c];
1514
1515		if (rc->rc_error) {
1516			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1517
1518			if (c < rm->rm_firstdatacol)
1519				parity_errors++;
1520			else
1521				data_errors++;
1522
1523			if (!rc->rc_skipped)
1524				unexpected_errors++;
1525
1526			total_errors++;
1527		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1528			parity_untried++;
1529		}
1530	}
1531
1532	/*
1533	 * There are three potential phases for a read:
1534	 *	1. produce valid data from the columns read
1535	 *	2. read all disks and try again
1536	 *	3. perform combinatorial reconstruction
1537	 *
1538	 * Each phase is progressively both more expensive and less likely to
1539	 * occur. If we encounter more errors than we can repair or all phases
1540	 * fail, we have no choice but to return an error.
1541	 */
1542
1543	/*
1544	 * If the number of errors we saw was correctable -- less than or equal
1545	 * to the number of parity disks read -- attempt to produce data that
1546	 * has a valid checksum. Naturally, this case applies in the absence of
1547	 * any errors.
1548	 */
1549	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1550		if (data_errors == 0) {
1551			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1552				/*
1553				 * If we read parity information (unnecessarily
1554				 * as it happens since no reconstruction was
1555				 * needed) regenerate and verify the parity.
1556				 * We also regenerate parity when resilvering
1557				 * so we can write it out to the failed device
1558				 * later.
1559				 */
1560				if (parity_errors + parity_untried <
1561				    rm->rm_firstdatacol) {
1562					n = raidz_parity_verify(rm);
1563					unexpected_errors += n;
1564					ASSERT(parity_errors + n <=
1565					    rm->rm_firstdatacol);
1566				}
1567				goto done;
1568			}
1569		} else {
1570			/*
1571			 * We either attempt to read all the parity columns or
1572			 * none of them. If we didn't try to read parity, we
1573			 * wouldn't be here in the correctable case. There must
1574			 * also have been fewer parity errors than parity
1575			 * columns or, again, we wouldn't be in this code path.
1576			 */
1577			ASSERT(parity_untried == 0);
1578			ASSERT(parity_errors < rm->rm_firstdatacol);
1579
1580			/*
1581			 * Identify the data columns that reported an error.
1582			 */
1583			n = 0;
1584			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1585				rc = &rm->rm_col[c];
1586				if (rc->rc_error != 0) {
1587					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1588					tgts[n++] = c;
1589				}
1590			}
1591
1592			ASSERT(rm->rm_firstdatacol >= n);
1593
1594			code = vdev_raidz_reconstruct(rm, tgts, n);
1595
1596			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1597				/*
1598				 * If we read more parity disks than were used
1599				 * for reconstruction, confirm that the other
1600				 * parity disks produced correct data. This
1601				 * routine is suboptimal in that it regenerates
1602				 * the parity that we already used in addition
1603				 * to the parity that we're attempting to
1604				 * verify, but this should be a relatively
1605				 * uncommon case, and can be optimized if it
1606				 * becomes a problem. Note that we regenerate
1607				 * parity when resilvering so we can write it
1608				 * out to failed devices later.
1609				 */
1610				if (parity_errors < rm->rm_firstdatacol - n) {
1611					n = raidz_parity_verify(rm);
1612					unexpected_errors += n;
1613					ASSERT(parity_errors + n <=
1614					    rm->rm_firstdatacol);
1615				}
1616
1617				goto done;
1618			}
1619		}
1620	}
1621
1622	/*
1623	 * This isn't a typical situation -- either we got a read
1624	 * error or a child silently returned bad data. Read every
1625	 * block so we can try again with as much data and parity as
1626	 * we can track down. If we've already been through once
1627	 * before, all children will be marked as tried so we'll
1628	 * proceed to combinatorial reconstruction.
1629	 */
1630	unexpected_errors = 1;
1631	rm->rm_missingdata = 0;
1632	rm->rm_missingparity = 0;
1633
1634	n = 0;
1635	for (c = 0; c < rm->rm_cols; c++) {
1636		rc = &rm->rm_col[c];
1637
1638		if (rc->rc_tried)
1639			continue;
1640
1641		cvd = vdev_child(vd, rc->rc_devidx);
1642		ASSERT(cvd != NULL);
1643		rc->rc_error = cvd->v_read(cvd, NULL,
1644		    rc->rc_data, rc->rc_offset, rc->rc_size);
1645		if (rc->rc_error == 0)
1646			n++;
1647		rc->rc_tried = 1;
1648		rc->rc_skipped = 0;
1649	}
1650	/*
1651	 * If we managed to read anything more, retry the
1652	 * reconstruction.
1653	 */
1654	if (n > 0)
1655		goto reconstruct;
1656
1657	/*
1658	 * At this point we've attempted to reconstruct the data given the
1659	 * errors we detected, and we've attempted to read all columns. There
1660	 * must, therefore, be one or more additional problems -- silent errors
1661	 * resulting in invalid data rather than explicit I/O errors resulting
1662	 * in absent data. We check if there is enough additional data to
1663	 * possibly reconstruct the data and then perform combinatorial
1664	 * reconstruction over all possible combinations. If that fails,
1665	 * we're cooked.
1666	 */
1667	if (total_errors > rm->rm_firstdatacol) {
1668		error = EIO;
1669	} else if (total_errors < rm->rm_firstdatacol &&
1670	    (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
1671	     total_errors, data_errors)) != 0) {
1672		/*
1673		 * If we didn't use all the available parity for the
1674		 * combinatorial reconstruction, verify that the remaining
1675		 * parity is correct.
1676		 */
1677		if (code != (1 << rm->rm_firstdatacol) - 1)
1678			(void) raidz_parity_verify(rm);
1679	} else {
1680		/*
1681		 * We're here because either:
1682		 *
1683		 *	total_errors == rm_first_datacol, or
1684		 *	vdev_raidz_combrec() failed
1685		 *
1686		 * In either case, there is enough bad data to prevent
1687		 * reconstruction.
1688		 *
1689		 * Start checksum ereports for all children which haven't
1690		 * failed, and the IO wasn't speculative.
1691		 */
1692		error = ECKSUM;
1693	}
1694
1695done:
1696	vdev_raidz_map_free(rm);
1697
1698	return (error);
1699}
1700