1185029Spjd/* 2185029Spjd * CDDL HEADER START 3185029Spjd * 4185029Spjd * The contents of this file are subject to the terms of the 5185029Spjd * Common Development and Distribution License (the "License"). 6185029Spjd * You may not use this file except in compliance with the License. 7185029Spjd * 8185029Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9185029Spjd * or http://www.opensolaris.org/os/licensing. 10185029Spjd * See the License for the specific language governing permissions 11185029Spjd * and limitations under the License. 12185029Spjd * 13185029Spjd * When distributing Covered Code, include this CDDL HEADER in each 14185029Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15185029Spjd * If applicable, add the following below this CDDL HEADER, with the 16185029Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17185029Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18185029Spjd * 19185029Spjd * CDDL HEADER END 20185029Spjd */ 21185029Spjd/* 22185029Spjd * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23185029Spjd * Use is subject to license terms. 24185029Spjd */ 25185029Spjd 26185029Spjd#include <sys/cdefs.h> 27192640Sdes__FBSDID("$FreeBSD$"); 28185029Spjd 29185029Spjdstatic uint64_t zfs_crc64_table[256]; 30185029Spjd 31219089Spjd#define ECKSUM 666 32219089Spjd 33219089Spjd#define ASSERT(...) do { } while (0) 34219089Spjd#define ASSERT3U(...) do { } while (0) 35219089Spjd#define ASSERT3S(...) do { } while (0) 36219089Spjd 37219089Spjd#define panic(...) do { \ 38219089Spjd printf(__VA_ARGS__); \ 39219089Spjd for (;;) ; \ 40219089Spjd} while (0) 41219089Spjd 42219089Spjd#define kmem_alloc(size, flag) zfs_alloc((size)) 43219089Spjd#define kmem_free(ptr, size) zfs_free((ptr), (size)) 44219089Spjd 45185029Spjdstatic void 46185029Spjdzfs_init_crc(void) 47185029Spjd{ 48185029Spjd int i, j; 49185029Spjd uint64_t *ct; 50185029Spjd 51185029Spjd /* 52185029Spjd * Calculate the crc64 table (used for the zap hash 53185029Spjd * function). 54185029Spjd */ 55185029Spjd if (zfs_crc64_table[128] != ZFS_CRC64_POLY) { 56185029Spjd memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table)); 57185029Spjd for (i = 0; i < 256; i++) 58185029Spjd for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 59185029Spjd *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 60185029Spjd } 61185029Spjd} 62185029Spjd 63185029Spjdstatic void 64185029Spjdzio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) 65185029Spjd{ 66185029Spjd ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); 67185029Spjd} 68185029Spjd 69185029Spjd/* 70185029Spjd * Signature for checksum functions. 71185029Spjd */ 72185029Spjdtypedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); 73185029Spjd 74185029Spjd/* 75185029Spjd * Information about each checksum function. 76185029Spjd */ 77185029Spjdtypedef struct zio_checksum_info { 78185029Spjd zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ 79185029Spjd int ci_correctable; /* number of correctable bits */ 80219089Spjd int ci_eck; /* uses zio embedded checksum? */ 81219089Spjd int ci_dedup; /* strong enough for dedup? */ 82185029Spjd const char *ci_name; /* descriptive name */ 83185029Spjd} zio_checksum_info_t; 84185029Spjd 85185029Spjd#include "fletcher.c" 86185029Spjd#include "sha256.c" 87185029Spjd 88185029Spjdstatic zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { 89219089Spjd {{NULL, NULL}, 0, 0, 0, "inherit"}, 90219089Spjd {{NULL, NULL}, 0, 0, 0, "on"}, 91219089Spjd {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"}, 92219089Spjd {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"}, 93219089Spjd {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"}, 94219089Spjd {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"}, 95219089Spjd {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"}, 96219089Spjd {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, 97219089Spjd {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "SHA256"}, 98219089Spjd {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zillog2"}, 99185029Spjd}; 100185029Spjd 101219089Spjd 102185029Spjd/* 103185029Spjd * Common signature for all zio compress/decompress functions. 104185029Spjd */ 105185029Spjdtypedef size_t zio_compress_func_t(void *src, void *dst, 106185029Spjd size_t s_len, size_t d_len, int); 107185029Spjdtypedef int zio_decompress_func_t(void *src, void *dst, 108185029Spjd size_t s_len, size_t d_len, int); 109185029Spjd 110185029Spjd/* 111185029Spjd * Information about each compression function. 112185029Spjd */ 113185029Spjdtypedef struct zio_compress_info { 114185029Spjd zio_compress_func_t *ci_compress; /* compression function */ 115185029Spjd zio_decompress_func_t *ci_decompress; /* decompression function */ 116185029Spjd int ci_level; /* level parameter */ 117185029Spjd const char *ci_name; /* algorithm name */ 118185029Spjd} zio_compress_info_t; 119185029Spjd 120185029Spjd#include "lzjb.c" 121219089Spjd#include "zle.c" 122246586Sdelphij#include "lz4.c" 123185029Spjd 124185029Spjd/* 125185029Spjd * Compression vectors. 126185029Spjd */ 127185029Spjdstatic zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { 128185029Spjd {NULL, NULL, 0, "inherit"}, 129185029Spjd {NULL, NULL, 0, "on"}, 130185029Spjd {NULL, NULL, 0, "uncompressed"}, 131185029Spjd {NULL, lzjb_decompress, 0, "lzjb"}, 132185029Spjd {NULL, NULL, 0, "empty"}, 133185029Spjd {NULL, NULL, 1, "gzip-1"}, 134185029Spjd {NULL, NULL, 2, "gzip-2"}, 135185029Spjd {NULL, NULL, 3, "gzip-3"}, 136185029Spjd {NULL, NULL, 4, "gzip-4"}, 137185029Spjd {NULL, NULL, 5, "gzip-5"}, 138185029Spjd {NULL, NULL, 6, "gzip-6"}, 139185029Spjd {NULL, NULL, 7, "gzip-7"}, 140185029Spjd {NULL, NULL, 8, "gzip-8"}, 141185029Spjd {NULL, NULL, 9, "gzip-9"}, 142219089Spjd {NULL, zle_decompress, 64, "zle"}, 143246586Sdelphij {NULL, lz4_decompress, 0, "lz4"}, 144185029Spjd}; 145185029Spjd 146219089Spjdstatic void 147219089Spjdbyteswap_uint64_array(void *vbuf, size_t size) 148219089Spjd{ 149219089Spjd uint64_t *buf = vbuf; 150219089Spjd size_t count = size >> 3; 151219089Spjd int i; 152219089Spjd 153219089Spjd ASSERT((size & 7) == 0); 154219089Spjd 155219089Spjd for (i = 0; i < count; i++) 156219089Spjd buf[i] = BSWAP_64(buf[i]); 157219089Spjd} 158219089Spjd 159219089Spjd/* 160219089Spjd * Set the external verifier for a gang block based on <vdev, offset, txg>, 161219089Spjd * a tuple which is guaranteed to be unique for the life of the pool. 162219089Spjd */ 163219089Spjdstatic void 164219089Spjdzio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) 165219089Spjd{ 166219089Spjd const dva_t *dva = BP_IDENTITY(bp); 167219089Spjd uint64_t txg = BP_PHYSICAL_BIRTH(bp); 168219089Spjd 169219089Spjd ASSERT(BP_IS_GANG(bp)); 170219089Spjd 171219089Spjd ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); 172219089Spjd} 173219089Spjd 174219089Spjd/* 175219089Spjd * Set the external verifier for a label block based on its offset. 176219089Spjd * The vdev is implicit, and the txg is unknowable at pool open time -- 177219089Spjd * hence the logic in vdev_uberblock_load() to find the most recent copy. 178219089Spjd */ 179219089Spjdstatic void 180219089Spjdzio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) 181219089Spjd{ 182219089Spjd ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); 183219089Spjd} 184219089Spjd 185185029Spjdstatic int 186226568Spjdzio_checksum_verify(const blkptr_t *bp, void *data) 187185029Spjd{ 188226568Spjd uint64_t size; 189226568Spjd unsigned int checksum; 190219089Spjd zio_checksum_info_t *ci; 191219089Spjd zio_cksum_t actual_cksum, expected_cksum, verifier; 192219089Spjd int byteswap; 193185029Spjd 194226568Spjd checksum = BP_GET_CHECKSUM(bp); 195226568Spjd size = BP_GET_PSIZE(bp); 196226568Spjd 197219089Spjd if (checksum >= ZIO_CHECKSUM_FUNCTIONS) 198185029Spjd return (EINVAL); 199219089Spjd ci = &zio_checksum_table[checksum]; 200219089Spjd if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL) 201219089Spjd return (EINVAL); 202185029Spjd 203219089Spjd if (ci->ci_eck) { 204219089Spjd zio_eck_t *eck; 205219089Spjd 206219089Spjd ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER || 207219089Spjd checksum == ZIO_CHECKSUM_LABEL); 208219089Spjd 209219089Spjd eck = (zio_eck_t *)((char *)data + size) - 1; 210219089Spjd 211219089Spjd if (checksum == ZIO_CHECKSUM_GANG_HEADER) 212219089Spjd zio_checksum_gang_verifier(&verifier, bp); 213219089Spjd else if (checksum == ZIO_CHECKSUM_LABEL) 214226568Spjd zio_checksum_label_verifier(&verifier, 215226568Spjd DVA_GET_OFFSET(BP_IDENTITY(bp))); 216219089Spjd else 217219089Spjd verifier = bp->blk_cksum; 218219089Spjd 219219089Spjd byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); 220219089Spjd 221219089Spjd if (byteswap) 222219089Spjd byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); 223219089Spjd 224219089Spjd expected_cksum = eck->zec_cksum; 225219089Spjd eck->zec_cksum = verifier; 226219089Spjd ci->ci_func[byteswap](data, size, &actual_cksum); 227219089Spjd eck->zec_cksum = expected_cksum; 228219089Spjd 229219089Spjd if (byteswap) 230219089Spjd byteswap_uint64_array(&expected_cksum, 231219089Spjd sizeof (zio_cksum_t)); 232185029Spjd } else { 233219089Spjd expected_cksum = bp->blk_cksum; 234185029Spjd ci->ci_func[0](data, size, &actual_cksum); 235185029Spjd } 236185029Spjd 237219089Spjd if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { 238185029Spjd /*printf("ZFS: read checksum failed\n");*/ 239185029Spjd return (EIO); 240185029Spjd } 241185029Spjd 242185029Spjd return (0); 243185029Spjd} 244185029Spjd 245185029Spjdstatic int 246185029Spjdzio_decompress_data(int cpfunc, void *src, uint64_t srcsize, 247185029Spjd void *dest, uint64_t destsize) 248185029Spjd{ 249219089Spjd zio_compress_info_t *ci; 250185029Spjd 251219089Spjd if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) { 252185097Sdfr printf("ZFS: unsupported compression algorithm %u\n", cpfunc); 253185029Spjd return (EIO); 254185029Spjd } 255185029Spjd 256219089Spjd ci = &zio_compress_table[cpfunc]; 257219089Spjd if (!ci->ci_decompress) { 258219089Spjd printf("ZFS: unsupported compression algorithm %s\n", 259219089Spjd ci->ci_name); 260219089Spjd return (EIO); 261219089Spjd } 262219089Spjd 263185029Spjd return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); 264185029Spjd} 265185029Spjd 266185029Spjdstatic uint64_t 267185029Spjdzap_hash(uint64_t salt, const char *name) 268185029Spjd{ 269185029Spjd const uint8_t *cp; 270185029Spjd uint8_t c; 271185029Spjd uint64_t crc = salt; 272185029Spjd 273219089Spjd ASSERT(crc != 0); 274219089Spjd ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 275185029Spjd for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) 276185029Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; 277185029Spjd 278185029Spjd /* 279185029Spjd * Only use 28 bits, since we need 4 bits in the cookie for the 280185029Spjd * collision differentiator. We MUST use the high bits, since 281185029Spjd * those are the onces that we first pay attention to when 282185029Spjd * chosing the bucket. 283185029Spjd */ 284185029Spjd crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 285185029Spjd 286185029Spjd return (crc); 287185029Spjd} 288192194Sdfr 289219089Spjdstatic void *zfs_alloc(size_t size); 290219089Spjdstatic void zfs_free(void *ptr, size_t size); 291192194Sdfr 292192194Sdfrtypedef struct raidz_col { 293192194Sdfr uint64_t rc_devidx; /* child device index for I/O */ 294192194Sdfr uint64_t rc_offset; /* device offset */ 295192194Sdfr uint64_t rc_size; /* I/O size */ 296192194Sdfr void *rc_data; /* I/O data */ 297192194Sdfr int rc_error; /* I/O error for this device */ 298192194Sdfr uint8_t rc_tried; /* Did we attempt this I/O column? */ 299192194Sdfr uint8_t rc_skipped; /* Did we skip this I/O column? */ 300192194Sdfr} raidz_col_t; 301192194Sdfr 302219089Spjdtypedef struct raidz_map { 303219089Spjd uint64_t rm_cols; /* Regular column count */ 304219089Spjd uint64_t rm_scols; /* Count including skipped columns */ 305219089Spjd uint64_t rm_bigcols; /* Number of oversized columns */ 306219089Spjd uint64_t rm_asize; /* Actual total I/O size */ 307219089Spjd uint64_t rm_missingdata; /* Count of missing data devices */ 308219089Spjd uint64_t rm_missingparity; /* Count of missing parity devices */ 309219089Spjd uint64_t rm_firstdatacol; /* First data column/parity count */ 310219089Spjd uint64_t rm_nskip; /* Skipped sectors for padding */ 311219089Spjd uint64_t rm_skipstart; /* Column index of padding start */ 312219089Spjd uintptr_t rm_reports; /* # of referencing checksum reports */ 313219089Spjd uint8_t rm_freed; /* map no longer has referencing ZIO */ 314219089Spjd uint8_t rm_ecksuminjected; /* checksum error was injected */ 315219089Spjd raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 316219089Spjd} raidz_map_t; 317219089Spjd 318192194Sdfr#define VDEV_RAIDZ_P 0 319192194Sdfr#define VDEV_RAIDZ_Q 1 320219089Spjd#define VDEV_RAIDZ_R 2 321192194Sdfr 322219089Spjd#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 323219089Spjd#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 324192194Sdfr 325219089Spjd/* 326219089Spjd * We provide a mechanism to perform the field multiplication operation on a 327219089Spjd * 64-bit value all at once rather than a byte at a time. This works by 328219089Spjd * creating a mask from the top bit in each byte and using that to 329219089Spjd * conditionally apply the XOR of 0x1d. 330219089Spjd */ 331219089Spjd#define VDEV_RAIDZ_64MUL_2(x, mask) \ 332219089Spjd{ \ 333219089Spjd (mask) = (x) & 0x8080808080808080ULL; \ 334219089Spjd (mask) = ((mask) << 1) - ((mask) >> 7); \ 335219089Spjd (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 336225531Savg ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 337219089Spjd} 338192194Sdfr 339219089Spjd#define VDEV_RAIDZ_64MUL_4(x, mask) \ 340219089Spjd{ \ 341219089Spjd VDEV_RAIDZ_64MUL_2((x), mask); \ 342219089Spjd VDEV_RAIDZ_64MUL_2((x), mask); \ 343192194Sdfr} 344192194Sdfr 345192194Sdfr/* 346192194Sdfr * These two tables represent powers and logs of 2 in the Galois field defined 347192194Sdfr * above. These values were computed by repeatedly multiplying by 2 as above. 348192194Sdfr */ 349192194Sdfrstatic const uint8_t vdev_raidz_pow2[256] = { 350192194Sdfr 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 351192194Sdfr 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 352192194Sdfr 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 353192194Sdfr 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 354192194Sdfr 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 355192194Sdfr 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 356192194Sdfr 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 357192194Sdfr 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 358192194Sdfr 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 359192194Sdfr 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 360192194Sdfr 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 361192194Sdfr 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 362192194Sdfr 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 363192194Sdfr 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 364192194Sdfr 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 365192194Sdfr 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 366192194Sdfr 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 367192194Sdfr 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 368192194Sdfr 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 369192194Sdfr 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 370192194Sdfr 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 371192194Sdfr 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 372192194Sdfr 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 373192194Sdfr 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 374192194Sdfr 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 375192194Sdfr 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 376192194Sdfr 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 377192194Sdfr 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 378192194Sdfr 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 379192194Sdfr 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 380192194Sdfr 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 381192194Sdfr 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 382192194Sdfr}; 383192194Sdfrstatic const uint8_t vdev_raidz_log2[256] = { 384192194Sdfr 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 385192194Sdfr 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 386192194Sdfr 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 387192194Sdfr 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 388192194Sdfr 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 389192194Sdfr 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 390192194Sdfr 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 391192194Sdfr 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 392192194Sdfr 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 393192194Sdfr 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 394192194Sdfr 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 395192194Sdfr 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 396192194Sdfr 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 397192194Sdfr 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 398192194Sdfr 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 399192194Sdfr 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 400192194Sdfr 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 401192194Sdfr 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 402192194Sdfr 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 403192194Sdfr 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 404192194Sdfr 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 405192194Sdfr 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 406192194Sdfr 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 407192194Sdfr 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 408192194Sdfr 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 409192194Sdfr 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 410192194Sdfr 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 411192194Sdfr 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 412192194Sdfr 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 413192194Sdfr 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 414192194Sdfr 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 415192194Sdfr 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 416192194Sdfr}; 417192194Sdfr 418192194Sdfr/* 419192194Sdfr * Multiply a given number by 2 raised to the given power. 420192194Sdfr */ 421192194Sdfrstatic uint8_t 422192194Sdfrvdev_raidz_exp2(uint8_t a, int exp) 423192194Sdfr{ 424192194Sdfr if (a == 0) 425192194Sdfr return (0); 426192194Sdfr 427219089Spjd ASSERT(exp >= 0); 428219089Spjd ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 429192194Sdfr 430192194Sdfr exp += vdev_raidz_log2[a]; 431192194Sdfr if (exp > 255) 432192194Sdfr exp -= 255; 433192194Sdfr 434192194Sdfr return (vdev_raidz_pow2[exp]); 435192194Sdfr} 436192194Sdfr 437192194Sdfrstatic void 438219089Spjdvdev_raidz_generate_parity_p(raidz_map_t *rm) 439192194Sdfr{ 440219089Spjd uint64_t *p, *src, pcount, ccount, i; 441192194Sdfr int c; 442192194Sdfr 443219089Spjd pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 444192194Sdfr 445219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 446219089Spjd src = rm->rm_col[c].rc_data; 447219089Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 448219089Spjd ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 449192194Sdfr 450219089Spjd if (c == rm->rm_firstdatacol) { 451219089Spjd ASSERT(ccount == pcount); 452219089Spjd for (i = 0; i < ccount; i++, src++, p++) { 453192194Sdfr *p = *src; 454192194Sdfr } 455219089Spjd } else { 456219089Spjd ASSERT(ccount <= pcount); 457219089Spjd for (i = 0; i < ccount; i++, src++, p++) { 458219089Spjd *p ^= *src; 459219089Spjd } 460219089Spjd } 461219089Spjd } 462219089Spjd} 463219089Spjd 464219089Spjdstatic void 465219089Spjdvdev_raidz_generate_parity_pq(raidz_map_t *rm) 466219089Spjd{ 467219089Spjd uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 468219089Spjd int c; 469219089Spjd 470219089Spjd pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 471219089Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 472219089Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_size); 473219089Spjd 474219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 475219089Spjd src = rm->rm_col[c].rc_data; 476219089Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 477219089Spjd q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 478219089Spjd 479219089Spjd ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 480219089Spjd 481219089Spjd if (c == rm->rm_firstdatacol) { 482219089Spjd ASSERT(ccnt == pcnt || ccnt == 0); 483219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++) { 484219089Spjd *p = *src; 485219089Spjd *q = *src; 486219089Spjd } 487219089Spjd for (; i < pcnt; i++, src++, p++, q++) { 488219089Spjd *p = 0; 489192194Sdfr *q = 0; 490192194Sdfr } 491192194Sdfr } else { 492219089Spjd ASSERT(ccnt <= pcnt); 493192194Sdfr 494192194Sdfr /* 495219089Spjd * Apply the algorithm described above by multiplying 496219089Spjd * the previous result and adding in the new value. 497192194Sdfr */ 498219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++) { 499219089Spjd *p ^= *src; 500219089Spjd 501219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 502192194Sdfr *q ^= *src; 503192194Sdfr } 504192194Sdfr 505192194Sdfr /* 506192194Sdfr * Treat short columns as though they are full of 0s. 507219089Spjd * Note that there's therefore nothing needed for P. 508192194Sdfr */ 509219089Spjd for (; i < pcnt; i++, q++) { 510219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 511192194Sdfr } 512192194Sdfr } 513192194Sdfr } 514192194Sdfr} 515192194Sdfr 516192194Sdfrstatic void 517219089Spjdvdev_raidz_generate_parity_pqr(raidz_map_t *rm) 518192194Sdfr{ 519219089Spjd uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 520219089Spjd int c; 521192194Sdfr 522219089Spjd pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 523219089Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 524219089Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_size); 525219089Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 526219089Spjd rm->rm_col[VDEV_RAIDZ_R].rc_size); 527192194Sdfr 528219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 529219089Spjd src = rm->rm_col[c].rc_data; 530219089Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 531219089Spjd q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 532219089Spjd r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 533192194Sdfr 534219089Spjd ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 535192194Sdfr 536219089Spjd if (c == rm->rm_firstdatacol) { 537219089Spjd ASSERT(ccnt == pcnt || ccnt == 0); 538219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 539219089Spjd *p = *src; 540219089Spjd *q = *src; 541219089Spjd *r = *src; 542192194Sdfr } 543219089Spjd for (; i < pcnt; i++, src++, p++, q++, r++) { 544219089Spjd *p = 0; 545219089Spjd *q = 0; 546219089Spjd *r = 0; 547192194Sdfr } 548219089Spjd } else { 549219089Spjd ASSERT(ccnt <= pcnt); 550192194Sdfr 551192194Sdfr /* 552219089Spjd * Apply the algorithm described above by multiplying 553219089Spjd * the previous result and adding in the new value. 554192194Sdfr */ 555219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 556219089Spjd *p ^= *src; 557219089Spjd 558219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 559219089Spjd *q ^= *src; 560219089Spjd 561219089Spjd VDEV_RAIDZ_64MUL_4(*r, mask); 562219089Spjd *r ^= *src; 563192194Sdfr } 564192194Sdfr 565219089Spjd /* 566219089Spjd * Treat short columns as though they are full of 0s. 567219089Spjd * Note that there's therefore nothing needed for P. 568219089Spjd */ 569219089Spjd for (; i < pcnt; i++, q++, r++) { 570219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 571219089Spjd VDEV_RAIDZ_64MUL_4(*r, mask); 572192194Sdfr } 573192194Sdfr } 574192194Sdfr } 575219089Spjd} 576192194Sdfr 577219089Spjd/* 578219089Spjd * Generate RAID parity in the first virtual columns according to the number of 579219089Spjd * parity columns available. 580219089Spjd */ 581219089Spjdstatic void 582219089Spjdvdev_raidz_generate_parity(raidz_map_t *rm) 583219089Spjd{ 584219089Spjd switch (rm->rm_firstdatacol) { 585219089Spjd case 1: 586219089Spjd vdev_raidz_generate_parity_p(rm); 587219089Spjd break; 588219089Spjd case 2: 589219089Spjd vdev_raidz_generate_parity_pq(rm); 590219089Spjd break; 591219089Spjd case 3: 592219089Spjd vdev_raidz_generate_parity_pqr(rm); 593219089Spjd break; 594219089Spjd default: 595219089Spjd panic("invalid RAID-Z configuration"); 596219089Spjd } 597219089Spjd} 598192194Sdfr 599219089Spjd/* BEGIN CSTYLED */ 600219089Spjd/* 601219089Spjd * In the general case of reconstruction, we must solve the system of linear 602219089Spjd * equations defined by the coeffecients used to generate parity as well as 603219089Spjd * the contents of the data and parity disks. This can be expressed with 604219089Spjd * vectors for the original data (D) and the actual data (d) and parity (p) 605219089Spjd * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 606219089Spjd * 607219089Spjd * __ __ __ __ 608219089Spjd * | | __ __ | p_0 | 609219089Spjd * | V | | D_0 | | p_m-1 | 610219089Spjd * | | x | : | = | d_0 | 611219089Spjd * | I | | D_n-1 | | : | 612219089Spjd * | | ~~ ~~ | d_n-1 | 613219089Spjd * ~~ ~~ ~~ ~~ 614219089Spjd * 615219089Spjd * I is simply a square identity matrix of size n, and V is a vandermonde 616219089Spjd * matrix defined by the coeffecients we chose for the various parity columns 617219089Spjd * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 618219089Spjd * computation as well as linear separability. 619219089Spjd * 620219089Spjd * __ __ __ __ 621219089Spjd * | 1 .. 1 1 1 | | p_0 | 622219089Spjd * | 2^n-1 .. 4 2 1 | __ __ | : | 623219089Spjd * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 624219089Spjd * | 1 .. 0 0 0 | | D_1 | | d_0 | 625219089Spjd * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 626219089Spjd * | : : : : | | : | | d_2 | 627219089Spjd * | 0 .. 1 0 0 | | D_n-1 | | : | 628219089Spjd * | 0 .. 0 1 0 | ~~ ~~ | : | 629219089Spjd * | 0 .. 0 0 1 | | d_n-1 | 630219089Spjd * ~~ ~~ ~~ ~~ 631219089Spjd * 632219089Spjd * Note that I, V, d, and p are known. To compute D, we must invert the 633219089Spjd * matrix and use the known data and parity values to reconstruct the unknown 634219089Spjd * data values. We begin by removing the rows in V|I and d|p that correspond 635219089Spjd * to failed or missing columns; we then make V|I square (n x n) and d|p 636219089Spjd * sized n by removing rows corresponding to unused parity from the bottom up 637219089Spjd * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 638219089Spjd * using Gauss-Jordan elimination. In the example below we use m=3 parity 639219089Spjd * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 640219089Spjd * __ __ 641219089Spjd * | 1 1 1 1 1 1 1 1 | 642219089Spjd * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 643219089Spjd * | 19 205 116 29 64 16 4 1 | / / 644219089Spjd * | 1 0 0 0 0 0 0 0 | / / 645219089Spjd * | 0 1 0 0 0 0 0 0 | <--' / 646219089Spjd * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 647219089Spjd * | 0 0 0 1 0 0 0 0 | 648219089Spjd * | 0 0 0 0 1 0 0 0 | 649219089Spjd * | 0 0 0 0 0 1 0 0 | 650219089Spjd * | 0 0 0 0 0 0 1 0 | 651219089Spjd * | 0 0 0 0 0 0 0 1 | 652219089Spjd * ~~ ~~ 653219089Spjd * __ __ 654219089Spjd * | 1 1 1 1 1 1 1 1 | 655219089Spjd * | 128 64 32 16 8 4 2 1 | 656219089Spjd * | 19 205 116 29 64 16 4 1 | 657219089Spjd * | 1 0 0 0 0 0 0 0 | 658219089Spjd * | 0 1 0 0 0 0 0 0 | 659219089Spjd * (V|I)' = | 0 0 1 0 0 0 0 0 | 660219089Spjd * | 0 0 0 1 0 0 0 0 | 661219089Spjd * | 0 0 0 0 1 0 0 0 | 662219089Spjd * | 0 0 0 0 0 1 0 0 | 663219089Spjd * | 0 0 0 0 0 0 1 0 | 664219089Spjd * | 0 0 0 0 0 0 0 1 | 665219089Spjd * ~~ ~~ 666219089Spjd * 667219089Spjd * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 668219089Spjd * have carefully chosen the seed values 1, 2, and 4 to ensure that this 669219089Spjd * matrix is not singular. 670219089Spjd * __ __ 671219089Spjd * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 672219089Spjd * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 673219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 674219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 675219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 676219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 677219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 678219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 679219089Spjd * ~~ ~~ 680219089Spjd * __ __ 681219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 682219089Spjd * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 683219089Spjd * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 684219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 685219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 686219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 687219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 688219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 689219089Spjd * ~~ ~~ 690219089Spjd * __ __ 691219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 692219089Spjd * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 693219089Spjd * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 694219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 695219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 696219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 697219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 698219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 699219089Spjd * ~~ ~~ 700219089Spjd * __ __ 701219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 702219089Spjd * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 703219089Spjd * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 704219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 705219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 706219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 707219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 708219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 709219089Spjd * ~~ ~~ 710219089Spjd * __ __ 711219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 712219089Spjd * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 713219089Spjd * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 714219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 715219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 716219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 717219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 718219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 719219089Spjd * ~~ ~~ 720219089Spjd * __ __ 721219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 722219089Spjd * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 723219089Spjd * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 724219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 725219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 726219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 727219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 728219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 729219089Spjd * ~~ ~~ 730219089Spjd * __ __ 731219089Spjd * | 0 0 1 0 0 0 0 0 | 732219089Spjd * | 167 100 5 41 159 169 217 208 | 733219089Spjd * | 166 100 4 40 158 168 216 209 | 734219089Spjd * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 735219089Spjd * | 0 0 0 0 1 0 0 0 | 736219089Spjd * | 0 0 0 0 0 1 0 0 | 737219089Spjd * | 0 0 0 0 0 0 1 0 | 738219089Spjd * | 0 0 0 0 0 0 0 1 | 739219089Spjd * ~~ ~~ 740219089Spjd * 741219089Spjd * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 742219089Spjd * of the missing data. 743219089Spjd * 744219089Spjd * As is apparent from the example above, the only non-trivial rows in the 745219089Spjd * inverse matrix correspond to the data disks that we're trying to 746219089Spjd * reconstruct. Indeed, those are the only rows we need as the others would 747219089Spjd * only be useful for reconstructing data known or assumed to be valid. For 748219089Spjd * that reason, we only build the coefficients in the rows that correspond to 749219089Spjd * targeted columns. 750219089Spjd */ 751219089Spjd/* END CSTYLED */ 752219089Spjd 753219089Spjdstatic void 754219089Spjdvdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 755219089Spjd uint8_t **rows) 756219089Spjd{ 757219089Spjd int i, j; 758219089Spjd int pow; 759219089Spjd 760219089Spjd ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 761219089Spjd 762219089Spjd /* 763219089Spjd * Fill in the missing rows of interest. 764219089Spjd */ 765219089Spjd for (i = 0; i < nmap; i++) { 766219089Spjd ASSERT3S(0, <=, map[i]); 767219089Spjd ASSERT3S(map[i], <=, 2); 768219089Spjd 769219089Spjd pow = map[i] * n; 770219089Spjd if (pow > 255) 771219089Spjd pow -= 255; 772219089Spjd ASSERT(pow <= 255); 773219089Spjd 774219089Spjd for (j = 0; j < n; j++) { 775219089Spjd pow -= map[i]; 776219089Spjd if (pow < 0) 777219089Spjd pow += 255; 778219089Spjd rows[i][j] = vdev_raidz_pow2[pow]; 779192194Sdfr } 780192194Sdfr } 781192194Sdfr} 782192194Sdfr 783192194Sdfrstatic void 784219089Spjdvdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 785219089Spjd uint8_t **rows, uint8_t **invrows, const uint8_t *used) 786192194Sdfr{ 787219089Spjd int i, j, ii, jj; 788219089Spjd uint8_t log; 789192194Sdfr 790219089Spjd /* 791219089Spjd * Assert that the first nmissing entries from the array of used 792219089Spjd * columns correspond to parity columns and that subsequent entries 793219089Spjd * correspond to data columns. 794219089Spjd */ 795219089Spjd for (i = 0; i < nmissing; i++) { 796219089Spjd ASSERT3S(used[i], <, rm->rm_firstdatacol); 797219089Spjd } 798219089Spjd for (; i < n; i++) { 799219089Spjd ASSERT3S(used[i], >=, rm->rm_firstdatacol); 800219089Spjd } 801192194Sdfr 802219089Spjd /* 803219089Spjd * First initialize the storage where we'll compute the inverse rows. 804219089Spjd */ 805219089Spjd for (i = 0; i < nmissing; i++) { 806219089Spjd for (j = 0; j < n; j++) { 807219089Spjd invrows[i][j] = (i == j) ? 1 : 0; 808219089Spjd } 809219089Spjd } 810192194Sdfr 811192194Sdfr /* 812219089Spjd * Subtract all trivial rows from the rows of consequence. 813192194Sdfr */ 814219089Spjd for (i = 0; i < nmissing; i++) { 815219089Spjd for (j = nmissing; j < n; j++) { 816219089Spjd ASSERT3U(used[j], >=, rm->rm_firstdatacol); 817219089Spjd jj = used[j] - rm->rm_firstdatacol; 818219089Spjd ASSERT3S(jj, <, n); 819219089Spjd invrows[i][j] = rows[i][jj]; 820219089Spjd rows[i][jj] = 0; 821219089Spjd } 822219089Spjd } 823192194Sdfr 824219089Spjd /* 825219089Spjd * For each of the rows of interest, we must normalize it and subtract 826219089Spjd * a multiple of it from the other rows. 827219089Spjd */ 828219089Spjd for (i = 0; i < nmissing; i++) { 829219089Spjd for (j = 0; j < missing[i]; j++) { 830219089Spjd ASSERT3U(rows[i][j], ==, 0); 831219089Spjd } 832219089Spjd ASSERT3U(rows[i][missing[i]], !=, 0); 833192194Sdfr 834219089Spjd /* 835219089Spjd * Compute the inverse of the first element and multiply each 836219089Spjd * element in the row by that value. 837219089Spjd */ 838219089Spjd log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 839192194Sdfr 840219089Spjd for (j = 0; j < n; j++) { 841219089Spjd rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 842219089Spjd invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 843219089Spjd } 844192194Sdfr 845219089Spjd for (ii = 0; ii < nmissing; ii++) { 846219089Spjd if (i == ii) 847219089Spjd continue; 848192194Sdfr 849219089Spjd ASSERT3U(rows[ii][missing[i]], !=, 0); 850219089Spjd 851219089Spjd log = vdev_raidz_log2[rows[ii][missing[i]]]; 852219089Spjd 853219089Spjd for (j = 0; j < n; j++) { 854219089Spjd rows[ii][j] ^= 855219089Spjd vdev_raidz_exp2(rows[i][j], log); 856219089Spjd invrows[ii][j] ^= 857219089Spjd vdev_raidz_exp2(invrows[i][j], log); 858219089Spjd } 859219089Spjd } 860219089Spjd } 861219089Spjd 862192194Sdfr /* 863219089Spjd * Verify that the data that is left in the rows are properly part of 864219089Spjd * an identity matrix. 865192194Sdfr */ 866219089Spjd for (i = 0; i < nmissing; i++) { 867219089Spjd for (j = 0; j < n; j++) { 868219089Spjd if (j == missing[i]) { 869219089Spjd ASSERT3U(rows[i][j], ==, 1); 870219089Spjd } else { 871219089Spjd ASSERT3U(rows[i][j], ==, 0); 872219089Spjd } 873219089Spjd } 874219089Spjd } 875219089Spjd} 876192194Sdfr 877219089Spjdstatic void 878219089Spjdvdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 879219089Spjd int *missing, uint8_t **invrows, const uint8_t *used) 880219089Spjd{ 881219089Spjd int i, j, x, cc, c; 882219089Spjd uint8_t *src; 883219089Spjd uint64_t ccount; 884219089Spjd uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 885219089Spjd uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 886219089Spjd uint8_t log, val; 887219089Spjd int ll; 888219089Spjd uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 889219089Spjd uint8_t *p, *pp; 890219089Spjd size_t psize; 891192194Sdfr 892219089Spjd log = 0; /* gcc */ 893219089Spjd psize = sizeof (invlog[0][0]) * n * nmissing; 894219089Spjd p = zfs_alloc(psize); 895192194Sdfr 896219089Spjd for (pp = p, i = 0; i < nmissing; i++) { 897219089Spjd invlog[i] = pp; 898219089Spjd pp += n; 899219089Spjd } 900192194Sdfr 901219089Spjd for (i = 0; i < nmissing; i++) { 902219089Spjd for (j = 0; j < n; j++) { 903219089Spjd ASSERT3U(invrows[i][j], !=, 0); 904219089Spjd invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 905219089Spjd } 906192194Sdfr } 907192194Sdfr 908219089Spjd for (i = 0; i < n; i++) { 909219089Spjd c = used[i]; 910219089Spjd ASSERT3U(c, <, rm->rm_cols); 911219089Spjd 912219089Spjd src = rm->rm_col[c].rc_data; 913219089Spjd ccount = rm->rm_col[c].rc_size; 914219089Spjd for (j = 0; j < nmissing; j++) { 915219089Spjd cc = missing[j] + rm->rm_firstdatacol; 916219089Spjd ASSERT3U(cc, >=, rm->rm_firstdatacol); 917219089Spjd ASSERT3U(cc, <, rm->rm_cols); 918219089Spjd ASSERT3U(cc, !=, c); 919219089Spjd 920219089Spjd dst[j] = rm->rm_col[cc].rc_data; 921219089Spjd dcount[j] = rm->rm_col[cc].rc_size; 922219089Spjd } 923219089Spjd 924219089Spjd ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 925219089Spjd 926219089Spjd for (x = 0; x < ccount; x++, src++) { 927219089Spjd if (*src != 0) 928219089Spjd log = vdev_raidz_log2[*src]; 929219089Spjd 930219089Spjd for (cc = 0; cc < nmissing; cc++) { 931219089Spjd if (x >= dcount[cc]) 932219089Spjd continue; 933219089Spjd 934219089Spjd if (*src == 0) { 935219089Spjd val = 0; 936219089Spjd } else { 937219089Spjd if ((ll = log + invlog[cc][i]) >= 255) 938219089Spjd ll -= 255; 939219089Spjd val = vdev_raidz_pow2[ll]; 940219089Spjd } 941219089Spjd 942219089Spjd if (i == 0) 943219089Spjd dst[cc][x] = val; 944219089Spjd else 945219089Spjd dst[cc][x] ^= val; 946219089Spjd } 947219089Spjd } 948219089Spjd } 949219089Spjd 950219089Spjd zfs_free(p, psize); 951219089Spjd} 952219089Spjd 953219089Spjdstatic int 954219089Spjdvdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 955219089Spjd{ 956219089Spjd int n, i, c, t, tt; 957219089Spjd int nmissing_rows; 958219089Spjd int missing_rows[VDEV_RAIDZ_MAXPARITY]; 959219089Spjd int parity_map[VDEV_RAIDZ_MAXPARITY]; 960219089Spjd 961219089Spjd uint8_t *p, *pp; 962219089Spjd size_t psize; 963219089Spjd 964219089Spjd uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 965219089Spjd uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 966219089Spjd uint8_t *used; 967219089Spjd 968219089Spjd int code = 0; 969219089Spjd 970219089Spjd 971219089Spjd n = rm->rm_cols - rm->rm_firstdatacol; 972219089Spjd 973192194Sdfr /* 974219089Spjd * Figure out which data columns are missing. 975192194Sdfr */ 976219089Spjd nmissing_rows = 0; 977219089Spjd for (t = 0; t < ntgts; t++) { 978219089Spjd if (tgts[t] >= rm->rm_firstdatacol) { 979219089Spjd missing_rows[nmissing_rows++] = 980219089Spjd tgts[t] - rm->rm_firstdatacol; 981219089Spjd } 982219089Spjd } 983219089Spjd 984219089Spjd /* 985219089Spjd * Figure out which parity columns to use to help generate the missing 986219089Spjd * data columns. 987219089Spjd */ 988219089Spjd for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 989219089Spjd ASSERT(tt < ntgts); 990219089Spjd ASSERT(c < rm->rm_firstdatacol); 991219089Spjd 992219089Spjd /* 993219089Spjd * Skip any targeted parity columns. 994219089Spjd */ 995219089Spjd if (c == tgts[tt]) { 996219089Spjd tt++; 997219089Spjd continue; 998219089Spjd } 999219089Spjd 1000219089Spjd code |= 1 << c; 1001219089Spjd 1002219089Spjd parity_map[i] = c; 1003219089Spjd i++; 1004219089Spjd } 1005219089Spjd 1006219089Spjd ASSERT(code != 0); 1007219089Spjd ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1008219089Spjd 1009219089Spjd psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1010219089Spjd nmissing_rows * n + sizeof (used[0]) * n; 1011219089Spjd p = kmem_alloc(psize, KM_SLEEP); 1012219089Spjd 1013219089Spjd for (pp = p, i = 0; i < nmissing_rows; i++) { 1014219089Spjd rows[i] = pp; 1015219089Spjd pp += n; 1016219089Spjd invrows[i] = pp; 1017219089Spjd pp += n; 1018219089Spjd } 1019219089Spjd used = pp; 1020219089Spjd 1021219089Spjd for (i = 0; i < nmissing_rows; i++) { 1022219089Spjd used[i] = parity_map[i]; 1023219089Spjd } 1024219089Spjd 1025219089Spjd for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1026219089Spjd if (tt < nmissing_rows && 1027219089Spjd c == missing_rows[tt] + rm->rm_firstdatacol) { 1028219089Spjd tt++; 1029219089Spjd continue; 1030219089Spjd } 1031219089Spjd 1032219089Spjd ASSERT3S(i, <, n); 1033219089Spjd used[i] = c; 1034219089Spjd i++; 1035219089Spjd } 1036219089Spjd 1037219089Spjd /* 1038219089Spjd * Initialize the interesting rows of the matrix. 1039219089Spjd */ 1040219089Spjd vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1041219089Spjd 1042219089Spjd /* 1043219089Spjd * Invert the matrix. 1044219089Spjd */ 1045219089Spjd vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1046219089Spjd invrows, used); 1047219089Spjd 1048219089Spjd /* 1049219089Spjd * Reconstruct the missing data using the generated matrix. 1050219089Spjd */ 1051219089Spjd vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1052219089Spjd invrows, used); 1053219089Spjd 1054219089Spjd kmem_free(p, psize); 1055219089Spjd 1056219089Spjd return (code); 1057192194Sdfr} 1058192194Sdfr 1059192194Sdfrstatic int 1060219089Spjdvdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1061192194Sdfr{ 1062219089Spjd int tgts[VDEV_RAIDZ_MAXPARITY]; 1063219089Spjd int ntgts; 1064219089Spjd int i, c; 1065219089Spjd int code; 1066219089Spjd int nbadparity, nbaddata; 1067219089Spjd 1068219089Spjd /* 1069219089Spjd * The tgts list must already be sorted. 1070219089Spjd */ 1071219089Spjd for (i = 1; i < nt; i++) { 1072219089Spjd ASSERT(t[i] > t[i - 1]); 1073219089Spjd } 1074219089Spjd 1075219089Spjd nbadparity = rm->rm_firstdatacol; 1076219089Spjd nbaddata = rm->rm_cols - nbadparity; 1077219089Spjd ntgts = 0; 1078219089Spjd for (i = 0, c = 0; c < rm->rm_cols; c++) { 1079219089Spjd if (i < nt && c == t[i]) { 1080219089Spjd tgts[ntgts++] = c; 1081219089Spjd i++; 1082219089Spjd } else if (rm->rm_col[c].rc_error != 0) { 1083219089Spjd tgts[ntgts++] = c; 1084219089Spjd } else if (c >= rm->rm_firstdatacol) { 1085219089Spjd nbaddata--; 1086219089Spjd } else { 1087219089Spjd nbadparity--; 1088219089Spjd } 1089219089Spjd } 1090219089Spjd 1091219089Spjd ASSERT(ntgts >= nt); 1092219089Spjd ASSERT(nbaddata >= 0); 1093219089Spjd ASSERT(nbaddata + nbadparity == ntgts); 1094219089Spjd 1095219089Spjd code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1096219089Spjd ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1097219089Spjd ASSERT(code > 0); 1098219089Spjd return (code); 1099219089Spjd} 1100219089Spjd 1101219089Spjdstatic raidz_map_t * 1102219089Spjdvdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift, 1103219089Spjd uint64_t dcols, uint64_t nparity) 1104219089Spjd{ 1105219089Spjd raidz_map_t *rm; 1106192194Sdfr uint64_t b = offset >> unit_shift; 1107219089Spjd uint64_t s = size >> unit_shift; 1108192194Sdfr uint64_t f = b % dcols; 1109192194Sdfr uint64_t o = (b / dcols) << unit_shift; 1110219089Spjd uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 1111192194Sdfr 1112192194Sdfr q = s / (dcols - nparity); 1113192194Sdfr r = s - q * (dcols - nparity); 1114192194Sdfr bc = (r == 0 ? 0 : r + nparity); 1115219089Spjd tot = s + nparity * (q + (r == 0 ? 0 : 1)); 1116192194Sdfr 1117219089Spjd if (q == 0) { 1118219089Spjd acols = bc; 1119219089Spjd scols = MIN(dcols, roundup(bc, nparity + 1)); 1120219089Spjd } else { 1121219089Spjd acols = dcols; 1122219089Spjd scols = dcols; 1123219089Spjd } 1124219089Spjd 1125219089Spjd ASSERT3U(acols, <=, scols); 1126219089Spjd 1127219089Spjd rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols])); 1128219089Spjd 1129219089Spjd rm->rm_cols = acols; 1130219089Spjd rm->rm_scols = scols; 1131219089Spjd rm->rm_bigcols = bc; 1132219089Spjd rm->rm_skipstart = bc; 1133219089Spjd rm->rm_missingdata = 0; 1134219089Spjd rm->rm_missingparity = 0; 1135219089Spjd rm->rm_firstdatacol = nparity; 1136219089Spjd rm->rm_reports = 0; 1137219089Spjd rm->rm_freed = 0; 1138219089Spjd rm->rm_ecksuminjected = 0; 1139219089Spjd 1140192194Sdfr asize = 0; 1141219089Spjd 1142219089Spjd for (c = 0; c < scols; c++) { 1143192194Sdfr col = f + c; 1144192194Sdfr coff = o; 1145192194Sdfr if (col >= dcols) { 1146192194Sdfr col -= dcols; 1147192194Sdfr coff += 1ULL << unit_shift; 1148192194Sdfr } 1149219089Spjd rm->rm_col[c].rc_devidx = col; 1150219089Spjd rm->rm_col[c].rc_offset = coff; 1151219089Spjd rm->rm_col[c].rc_data = NULL; 1152219089Spjd rm->rm_col[c].rc_error = 0; 1153219089Spjd rm->rm_col[c].rc_tried = 0; 1154219089Spjd rm->rm_col[c].rc_skipped = 0; 1155192194Sdfr 1156219089Spjd if (c >= acols) 1157219089Spjd rm->rm_col[c].rc_size = 0; 1158219089Spjd else if (c < bc) 1159219089Spjd rm->rm_col[c].rc_size = (q + 1) << unit_shift; 1160219089Spjd else 1161219089Spjd rm->rm_col[c].rc_size = q << unit_shift; 1162192194Sdfr 1163219089Spjd asize += rm->rm_col[c].rc_size; 1164192194Sdfr } 1165192194Sdfr 1166219089Spjd ASSERT3U(asize, ==, tot << unit_shift); 1167219089Spjd rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 1168219089Spjd rm->rm_nskip = roundup(tot, nparity + 1) - tot; 1169219089Spjd ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 1170219089Spjd ASSERT3U(rm->rm_nskip, <=, nparity); 1171192194Sdfr 1172219089Spjd for (c = 0; c < rm->rm_firstdatacol; c++) 1173219089Spjd rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size); 1174219089Spjd 1175219089Spjd rm->rm_col[c].rc_data = data; 1176219089Spjd 1177192194Sdfr for (c = c + 1; c < acols; c++) 1178219089Spjd rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 1179219089Spjd rm->rm_col[c - 1].rc_size; 1180192194Sdfr 1181192194Sdfr /* 1182219089Spjd * If all data stored spans all columns, there's a danger that parity 1183219089Spjd * will always be on the same device and, since parity isn't read 1184219089Spjd * during normal operation, that that device's I/O bandwidth won't be 1185219089Spjd * used effectively. We therefore switch the parity every 1MB. 1186192194Sdfr * 1187219089Spjd * ... at least that was, ostensibly, the theory. As a practical 1188219089Spjd * matter unless we juggle the parity between all devices evenly, we 1189219089Spjd * won't see any benefit. Further, occasional writes that aren't a 1190219089Spjd * multiple of the LCM of the number of children and the minimum 1191219089Spjd * stripe width are sufficient to avoid pessimal behavior. 1192219089Spjd * Unfortunately, this decision created an implicit on-disk format 1193219089Spjd * requirement that we need to support for all eternity, but only 1194219089Spjd * for single-parity RAID-Z. 1195219089Spjd * 1196219089Spjd * If we intend to skip a sector in the zeroth column for padding 1197219089Spjd * we must make sure to note this swap. We will never intend to 1198219089Spjd * skip the first column since at least one data and one parity 1199219089Spjd * column must appear in each row. 1200192194Sdfr */ 1201219089Spjd ASSERT(rm->rm_cols >= 2); 1202219089Spjd ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 1203192194Sdfr 1204219089Spjd if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 1205219089Spjd devidx = rm->rm_col[0].rc_devidx; 1206219089Spjd o = rm->rm_col[0].rc_offset; 1207219089Spjd rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 1208219089Spjd rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 1209219089Spjd rm->rm_col[1].rc_devidx = devidx; 1210219089Spjd rm->rm_col[1].rc_offset = o; 1211219089Spjd 1212219089Spjd if (rm->rm_skipstart == 0) 1213219089Spjd rm->rm_skipstart = 1; 1214192194Sdfr } 1215192194Sdfr 1216219089Spjd return (rm); 1217219089Spjd} 1218219089Spjd 1219219089Spjdstatic void 1220219089Spjdvdev_raidz_map_free(raidz_map_t *rm) 1221219089Spjd{ 1222219089Spjd int c; 1223219089Spjd 1224219089Spjd for (c = rm->rm_firstdatacol - 1; c >= 0; c--) 1225219089Spjd zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 1226219089Spjd 1227219089Spjd zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 1228219089Spjd} 1229219089Spjd 1230219089Spjdstatic vdev_t * 1231219089Spjdvdev_child(vdev_t *pvd, uint64_t devidx) 1232219089Spjd{ 1233219089Spjd vdev_t *cvd; 1234219089Spjd 1235219089Spjd STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) { 1236219089Spjd if (cvd->v_id == devidx) 1237219089Spjd break; 1238219089Spjd } 1239219089Spjd 1240219089Spjd return (cvd); 1241219089Spjd} 1242219089Spjd 1243219089Spjd/* 1244219089Spjd * We keep track of whether or not there were any injected errors, so that 1245219089Spjd * any ereports we generate can note it. 1246219089Spjd */ 1247219089Spjdstatic int 1248226553Spjdraidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size) 1249219089Spjd{ 1250219089Spjd 1251226568Spjd return (zio_checksum_verify(bp, data)); 1252219089Spjd} 1253219089Spjd 1254219089Spjd/* 1255219089Spjd * Generate the parity from the data columns. If we tried and were able to 1256219089Spjd * read the parity without error, verify that the generated parity matches the 1257219089Spjd * data we read. If it doesn't, we fire off a checksum error. Return the 1258219089Spjd * number such failures. 1259219089Spjd */ 1260219089Spjdstatic int 1261219089Spjdraidz_parity_verify(raidz_map_t *rm) 1262219089Spjd{ 1263219089Spjd void *orig[VDEV_RAIDZ_MAXPARITY]; 1264219089Spjd int c, ret = 0; 1265219089Spjd raidz_col_t *rc; 1266219089Spjd 1267219089Spjd for (c = 0; c < rm->rm_firstdatacol; c++) { 1268219089Spjd rc = &rm->rm_col[c]; 1269219089Spjd if (!rc->rc_tried || rc->rc_error != 0) 1270219089Spjd continue; 1271219089Spjd orig[c] = zfs_alloc(rc->rc_size); 1272219089Spjd bcopy(rc->rc_data, orig[c], rc->rc_size); 1273219089Spjd } 1274219089Spjd 1275219089Spjd vdev_raidz_generate_parity(rm); 1276219089Spjd 1277219089Spjd for (c = rm->rm_firstdatacol - 1; c >= 0; c--) { 1278219089Spjd rc = &rm->rm_col[c]; 1279219089Spjd if (!rc->rc_tried || rc->rc_error != 0) 1280219089Spjd continue; 1281219089Spjd if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1282219089Spjd rc->rc_error = ECKSUM; 1283219089Spjd ret++; 1284219089Spjd } 1285219089Spjd zfs_free(orig[c], rc->rc_size); 1286219089Spjd } 1287219089Spjd 1288219089Spjd return (ret); 1289219089Spjd} 1290219089Spjd 1291219089Spjd/* 1292219089Spjd * Iterate over all combinations of bad data and attempt a reconstruction. 1293219089Spjd * Note that the algorithm below is non-optimal because it doesn't take into 1294219089Spjd * account how reconstruction is actually performed. For example, with 1295219089Spjd * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1296219089Spjd * is targeted as invalid as if columns 1 and 4 are targeted since in both 1297219089Spjd * cases we'd only use parity information in column 0. 1298219089Spjd */ 1299219089Spjdstatic int 1300219089Spjdvdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data, 1301226553Spjd off_t offset, uint64_t bytes, int total_errors, int data_errors) 1302219089Spjd{ 1303219089Spjd raidz_col_t *rc; 1304219089Spjd void *orig[VDEV_RAIDZ_MAXPARITY]; 1305219089Spjd int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1306219089Spjd int *tgts = &tstore[1]; 1307219089Spjd int current, next, i, c, n; 1308219089Spjd int code, ret = 0; 1309219089Spjd 1310219089Spjd ASSERT(total_errors < rm->rm_firstdatacol); 1311219089Spjd 1312192194Sdfr /* 1313219089Spjd * This simplifies one edge condition. 1314192194Sdfr */ 1315219089Spjd tgts[-1] = -1; 1316219089Spjd 1317219089Spjd for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1318219089Spjd /* 1319219089Spjd * Initialize the targets array by finding the first n columns 1320219089Spjd * that contain no error. 1321219089Spjd * 1322219089Spjd * If there were no data errors, we need to ensure that we're 1323219089Spjd * always explicitly attempting to reconstruct at least one 1324219089Spjd * data column. To do this, we simply push the highest target 1325219089Spjd * up into the data columns. 1326219089Spjd */ 1327219089Spjd for (c = 0, i = 0; i < n; i++) { 1328219089Spjd if (i == n - 1 && data_errors == 0 && 1329219089Spjd c < rm->rm_firstdatacol) { 1330219089Spjd c = rm->rm_firstdatacol; 1331219089Spjd } 1332219089Spjd 1333219089Spjd while (rm->rm_col[c].rc_error != 0) { 1334219089Spjd c++; 1335219089Spjd ASSERT3S(c, <, rm->rm_cols); 1336219089Spjd } 1337219089Spjd 1338219089Spjd tgts[i] = c++; 1339219089Spjd } 1340219089Spjd 1341219089Spjd /* 1342219089Spjd * Setting tgts[n] simplifies the other edge condition. 1343219089Spjd */ 1344219089Spjd tgts[n] = rm->rm_cols; 1345219089Spjd 1346219089Spjd /* 1347219089Spjd * These buffers were allocated in previous iterations. 1348219089Spjd */ 1349219089Spjd for (i = 0; i < n - 1; i++) { 1350219089Spjd ASSERT(orig[i] != NULL); 1351219089Spjd } 1352219089Spjd 1353219089Spjd orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size); 1354219089Spjd 1355219089Spjd current = 0; 1356219089Spjd next = tgts[current]; 1357219089Spjd 1358219089Spjd while (current != n) { 1359219089Spjd tgts[current] = next; 1360219089Spjd current = 0; 1361219089Spjd 1362219089Spjd /* 1363219089Spjd * Save off the original data that we're going to 1364219089Spjd * attempt to reconstruct. 1365219089Spjd */ 1366219089Spjd for (i = 0; i < n; i++) { 1367219089Spjd ASSERT(orig[i] != NULL); 1368219089Spjd c = tgts[i]; 1369219089Spjd ASSERT3S(c, >=, 0); 1370219089Spjd ASSERT3S(c, <, rm->rm_cols); 1371219089Spjd rc = &rm->rm_col[c]; 1372219089Spjd bcopy(rc->rc_data, orig[i], rc->rc_size); 1373219089Spjd } 1374219089Spjd 1375219089Spjd /* 1376219089Spjd * Attempt a reconstruction and exit the outer loop on 1377219089Spjd * success. 1378219089Spjd */ 1379219089Spjd code = vdev_raidz_reconstruct(rm, tgts, n); 1380226553Spjd if (raidz_checksum_verify(bp, data, bytes) == 0) { 1381219089Spjd for (i = 0; i < n; i++) { 1382219089Spjd c = tgts[i]; 1383219089Spjd rc = &rm->rm_col[c]; 1384219089Spjd ASSERT(rc->rc_error == 0); 1385219089Spjd rc->rc_error = ECKSUM; 1386219089Spjd } 1387219089Spjd 1388219089Spjd ret = code; 1389219089Spjd goto done; 1390219089Spjd } 1391219089Spjd 1392219089Spjd /* 1393219089Spjd * Restore the original data. 1394219089Spjd */ 1395219089Spjd for (i = 0; i < n; i++) { 1396219089Spjd c = tgts[i]; 1397219089Spjd rc = &rm->rm_col[c]; 1398219089Spjd bcopy(orig[i], rc->rc_data, rc->rc_size); 1399219089Spjd } 1400219089Spjd 1401219089Spjd do { 1402219089Spjd /* 1403219089Spjd * Find the next valid column after the current 1404219089Spjd * position.. 1405219089Spjd */ 1406219089Spjd for (next = tgts[current] + 1; 1407219089Spjd next < rm->rm_cols && 1408219089Spjd rm->rm_col[next].rc_error != 0; next++) 1409219089Spjd continue; 1410219089Spjd 1411219089Spjd ASSERT(next <= tgts[current + 1]); 1412219089Spjd 1413219089Spjd /* 1414219089Spjd * If that spot is available, we're done here. 1415219089Spjd */ 1416219089Spjd if (next != tgts[current + 1]) 1417219089Spjd break; 1418219089Spjd 1419219089Spjd /* 1420219089Spjd * Otherwise, find the next valid column after 1421219089Spjd * the previous position. 1422219089Spjd */ 1423219089Spjd for (c = tgts[current - 1] + 1; 1424219089Spjd rm->rm_col[c].rc_error != 0; c++) 1425219089Spjd continue; 1426219089Spjd 1427219089Spjd tgts[current] = c; 1428219089Spjd current++; 1429219089Spjd 1430219089Spjd } while (current != n); 1431219089Spjd } 1432219089Spjd } 1433219089Spjd n--; 1434219089Spjddone: 1435219089Spjd for (i = n - 1; i >= 0; i--) { 1436219089Spjd zfs_free(orig[i], rm->rm_col[0].rc_size); 1437219089Spjd } 1438219089Spjd 1439219089Spjd return (ret); 1440219089Spjd} 1441219089Spjd 1442219089Spjdstatic int 1443219089Spjdvdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data, 1444219089Spjd off_t offset, size_t bytes) 1445219089Spjd{ 1446219089Spjd vdev_t *tvd = vd->v_top; 1447219089Spjd vdev_t *cvd; 1448219089Spjd raidz_map_t *rm; 1449219089Spjd raidz_col_t *rc; 1450219089Spjd int c, error; 1451219089Spjd int unexpected_errors; 1452219089Spjd int parity_errors; 1453219089Spjd int parity_untried; 1454219089Spjd int data_errors; 1455219089Spjd int total_errors; 1456219089Spjd int n; 1457219089Spjd int tgts[VDEV_RAIDZ_MAXPARITY]; 1458219089Spjd int code; 1459219089Spjd 1460219089Spjd rc = NULL; /* gcc */ 1461219089Spjd error = 0; 1462219089Spjd 1463219089Spjd rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift, 1464219089Spjd vd->v_nchildren, vd->v_nparity); 1465219089Spjd 1466219089Spjd /* 1467219089Spjd * Iterate over the columns in reverse order so that we hit the parity 1468219089Spjd * last -- any errors along the way will force us to read the parity. 1469219089Spjd */ 1470219089Spjd for (c = rm->rm_cols - 1; c >= 0; c--) { 1471219089Spjd rc = &rm->rm_col[c]; 1472219089Spjd cvd = vdev_child(vd, rc->rc_devidx); 1473219089Spjd if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) { 1474219089Spjd if (c >= rm->rm_firstdatacol) 1475219089Spjd rm->rm_missingdata++; 1476192194Sdfr else 1477219089Spjd rm->rm_missingparity++; 1478192194Sdfr rc->rc_error = ENXIO; 1479192194Sdfr rc->rc_tried = 1; /* don't even try */ 1480192194Sdfr rc->rc_skipped = 1; 1481192194Sdfr continue; 1482192194Sdfr } 1483219089Spjd#if 0 /* XXX: Too hard for the boot code. */ 1484219089Spjd if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1485219089Spjd if (c >= rm->rm_firstdatacol) 1486192194Sdfr rm->rm_missingdata++; 1487192194Sdfr else 1488192194Sdfr rm->rm_missingparity++; 1489192194Sdfr rc->rc_error = ESTALE; 1490192194Sdfr rc->rc_skipped = 1; 1491192194Sdfr continue; 1492192194Sdfr } 1493192194Sdfr#endif 1494219089Spjd if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) { 1495219089Spjd rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data, 1496219089Spjd rc->rc_offset, rc->rc_size); 1497192194Sdfr rc->rc_tried = 1; 1498192194Sdfr rc->rc_skipped = 0; 1499192194Sdfr } 1500192194Sdfr } 1501192194Sdfr 1502192194Sdfrreconstruct: 1503219089Spjd unexpected_errors = 0; 1504192194Sdfr parity_errors = 0; 1505219089Spjd parity_untried = 0; 1506192194Sdfr data_errors = 0; 1507192194Sdfr total_errors = 0; 1508192194Sdfr 1509219089Spjd ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 1510219089Spjd ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 1511219089Spjd 1512219089Spjd for (c = 0; c < rm->rm_cols; c++) { 1513219089Spjd rc = &rm->rm_col[c]; 1514219089Spjd 1515192194Sdfr if (rc->rc_error) { 1516219089Spjd ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 1517219089Spjd 1518219089Spjd if (c < rm->rm_firstdatacol) 1519192194Sdfr parity_errors++; 1520192194Sdfr else 1521192194Sdfr data_errors++; 1522192194Sdfr 1523192194Sdfr if (!rc->rc_skipped) 1524192194Sdfr unexpected_errors++; 1525192194Sdfr 1526192194Sdfr total_errors++; 1527219089Spjd } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 1528192194Sdfr parity_untried++; 1529192194Sdfr } 1530192194Sdfr } 1531192194Sdfr 1532192194Sdfr /* 1533192194Sdfr * There are three potential phases for a read: 1534192194Sdfr * 1. produce valid data from the columns read 1535192194Sdfr * 2. read all disks and try again 1536192194Sdfr * 3. perform combinatorial reconstruction 1537192194Sdfr * 1538219089Spjd * Each phase is progressively both more expensive and less likely to 1539219089Spjd * occur. If we encounter more errors than we can repair or all phases 1540219089Spjd * fail, we have no choice but to return an error. 1541192194Sdfr */ 1542192194Sdfr 1543192194Sdfr /* 1544219089Spjd * If the number of errors we saw was correctable -- less than or equal 1545219089Spjd * to the number of parity disks read -- attempt to produce data that 1546219089Spjd * has a valid checksum. Naturally, this case applies in the absence of 1547219089Spjd * any errors. 1548192194Sdfr */ 1549219089Spjd if (total_errors <= rm->rm_firstdatacol - parity_untried) { 1550219089Spjd if (data_errors == 0) { 1551226553Spjd if (raidz_checksum_verify(bp, data, bytes) == 0) { 1552219089Spjd /* 1553219089Spjd * If we read parity information (unnecessarily 1554219089Spjd * as it happens since no reconstruction was 1555219089Spjd * needed) regenerate and verify the parity. 1556219089Spjd * We also regenerate parity when resilvering 1557219089Spjd * so we can write it out to the failed device 1558219089Spjd * later. 1559219089Spjd */ 1560219089Spjd if (parity_errors + parity_untried < 1561219089Spjd rm->rm_firstdatacol) { 1562219089Spjd n = raidz_parity_verify(rm); 1563219089Spjd unexpected_errors += n; 1564219089Spjd ASSERT(parity_errors + n <= 1565219089Spjd rm->rm_firstdatacol); 1566219089Spjd } 1567219089Spjd goto done; 1568219089Spjd } 1569219089Spjd } else { 1570192194Sdfr /* 1571192194Sdfr * We either attempt to read all the parity columns or 1572192194Sdfr * none of them. If we didn't try to read parity, we 1573192194Sdfr * wouldn't be here in the correctable case. There must 1574192194Sdfr * also have been fewer parity errors than parity 1575192194Sdfr * columns or, again, we wouldn't be in this code path. 1576192194Sdfr */ 1577219089Spjd ASSERT(parity_untried == 0); 1578219089Spjd ASSERT(parity_errors < rm->rm_firstdatacol); 1579192194Sdfr 1580192194Sdfr /* 1581219089Spjd * Identify the data columns that reported an error. 1582192194Sdfr */ 1583219089Spjd n = 0; 1584219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1585219089Spjd rc = &rm->rm_col[c]; 1586219089Spjd if (rc->rc_error != 0) { 1587219089Spjd ASSERT(n < VDEV_RAIDZ_MAXPARITY); 1588219089Spjd tgts[n++] = c; 1589219089Spjd } 1590192194Sdfr } 1591192194Sdfr 1592219089Spjd ASSERT(rm->rm_firstdatacol >= n); 1593192194Sdfr 1594219089Spjd code = vdev_raidz_reconstruct(rm, tgts, n); 1595192194Sdfr 1596226553Spjd if (raidz_checksum_verify(bp, data, bytes) == 0) { 1597219089Spjd /* 1598219089Spjd * If we read more parity disks than were used 1599219089Spjd * for reconstruction, confirm that the other 1600219089Spjd * parity disks produced correct data. This 1601219089Spjd * routine is suboptimal in that it regenerates 1602219089Spjd * the parity that we already used in addition 1603219089Spjd * to the parity that we're attempting to 1604219089Spjd * verify, but this should be a relatively 1605219089Spjd * uncommon case, and can be optimized if it 1606219089Spjd * becomes a problem. Note that we regenerate 1607219089Spjd * parity when resilvering so we can write it 1608219089Spjd * out to failed devices later. 1609219089Spjd */ 1610219089Spjd if (parity_errors < rm->rm_firstdatacol - n) { 1611219089Spjd n = raidz_parity_verify(rm); 1612219089Spjd unexpected_errors += n; 1613219089Spjd ASSERT(parity_errors + n <= 1614219089Spjd rm->rm_firstdatacol); 1615219089Spjd } 1616192194Sdfr 1617219089Spjd goto done; 1618192194Sdfr } 1619192194Sdfr } 1620192194Sdfr } 1621192194Sdfr 1622192194Sdfr /* 1623192194Sdfr * This isn't a typical situation -- either we got a read 1624192194Sdfr * error or a child silently returned bad data. Read every 1625192194Sdfr * block so we can try again with as much data and parity as 1626192194Sdfr * we can track down. If we've already been through once 1627192194Sdfr * before, all children will be marked as tried so we'll 1628192194Sdfr * proceed to combinatorial reconstruction. 1629192194Sdfr */ 1630219089Spjd unexpected_errors = 1; 1631219089Spjd rm->rm_missingdata = 0; 1632219089Spjd rm->rm_missingparity = 0; 1633219089Spjd 1634192194Sdfr n = 0; 1635219089Spjd for (c = 0; c < rm->rm_cols; c++) { 1636226550Spjd rc = &rm->rm_col[c]; 1637226550Spjd 1638226550Spjd if (rc->rc_tried) 1639192194Sdfr continue; 1640192194Sdfr 1641219089Spjd cvd = vdev_child(vd, rc->rc_devidx); 1642219089Spjd ASSERT(cvd != NULL); 1643219089Spjd rc->rc_error = cvd->v_read(cvd, NULL, 1644219089Spjd rc->rc_data, rc->rc_offset, rc->rc_size); 1645192194Sdfr if (rc->rc_error == 0) 1646192194Sdfr n++; 1647192194Sdfr rc->rc_tried = 1; 1648192194Sdfr rc->rc_skipped = 0; 1649192194Sdfr } 1650192194Sdfr /* 1651192194Sdfr * If we managed to read anything more, retry the 1652192194Sdfr * reconstruction. 1653192194Sdfr */ 1654219089Spjd if (n > 0) 1655192194Sdfr goto reconstruct; 1656192194Sdfr 1657192194Sdfr /* 1658192194Sdfr * At this point we've attempted to reconstruct the data given the 1659192194Sdfr * errors we detected, and we've attempted to read all columns. There 1660192194Sdfr * must, therefore, be one or more additional problems -- silent errors 1661192194Sdfr * resulting in invalid data rather than explicit I/O errors resulting 1662219089Spjd * in absent data. We check if there is enough additional data to 1663219089Spjd * possibly reconstruct the data and then perform combinatorial 1664219089Spjd * reconstruction over all possible combinations. If that fails, 1665219089Spjd * we're cooked. 1666192194Sdfr */ 1667219089Spjd if (total_errors > rm->rm_firstdatacol) { 1668219089Spjd error = EIO; 1669219089Spjd } else if (total_errors < rm->rm_firstdatacol && 1670226553Spjd (code = vdev_raidz_combrec(rm, bp, data, offset, bytes, 1671226553Spjd total_errors, data_errors)) != 0) { 1672192194Sdfr /* 1673219089Spjd * If we didn't use all the available parity for the 1674219089Spjd * combinatorial reconstruction, verify that the remaining 1675219089Spjd * parity is correct. 1676192194Sdfr */ 1677219089Spjd if (code != (1 << rm->rm_firstdatacol) - 1) 1678219089Spjd (void) raidz_parity_verify(rm); 1679219089Spjd } else { 1680192194Sdfr /* 1681219089Spjd * We're here because either: 1682219089Spjd * 1683219089Spjd * total_errors == rm_first_datacol, or 1684219089Spjd * vdev_raidz_combrec() failed 1685219089Spjd * 1686219089Spjd * In either case, there is enough bad data to prevent 1687219089Spjd * reconstruction. 1688219089Spjd * 1689219089Spjd * Start checksum ereports for all children which haven't 1690219089Spjd * failed, and the IO wasn't speculative. 1691192194Sdfr */ 1692219089Spjd error = ECKSUM; 1693192194Sdfr } 1694192194Sdfr 1695219089Spjddone: 1696219089Spjd vdev_raidz_map_free(rm); 1697192194Sdfr 1698219089Spjd return (error); 1699192194Sdfr} 1700