vdev_raidz.c revision 297112
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 25 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 26 * Copyright (c) 2014 Integros [integros.com] 27 */ 28 29#include <sys/zfs_context.h> 30#include <sys/spa.h> 31#include <sys/vdev_impl.h> 32#ifdef illumos 33#include <sys/vdev_disk.h> 34#endif 35#include <sys/vdev_file.h> 36#include <sys/vdev_raidz.h> 37#include <sys/zio.h> 38#include <sys/zio_checksum.h> 39#include <sys/fs/zfs.h> 40#include <sys/fm/fs/zfs.h> 41#include <sys/bio.h> 42 43/* 44 * Virtual device vector for RAID-Z. 45 * 46 * This vdev supports single, double, and triple parity. For single parity, 47 * we use a simple XOR of all the data columns. For double or triple parity, 48 * we use a special case of Reed-Solomon coding. This extends the 49 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 50 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 51 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 52 * former is also based. The latter is designed to provide higher performance 53 * for writes. 54 * 55 * Note that the Plank paper claimed to support arbitrary N+M, but was then 56 * amended six years later identifying a critical flaw that invalidates its 57 * claims. Nevertheless, the technique can be adapted to work for up to 58 * triple parity. For additional parity, the amendment "Note: Correction to 59 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 60 * is viable, but the additional complexity means that write performance will 61 * suffer. 62 * 63 * All of the methods above operate on a Galois field, defined over the 64 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 65 * can be expressed with a single byte. Briefly, the operations on the 66 * field are defined as follows: 67 * 68 * o addition (+) is represented by a bitwise XOR 69 * o subtraction (-) is therefore identical to addition: A + B = A - B 70 * o multiplication of A by 2 is defined by the following bitwise expression: 71 * 72 * (A * 2)_7 = A_6 73 * (A * 2)_6 = A_5 74 * (A * 2)_5 = A_4 75 * (A * 2)_4 = A_3 + A_7 76 * (A * 2)_3 = A_2 + A_7 77 * (A * 2)_2 = A_1 + A_7 78 * (A * 2)_1 = A_0 79 * (A * 2)_0 = A_7 80 * 81 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 82 * As an aside, this multiplication is derived from the error correcting 83 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 84 * 85 * Observe that any number in the field (except for 0) can be expressed as a 86 * power of 2 -- a generator for the field. We store a table of the powers of 87 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 88 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 89 * than field addition). The inverse of a field element A (A^-1) is therefore 90 * A ^ (255 - 1) = A^254. 91 * 92 * The up-to-three parity columns, P, Q, R over several data columns, 93 * D_0, ... D_n-1, can be expressed by field operations: 94 * 95 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 96 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 97 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 98 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 99 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 100 * 101 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 102 * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 103 * independent coefficients. (There are no additional coefficients that have 104 * this property which is why the uncorrected Plank method breaks down.) 105 * 106 * See the reconstruction code below for how P, Q and R can used individually 107 * or in concert to recover missing data columns. 108 */ 109 110typedef struct raidz_col { 111 uint64_t rc_devidx; /* child device index for I/O */ 112 uint64_t rc_offset; /* device offset */ 113 uint64_t rc_size; /* I/O size */ 114 void *rc_data; /* I/O data */ 115 void *rc_gdata; /* used to store the "good" version */ 116 int rc_error; /* I/O error for this device */ 117 uint8_t rc_tried; /* Did we attempt this I/O column? */ 118 uint8_t rc_skipped; /* Did we skip this I/O column? */ 119} raidz_col_t; 120 121typedef struct raidz_map { 122 uint64_t rm_cols; /* Regular column count */ 123 uint64_t rm_scols; /* Count including skipped columns */ 124 uint64_t rm_bigcols; /* Number of oversized columns */ 125 uint64_t rm_asize; /* Actual total I/O size */ 126 uint64_t rm_missingdata; /* Count of missing data devices */ 127 uint64_t rm_missingparity; /* Count of missing parity devices */ 128 uint64_t rm_firstdatacol; /* First data column/parity count */ 129 uint64_t rm_nskip; /* Skipped sectors for padding */ 130 uint64_t rm_skipstart; /* Column index of padding start */ 131 void *rm_datacopy; /* rm_asize-buffer of copied data */ 132 uintptr_t rm_reports; /* # of referencing checksum reports */ 133 uint8_t rm_freed; /* map no longer has referencing ZIO */ 134 uint8_t rm_ecksuminjected; /* checksum error was injected */ 135 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 136} raidz_map_t; 137 138#define VDEV_RAIDZ_P 0 139#define VDEV_RAIDZ_Q 1 140#define VDEV_RAIDZ_R 2 141 142#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 143#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 144 145/* 146 * We provide a mechanism to perform the field multiplication operation on a 147 * 64-bit value all at once rather than a byte at a time. This works by 148 * creating a mask from the top bit in each byte and using that to 149 * conditionally apply the XOR of 0x1d. 150 */ 151#define VDEV_RAIDZ_64MUL_2(x, mask) \ 152{ \ 153 (mask) = (x) & 0x8080808080808080ULL; \ 154 (mask) = ((mask) << 1) - ((mask) >> 7); \ 155 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 156 ((mask) & 0x1d1d1d1d1d1d1d1d); \ 157} 158 159#define VDEV_RAIDZ_64MUL_4(x, mask) \ 160{ \ 161 VDEV_RAIDZ_64MUL_2((x), mask); \ 162 VDEV_RAIDZ_64MUL_2((x), mask); \ 163} 164 165#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) 166 167/* 168 * Force reconstruction to use the general purpose method. 169 */ 170int vdev_raidz_default_to_general; 171 172/* Powers of 2 in the Galois field defined above. */ 173static const uint8_t vdev_raidz_pow2[256] = { 174 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 175 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 176 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 177 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 178 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 179 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 180 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 181 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 182 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 183 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 184 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 185 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 186 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 187 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 188 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 189 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 190 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 191 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 192 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 193 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 194 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 195 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 196 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 197 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 198 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 199 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 200 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 201 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 202 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 203 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 204 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 205 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 206}; 207/* Logs of 2 in the Galois field defined above. */ 208static const uint8_t vdev_raidz_log2[256] = { 209 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 210 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 211 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 212 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 213 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 214 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 215 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 216 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 217 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 218 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 219 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 220 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 221 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 222 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 223 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 224 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 225 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 226 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 227 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 228 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 229 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 230 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 231 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 232 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 233 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 234 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 235 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 236 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 237 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 238 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 239 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 240 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 241}; 242 243static void vdev_raidz_generate_parity(raidz_map_t *rm); 244 245/* 246 * Multiply a given number by 2 raised to the given power. 247 */ 248static uint8_t 249vdev_raidz_exp2(uint_t a, int exp) 250{ 251 if (a == 0) 252 return (0); 253 254 ASSERT(exp >= 0); 255 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 256 257 exp += vdev_raidz_log2[a]; 258 if (exp > 255) 259 exp -= 255; 260 261 return (vdev_raidz_pow2[exp]); 262} 263 264static void 265vdev_raidz_map_free(raidz_map_t *rm) 266{ 267 int c; 268 size_t size; 269 270 for (c = 0; c < rm->rm_firstdatacol; c++) { 271 if (rm->rm_col[c].rc_data != NULL) 272 zio_buf_free(rm->rm_col[c].rc_data, 273 rm->rm_col[c].rc_size); 274 275 if (rm->rm_col[c].rc_gdata != NULL) 276 zio_buf_free(rm->rm_col[c].rc_gdata, 277 rm->rm_col[c].rc_size); 278 } 279 280 size = 0; 281 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 282 size += rm->rm_col[c].rc_size; 283 284 if (rm->rm_datacopy != NULL) 285 zio_buf_free(rm->rm_datacopy, size); 286 287 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 288} 289 290static void 291vdev_raidz_map_free_vsd(zio_t *zio) 292{ 293 raidz_map_t *rm = zio->io_vsd; 294 295 ASSERT0(rm->rm_freed); 296 rm->rm_freed = 1; 297 298 if (rm->rm_reports == 0) 299 vdev_raidz_map_free(rm); 300} 301 302/*ARGSUSED*/ 303static void 304vdev_raidz_cksum_free(void *arg, size_t ignored) 305{ 306 raidz_map_t *rm = arg; 307 308 ASSERT3U(rm->rm_reports, >, 0); 309 310 if (--rm->rm_reports == 0 && rm->rm_freed != 0) 311 vdev_raidz_map_free(rm); 312} 313 314static void 315vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) 316{ 317 raidz_map_t *rm = zcr->zcr_cbdata; 318 size_t c = zcr->zcr_cbinfo; 319 size_t x; 320 321 const char *good = NULL; 322 const char *bad = rm->rm_col[c].rc_data; 323 324 if (good_data == NULL) { 325 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 326 return; 327 } 328 329 if (c < rm->rm_firstdatacol) { 330 /* 331 * The first time through, calculate the parity blocks for 332 * the good data (this relies on the fact that the good 333 * data never changes for a given logical ZIO) 334 */ 335 if (rm->rm_col[0].rc_gdata == NULL) { 336 char *bad_parity[VDEV_RAIDZ_MAXPARITY]; 337 char *buf; 338 339 /* 340 * Set up the rm_col[]s to generate the parity for 341 * good_data, first saving the parity bufs and 342 * replacing them with buffers to hold the result. 343 */ 344 for (x = 0; x < rm->rm_firstdatacol; x++) { 345 bad_parity[x] = rm->rm_col[x].rc_data; 346 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = 347 zio_buf_alloc(rm->rm_col[x].rc_size); 348 } 349 350 /* fill in the data columns from good_data */ 351 buf = (char *)good_data; 352 for (; x < rm->rm_cols; x++) { 353 rm->rm_col[x].rc_data = buf; 354 buf += rm->rm_col[x].rc_size; 355 } 356 357 /* 358 * Construct the parity from the good data. 359 */ 360 vdev_raidz_generate_parity(rm); 361 362 /* restore everything back to its original state */ 363 for (x = 0; x < rm->rm_firstdatacol; x++) 364 rm->rm_col[x].rc_data = bad_parity[x]; 365 366 buf = rm->rm_datacopy; 367 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 368 rm->rm_col[x].rc_data = buf; 369 buf += rm->rm_col[x].rc_size; 370 } 371 } 372 373 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 374 good = rm->rm_col[c].rc_gdata; 375 } else { 376 /* adjust good_data to point at the start of our column */ 377 good = good_data; 378 379 for (x = rm->rm_firstdatacol; x < c; x++) 380 good += rm->rm_col[x].rc_size; 381 } 382 383 /* we drop the ereport if it ends up that the data was good */ 384 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 385} 386 387/* 388 * Invoked indirectly by zfs_ereport_start_checksum(), called 389 * below when our read operation fails completely. The main point 390 * is to keep a copy of everything we read from disk, so that at 391 * vdev_raidz_cksum_finish() time we can compare it with the good data. 392 */ 393static void 394vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 395{ 396 size_t c = (size_t)(uintptr_t)arg; 397 caddr_t buf; 398 399 raidz_map_t *rm = zio->io_vsd; 400 size_t size; 401 402 /* set up the report and bump the refcount */ 403 zcr->zcr_cbdata = rm; 404 zcr->zcr_cbinfo = c; 405 zcr->zcr_finish = vdev_raidz_cksum_finish; 406 zcr->zcr_free = vdev_raidz_cksum_free; 407 408 rm->rm_reports++; 409 ASSERT3U(rm->rm_reports, >, 0); 410 411 if (rm->rm_datacopy != NULL) 412 return; 413 414 /* 415 * It's the first time we're called for this raidz_map_t, so we need 416 * to copy the data aside; there's no guarantee that our zio's buffer 417 * won't be re-used for something else. 418 * 419 * Our parity data is already in separate buffers, so there's no need 420 * to copy them. 421 */ 422 423 size = 0; 424 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 425 size += rm->rm_col[c].rc_size; 426 427 buf = rm->rm_datacopy = zio_buf_alloc(size); 428 429 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 430 raidz_col_t *col = &rm->rm_col[c]; 431 432 bcopy(col->rc_data, buf, col->rc_size); 433 col->rc_data = buf; 434 435 buf += col->rc_size; 436 } 437 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); 438} 439 440static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 441 vdev_raidz_map_free_vsd, 442 vdev_raidz_cksum_report 443}; 444 445/* 446 * Divides the IO evenly across all child vdevs; usually, dcols is 447 * the number of children in the target vdev. 448 */ 449static raidz_map_t * 450vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree, 451 uint64_t unit_shift, uint64_t dcols, uint64_t nparity) 452{ 453 raidz_map_t *rm; 454 /* The starting RAIDZ (parent) vdev sector of the block. */ 455 uint64_t b = offset >> unit_shift; 456 /* The zio's size in units of the vdev's minimum sector size. */ 457 uint64_t s = size >> unit_shift; 458 /* The first column for this stripe. */ 459 uint64_t f = b % dcols; 460 /* The starting byte offset on each child vdev. */ 461 uint64_t o = (b / dcols) << unit_shift; 462 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 463 464 /* 465 * "Quotient": The number of data sectors for this stripe on all but 466 * the "big column" child vdevs that also contain "remainder" data. 467 */ 468 q = s / (dcols - nparity); 469 470 /* 471 * "Remainder": The number of partial stripe data sectors in this I/O. 472 * This will add a sector to some, but not all, child vdevs. 473 */ 474 r = s - q * (dcols - nparity); 475 476 /* The number of "big columns" - those which contain remainder data. */ 477 bc = (r == 0 ? 0 : r + nparity); 478 479 /* 480 * The total number of data and parity sectors associated with 481 * this I/O. 482 */ 483 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 484 485 /* acols: The columns that will be accessed. */ 486 /* scols: The columns that will be accessed or skipped. */ 487 if (q == 0) { 488 /* Our I/O request doesn't span all child vdevs. */ 489 acols = bc; 490 scols = MIN(dcols, roundup(bc, nparity + 1)); 491 } else { 492 acols = dcols; 493 scols = dcols; 494 } 495 496 ASSERT3U(acols, <=, scols); 497 498 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 499 500 rm->rm_cols = acols; 501 rm->rm_scols = scols; 502 rm->rm_bigcols = bc; 503 rm->rm_skipstart = bc; 504 rm->rm_missingdata = 0; 505 rm->rm_missingparity = 0; 506 rm->rm_firstdatacol = nparity; 507 rm->rm_datacopy = NULL; 508 rm->rm_reports = 0; 509 rm->rm_freed = 0; 510 rm->rm_ecksuminjected = 0; 511 512 asize = 0; 513 514 for (c = 0; c < scols; c++) { 515 col = f + c; 516 coff = o; 517 if (col >= dcols) { 518 col -= dcols; 519 coff += 1ULL << unit_shift; 520 } 521 rm->rm_col[c].rc_devidx = col; 522 rm->rm_col[c].rc_offset = coff; 523 rm->rm_col[c].rc_data = NULL; 524 rm->rm_col[c].rc_gdata = NULL; 525 rm->rm_col[c].rc_error = 0; 526 rm->rm_col[c].rc_tried = 0; 527 rm->rm_col[c].rc_skipped = 0; 528 529 if (c >= acols) 530 rm->rm_col[c].rc_size = 0; 531 else if (c < bc) 532 rm->rm_col[c].rc_size = (q + 1) << unit_shift; 533 else 534 rm->rm_col[c].rc_size = q << unit_shift; 535 536 asize += rm->rm_col[c].rc_size; 537 } 538 539 ASSERT3U(asize, ==, tot << unit_shift); 540 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 541 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 542 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 543 ASSERT3U(rm->rm_nskip, <=, nparity); 544 545 if (!dofree) { 546 for (c = 0; c < rm->rm_firstdatacol; c++) { 547 rm->rm_col[c].rc_data = 548 zio_buf_alloc(rm->rm_col[c].rc_size); 549 } 550 551 rm->rm_col[c].rc_data = data; 552 553 for (c = c + 1; c < acols; c++) { 554 rm->rm_col[c].rc_data = 555 (char *)rm->rm_col[c - 1].rc_data + 556 rm->rm_col[c - 1].rc_size; 557 } 558 } 559 560 /* 561 * If all data stored spans all columns, there's a danger that parity 562 * will always be on the same device and, since parity isn't read 563 * during normal operation, that that device's I/O bandwidth won't be 564 * used effectively. We therefore switch the parity every 1MB. 565 * 566 * ... at least that was, ostensibly, the theory. As a practical 567 * matter unless we juggle the parity between all devices evenly, we 568 * won't see any benefit. Further, occasional writes that aren't a 569 * multiple of the LCM of the number of children and the minimum 570 * stripe width are sufficient to avoid pessimal behavior. 571 * Unfortunately, this decision created an implicit on-disk format 572 * requirement that we need to support for all eternity, but only 573 * for single-parity RAID-Z. 574 * 575 * If we intend to skip a sector in the zeroth column for padding 576 * we must make sure to note this swap. We will never intend to 577 * skip the first column since at least one data and one parity 578 * column must appear in each row. 579 */ 580 ASSERT(rm->rm_cols >= 2); 581 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 582 583 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 584 devidx = rm->rm_col[0].rc_devidx; 585 o = rm->rm_col[0].rc_offset; 586 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 587 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 588 rm->rm_col[1].rc_devidx = devidx; 589 rm->rm_col[1].rc_offset = o; 590 591 if (rm->rm_skipstart == 0) 592 rm->rm_skipstart = 1; 593 } 594 595 return (rm); 596} 597 598static void 599vdev_raidz_generate_parity_p(raidz_map_t *rm) 600{ 601 uint64_t *p, *src, pcount, ccount, i; 602 int c; 603 604 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 605 606 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 607 src = rm->rm_col[c].rc_data; 608 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 609 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 610 611 if (c == rm->rm_firstdatacol) { 612 ASSERT(ccount == pcount); 613 for (i = 0; i < ccount; i++, src++, p++) { 614 *p = *src; 615 } 616 } else { 617 ASSERT(ccount <= pcount); 618 for (i = 0; i < ccount; i++, src++, p++) { 619 *p ^= *src; 620 } 621 } 622 } 623} 624 625static void 626vdev_raidz_generate_parity_pq(raidz_map_t *rm) 627{ 628 uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 629 int c; 630 631 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 632 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 633 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 634 635 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 636 src = rm->rm_col[c].rc_data; 637 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 638 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 639 640 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 641 642 if (c == rm->rm_firstdatacol) { 643 ASSERT(ccnt == pcnt || ccnt == 0); 644 for (i = 0; i < ccnt; i++, src++, p++, q++) { 645 *p = *src; 646 *q = *src; 647 } 648 for (; i < pcnt; i++, src++, p++, q++) { 649 *p = 0; 650 *q = 0; 651 } 652 } else { 653 ASSERT(ccnt <= pcnt); 654 655 /* 656 * Apply the algorithm described above by multiplying 657 * the previous result and adding in the new value. 658 */ 659 for (i = 0; i < ccnt; i++, src++, p++, q++) { 660 *p ^= *src; 661 662 VDEV_RAIDZ_64MUL_2(*q, mask); 663 *q ^= *src; 664 } 665 666 /* 667 * Treat short columns as though they are full of 0s. 668 * Note that there's therefore nothing needed for P. 669 */ 670 for (; i < pcnt; i++, q++) { 671 VDEV_RAIDZ_64MUL_2(*q, mask); 672 } 673 } 674 } 675} 676 677static void 678vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 679{ 680 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 681 int c; 682 683 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 684 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 685 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 686 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 687 rm->rm_col[VDEV_RAIDZ_R].rc_size); 688 689 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 690 src = rm->rm_col[c].rc_data; 691 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 692 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 693 r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 694 695 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 696 697 if (c == rm->rm_firstdatacol) { 698 ASSERT(ccnt == pcnt || ccnt == 0); 699 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 700 *p = *src; 701 *q = *src; 702 *r = *src; 703 } 704 for (; i < pcnt; i++, src++, p++, q++, r++) { 705 *p = 0; 706 *q = 0; 707 *r = 0; 708 } 709 } else { 710 ASSERT(ccnt <= pcnt); 711 712 /* 713 * Apply the algorithm described above by multiplying 714 * the previous result and adding in the new value. 715 */ 716 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 717 *p ^= *src; 718 719 VDEV_RAIDZ_64MUL_2(*q, mask); 720 *q ^= *src; 721 722 VDEV_RAIDZ_64MUL_4(*r, mask); 723 *r ^= *src; 724 } 725 726 /* 727 * Treat short columns as though they are full of 0s. 728 * Note that there's therefore nothing needed for P. 729 */ 730 for (; i < pcnt; i++, q++, r++) { 731 VDEV_RAIDZ_64MUL_2(*q, mask); 732 VDEV_RAIDZ_64MUL_4(*r, mask); 733 } 734 } 735 } 736} 737 738/* 739 * Generate RAID parity in the first virtual columns according to the number of 740 * parity columns available. 741 */ 742static void 743vdev_raidz_generate_parity(raidz_map_t *rm) 744{ 745 switch (rm->rm_firstdatacol) { 746 case 1: 747 vdev_raidz_generate_parity_p(rm); 748 break; 749 case 2: 750 vdev_raidz_generate_parity_pq(rm); 751 break; 752 case 3: 753 vdev_raidz_generate_parity_pqr(rm); 754 break; 755 default: 756 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 757 } 758} 759 760static int 761vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 762{ 763 uint64_t *dst, *src, xcount, ccount, count, i; 764 int x = tgts[0]; 765 int c; 766 767 ASSERT(ntgts == 1); 768 ASSERT(x >= rm->rm_firstdatacol); 769 ASSERT(x < rm->rm_cols); 770 771 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 772 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 773 ASSERT(xcount > 0); 774 775 src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 776 dst = rm->rm_col[x].rc_data; 777 for (i = 0; i < xcount; i++, dst++, src++) { 778 *dst = *src; 779 } 780 781 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 782 src = rm->rm_col[c].rc_data; 783 dst = rm->rm_col[x].rc_data; 784 785 if (c == x) 786 continue; 787 788 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 789 count = MIN(ccount, xcount); 790 791 for (i = 0; i < count; i++, dst++, src++) { 792 *dst ^= *src; 793 } 794 } 795 796 return (1 << VDEV_RAIDZ_P); 797} 798 799static int 800vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 801{ 802 uint64_t *dst, *src, xcount, ccount, count, mask, i; 803 uint8_t *b; 804 int x = tgts[0]; 805 int c, j, exp; 806 807 ASSERT(ntgts == 1); 808 809 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 810 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 811 812 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 813 src = rm->rm_col[c].rc_data; 814 dst = rm->rm_col[x].rc_data; 815 816 if (c == x) 817 ccount = 0; 818 else 819 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 820 821 count = MIN(ccount, xcount); 822 823 if (c == rm->rm_firstdatacol) { 824 for (i = 0; i < count; i++, dst++, src++) { 825 *dst = *src; 826 } 827 for (; i < xcount; i++, dst++) { 828 *dst = 0; 829 } 830 831 } else { 832 for (i = 0; i < count; i++, dst++, src++) { 833 VDEV_RAIDZ_64MUL_2(*dst, mask); 834 *dst ^= *src; 835 } 836 837 for (; i < xcount; i++, dst++) { 838 VDEV_RAIDZ_64MUL_2(*dst, mask); 839 } 840 } 841 } 842 843 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 844 dst = rm->rm_col[x].rc_data; 845 exp = 255 - (rm->rm_cols - 1 - x); 846 847 for (i = 0; i < xcount; i++, dst++, src++) { 848 *dst ^= *src; 849 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 850 *b = vdev_raidz_exp2(*b, exp); 851 } 852 } 853 854 return (1 << VDEV_RAIDZ_Q); 855} 856 857static int 858vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 859{ 860 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 861 void *pdata, *qdata; 862 uint64_t xsize, ysize, i; 863 int x = tgts[0]; 864 int y = tgts[1]; 865 866 ASSERT(ntgts == 2); 867 ASSERT(x < y); 868 ASSERT(x >= rm->rm_firstdatacol); 869 ASSERT(y < rm->rm_cols); 870 871 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 872 873 /* 874 * Move the parity data aside -- we're going to compute parity as 875 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 876 * reuse the parity generation mechanism without trashing the actual 877 * parity so we make those columns appear to be full of zeros by 878 * setting their lengths to zero. 879 */ 880 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 881 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 882 xsize = rm->rm_col[x].rc_size; 883 ysize = rm->rm_col[y].rc_size; 884 885 rm->rm_col[VDEV_RAIDZ_P].rc_data = 886 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 887 rm->rm_col[VDEV_RAIDZ_Q].rc_data = 888 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 889 rm->rm_col[x].rc_size = 0; 890 rm->rm_col[y].rc_size = 0; 891 892 vdev_raidz_generate_parity_pq(rm); 893 894 rm->rm_col[x].rc_size = xsize; 895 rm->rm_col[y].rc_size = ysize; 896 897 p = pdata; 898 q = qdata; 899 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 900 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 901 xd = rm->rm_col[x].rc_data; 902 yd = rm->rm_col[y].rc_data; 903 904 /* 905 * We now have: 906 * Pxy = P + D_x + D_y 907 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 908 * 909 * We can then solve for D_x: 910 * D_x = A * (P + Pxy) + B * (Q + Qxy) 911 * where 912 * A = 2^(x - y) * (2^(x - y) + 1)^-1 913 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 914 * 915 * With D_x in hand, we can easily solve for D_y: 916 * D_y = P + Pxy + D_x 917 */ 918 919 a = vdev_raidz_pow2[255 + x - y]; 920 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 921 tmp = 255 - vdev_raidz_log2[a ^ 1]; 922 923 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 924 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 925 926 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 927 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 928 vdev_raidz_exp2(*q ^ *qxy, bexp); 929 930 if (i < ysize) 931 *yd = *p ^ *pxy ^ *xd; 932 } 933 934 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 935 rm->rm_col[VDEV_RAIDZ_P].rc_size); 936 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 937 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 938 939 /* 940 * Restore the saved parity data. 941 */ 942 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 943 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 944 945 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 946} 947 948/* BEGIN CSTYLED */ 949/* 950 * In the general case of reconstruction, we must solve the system of linear 951 * equations defined by the coeffecients used to generate parity as well as 952 * the contents of the data and parity disks. This can be expressed with 953 * vectors for the original data (D) and the actual data (d) and parity (p) 954 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 955 * 956 * __ __ __ __ 957 * | | __ __ | p_0 | 958 * | V | | D_0 | | p_m-1 | 959 * | | x | : | = | d_0 | 960 * | I | | D_n-1 | | : | 961 * | | ~~ ~~ | d_n-1 | 962 * ~~ ~~ ~~ ~~ 963 * 964 * I is simply a square identity matrix of size n, and V is a vandermonde 965 * matrix defined by the coeffecients we chose for the various parity columns 966 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 967 * computation as well as linear separability. 968 * 969 * __ __ __ __ 970 * | 1 .. 1 1 1 | | p_0 | 971 * | 2^n-1 .. 4 2 1 | __ __ | : | 972 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 973 * | 1 .. 0 0 0 | | D_1 | | d_0 | 974 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 975 * | : : : : | | : | | d_2 | 976 * | 0 .. 1 0 0 | | D_n-1 | | : | 977 * | 0 .. 0 1 0 | ~~ ~~ | : | 978 * | 0 .. 0 0 1 | | d_n-1 | 979 * ~~ ~~ ~~ ~~ 980 * 981 * Note that I, V, d, and p are known. To compute D, we must invert the 982 * matrix and use the known data and parity values to reconstruct the unknown 983 * data values. We begin by removing the rows in V|I and d|p that correspond 984 * to failed or missing columns; we then make V|I square (n x n) and d|p 985 * sized n by removing rows corresponding to unused parity from the bottom up 986 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 987 * using Gauss-Jordan elimination. In the example below we use m=3 parity 988 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 989 * __ __ 990 * | 1 1 1 1 1 1 1 1 | 991 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 992 * | 19 205 116 29 64 16 4 1 | / / 993 * | 1 0 0 0 0 0 0 0 | / / 994 * | 0 1 0 0 0 0 0 0 | <--' / 995 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 996 * | 0 0 0 1 0 0 0 0 | 997 * | 0 0 0 0 1 0 0 0 | 998 * | 0 0 0 0 0 1 0 0 | 999 * | 0 0 0 0 0 0 1 0 | 1000 * | 0 0 0 0 0 0 0 1 | 1001 * ~~ ~~ 1002 * __ __ 1003 * | 1 1 1 1 1 1 1 1 | 1004 * | 19 205 116 29 64 16 4 1 | 1005 * | 1 0 0 0 0 0 0 0 | 1006 * (V|I)' = | 0 0 0 1 0 0 0 0 | 1007 * | 0 0 0 0 1 0 0 0 | 1008 * | 0 0 0 0 0 1 0 0 | 1009 * | 0 0 0 0 0 0 1 0 | 1010 * | 0 0 0 0 0 0 0 1 | 1011 * ~~ ~~ 1012 * 1013 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1014 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1015 * matrix is not singular. 1016 * __ __ 1017 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1018 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1019 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1020 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1021 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1022 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1023 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1024 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1025 * ~~ ~~ 1026 * __ __ 1027 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1028 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1029 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1030 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1031 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1032 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1033 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1034 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1035 * ~~ ~~ 1036 * __ __ 1037 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1038 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1039 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1040 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1041 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1042 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1043 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1044 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1045 * ~~ ~~ 1046 * __ __ 1047 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1048 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1049 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1050 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1051 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1052 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1053 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1054 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1055 * ~~ ~~ 1056 * __ __ 1057 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1058 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1059 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1060 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1061 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1062 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1063 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1064 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1065 * ~~ ~~ 1066 * __ __ 1067 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1068 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1069 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1070 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1071 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1072 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1073 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1074 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1075 * ~~ ~~ 1076 * __ __ 1077 * | 0 0 1 0 0 0 0 0 | 1078 * | 167 100 5 41 159 169 217 208 | 1079 * | 166 100 4 40 158 168 216 209 | 1080 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1081 * | 0 0 0 0 1 0 0 0 | 1082 * | 0 0 0 0 0 1 0 0 | 1083 * | 0 0 0 0 0 0 1 0 | 1084 * | 0 0 0 0 0 0 0 1 | 1085 * ~~ ~~ 1086 * 1087 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1088 * of the missing data. 1089 * 1090 * As is apparent from the example above, the only non-trivial rows in the 1091 * inverse matrix correspond to the data disks that we're trying to 1092 * reconstruct. Indeed, those are the only rows we need as the others would 1093 * only be useful for reconstructing data known or assumed to be valid. For 1094 * that reason, we only build the coefficients in the rows that correspond to 1095 * targeted columns. 1096 */ 1097/* END CSTYLED */ 1098 1099static void 1100vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 1101 uint8_t **rows) 1102{ 1103 int i, j; 1104 int pow; 1105 1106 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 1107 1108 /* 1109 * Fill in the missing rows of interest. 1110 */ 1111 for (i = 0; i < nmap; i++) { 1112 ASSERT3S(0, <=, map[i]); 1113 ASSERT3S(map[i], <=, 2); 1114 1115 pow = map[i] * n; 1116 if (pow > 255) 1117 pow -= 255; 1118 ASSERT(pow <= 255); 1119 1120 for (j = 0; j < n; j++) { 1121 pow -= map[i]; 1122 if (pow < 0) 1123 pow += 255; 1124 rows[i][j] = vdev_raidz_pow2[pow]; 1125 } 1126 } 1127} 1128 1129static void 1130vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 1131 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1132{ 1133 int i, j, ii, jj; 1134 uint8_t log; 1135 1136 /* 1137 * Assert that the first nmissing entries from the array of used 1138 * columns correspond to parity columns and that subsequent entries 1139 * correspond to data columns. 1140 */ 1141 for (i = 0; i < nmissing; i++) { 1142 ASSERT3S(used[i], <, rm->rm_firstdatacol); 1143 } 1144 for (; i < n; i++) { 1145 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 1146 } 1147 1148 /* 1149 * First initialize the storage where we'll compute the inverse rows. 1150 */ 1151 for (i = 0; i < nmissing; i++) { 1152 for (j = 0; j < n; j++) { 1153 invrows[i][j] = (i == j) ? 1 : 0; 1154 } 1155 } 1156 1157 /* 1158 * Subtract all trivial rows from the rows of consequence. 1159 */ 1160 for (i = 0; i < nmissing; i++) { 1161 for (j = nmissing; j < n; j++) { 1162 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 1163 jj = used[j] - rm->rm_firstdatacol; 1164 ASSERT3S(jj, <, n); 1165 invrows[i][j] = rows[i][jj]; 1166 rows[i][jj] = 0; 1167 } 1168 } 1169 1170 /* 1171 * For each of the rows of interest, we must normalize it and subtract 1172 * a multiple of it from the other rows. 1173 */ 1174 for (i = 0; i < nmissing; i++) { 1175 for (j = 0; j < missing[i]; j++) { 1176 ASSERT0(rows[i][j]); 1177 } 1178 ASSERT3U(rows[i][missing[i]], !=, 0); 1179 1180 /* 1181 * Compute the inverse of the first element and multiply each 1182 * element in the row by that value. 1183 */ 1184 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1185 1186 for (j = 0; j < n; j++) { 1187 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1188 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1189 } 1190 1191 for (ii = 0; ii < nmissing; ii++) { 1192 if (i == ii) 1193 continue; 1194 1195 ASSERT3U(rows[ii][missing[i]], !=, 0); 1196 1197 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1198 1199 for (j = 0; j < n; j++) { 1200 rows[ii][j] ^= 1201 vdev_raidz_exp2(rows[i][j], log); 1202 invrows[ii][j] ^= 1203 vdev_raidz_exp2(invrows[i][j], log); 1204 } 1205 } 1206 } 1207 1208 /* 1209 * Verify that the data that is left in the rows are properly part of 1210 * an identity matrix. 1211 */ 1212 for (i = 0; i < nmissing; i++) { 1213 for (j = 0; j < n; j++) { 1214 if (j == missing[i]) { 1215 ASSERT3U(rows[i][j], ==, 1); 1216 } else { 1217 ASSERT0(rows[i][j]); 1218 } 1219 } 1220 } 1221} 1222 1223static void 1224vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1225 int *missing, uint8_t **invrows, const uint8_t *used) 1226{ 1227 int i, j, x, cc, c; 1228 uint8_t *src; 1229 uint64_t ccount; 1230 uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 1231 uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 1232 uint8_t log = 0; 1233 uint8_t val; 1234 int ll; 1235 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1236 uint8_t *p, *pp; 1237 size_t psize; 1238 1239 psize = sizeof (invlog[0][0]) * n * nmissing; 1240 p = kmem_alloc(psize, KM_SLEEP); 1241 1242 for (pp = p, i = 0; i < nmissing; i++) { 1243 invlog[i] = pp; 1244 pp += n; 1245 } 1246 1247 for (i = 0; i < nmissing; i++) { 1248 for (j = 0; j < n; j++) { 1249 ASSERT3U(invrows[i][j], !=, 0); 1250 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1251 } 1252 } 1253 1254 for (i = 0; i < n; i++) { 1255 c = used[i]; 1256 ASSERT3U(c, <, rm->rm_cols); 1257 1258 src = rm->rm_col[c].rc_data; 1259 ccount = rm->rm_col[c].rc_size; 1260 for (j = 0; j < nmissing; j++) { 1261 cc = missing[j] + rm->rm_firstdatacol; 1262 ASSERT3U(cc, >=, rm->rm_firstdatacol); 1263 ASSERT3U(cc, <, rm->rm_cols); 1264 ASSERT3U(cc, !=, c); 1265 1266 dst[j] = rm->rm_col[cc].rc_data; 1267 dcount[j] = rm->rm_col[cc].rc_size; 1268 } 1269 1270 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1271 1272 for (x = 0; x < ccount; x++, src++) { 1273 if (*src != 0) 1274 log = vdev_raidz_log2[*src]; 1275 1276 for (cc = 0; cc < nmissing; cc++) { 1277 if (x >= dcount[cc]) 1278 continue; 1279 1280 if (*src == 0) { 1281 val = 0; 1282 } else { 1283 if ((ll = log + invlog[cc][i]) >= 255) 1284 ll -= 255; 1285 val = vdev_raidz_pow2[ll]; 1286 } 1287 1288 if (i == 0) 1289 dst[cc][x] = val; 1290 else 1291 dst[cc][x] ^= val; 1292 } 1293 } 1294 } 1295 1296 kmem_free(p, psize); 1297} 1298 1299static int 1300vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1301{ 1302 int n, i, c, t, tt; 1303 int nmissing_rows; 1304 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1305 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1306 1307 uint8_t *p, *pp; 1308 size_t psize; 1309 1310 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1311 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1312 uint8_t *used; 1313 1314 int code = 0; 1315 1316 1317 n = rm->rm_cols - rm->rm_firstdatacol; 1318 1319 /* 1320 * Figure out which data columns are missing. 1321 */ 1322 nmissing_rows = 0; 1323 for (t = 0; t < ntgts; t++) { 1324 if (tgts[t] >= rm->rm_firstdatacol) { 1325 missing_rows[nmissing_rows++] = 1326 tgts[t] - rm->rm_firstdatacol; 1327 } 1328 } 1329 1330 /* 1331 * Figure out which parity columns to use to help generate the missing 1332 * data columns. 1333 */ 1334 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1335 ASSERT(tt < ntgts); 1336 ASSERT(c < rm->rm_firstdatacol); 1337 1338 /* 1339 * Skip any targeted parity columns. 1340 */ 1341 if (c == tgts[tt]) { 1342 tt++; 1343 continue; 1344 } 1345 1346 code |= 1 << c; 1347 1348 parity_map[i] = c; 1349 i++; 1350 } 1351 1352 ASSERT(code != 0); 1353 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1354 1355 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1356 nmissing_rows * n + sizeof (used[0]) * n; 1357 p = kmem_alloc(psize, KM_SLEEP); 1358 1359 for (pp = p, i = 0; i < nmissing_rows; i++) { 1360 rows[i] = pp; 1361 pp += n; 1362 invrows[i] = pp; 1363 pp += n; 1364 } 1365 used = pp; 1366 1367 for (i = 0; i < nmissing_rows; i++) { 1368 used[i] = parity_map[i]; 1369 } 1370 1371 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1372 if (tt < nmissing_rows && 1373 c == missing_rows[tt] + rm->rm_firstdatacol) { 1374 tt++; 1375 continue; 1376 } 1377 1378 ASSERT3S(i, <, n); 1379 used[i] = c; 1380 i++; 1381 } 1382 1383 /* 1384 * Initialize the interesting rows of the matrix. 1385 */ 1386 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1387 1388 /* 1389 * Invert the matrix. 1390 */ 1391 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1392 invrows, used); 1393 1394 /* 1395 * Reconstruct the missing data using the generated matrix. 1396 */ 1397 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1398 invrows, used); 1399 1400 kmem_free(p, psize); 1401 1402 return (code); 1403} 1404 1405static int 1406vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1407{ 1408 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1409 int ntgts; 1410 int i, c; 1411 int code; 1412 int nbadparity, nbaddata; 1413 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1414 1415 /* 1416 * The tgts list must already be sorted. 1417 */ 1418 for (i = 1; i < nt; i++) { 1419 ASSERT(t[i] > t[i - 1]); 1420 } 1421 1422 nbadparity = rm->rm_firstdatacol; 1423 nbaddata = rm->rm_cols - nbadparity; 1424 ntgts = 0; 1425 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1426 if (c < rm->rm_firstdatacol) 1427 parity_valid[c] = B_FALSE; 1428 1429 if (i < nt && c == t[i]) { 1430 tgts[ntgts++] = c; 1431 i++; 1432 } else if (rm->rm_col[c].rc_error != 0) { 1433 tgts[ntgts++] = c; 1434 } else if (c >= rm->rm_firstdatacol) { 1435 nbaddata--; 1436 } else { 1437 parity_valid[c] = B_TRUE; 1438 nbadparity--; 1439 } 1440 } 1441 1442 ASSERT(ntgts >= nt); 1443 ASSERT(nbaddata >= 0); 1444 ASSERT(nbaddata + nbadparity == ntgts); 1445 1446 dt = &tgts[nbadparity]; 1447 1448 /* 1449 * See if we can use any of our optimized reconstruction routines. 1450 */ 1451 if (!vdev_raidz_default_to_general) { 1452 switch (nbaddata) { 1453 case 1: 1454 if (parity_valid[VDEV_RAIDZ_P]) 1455 return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1456 1457 ASSERT(rm->rm_firstdatacol > 1); 1458 1459 if (parity_valid[VDEV_RAIDZ_Q]) 1460 return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1461 1462 ASSERT(rm->rm_firstdatacol > 2); 1463 break; 1464 1465 case 2: 1466 ASSERT(rm->rm_firstdatacol > 1); 1467 1468 if (parity_valid[VDEV_RAIDZ_P] && 1469 parity_valid[VDEV_RAIDZ_Q]) 1470 return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1471 1472 ASSERT(rm->rm_firstdatacol > 2); 1473 1474 break; 1475 } 1476 } 1477 1478 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1479 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1480 ASSERT(code > 0); 1481 return (code); 1482} 1483 1484static int 1485vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1486 uint64_t *logical_ashift, uint64_t *physical_ashift) 1487{ 1488 vdev_t *cvd; 1489 uint64_t nparity = vd->vdev_nparity; 1490 int c; 1491 int lasterror = 0; 1492 int numerrors = 0; 1493 1494 ASSERT(nparity > 0); 1495 1496 if (nparity > VDEV_RAIDZ_MAXPARITY || 1497 vd->vdev_children < nparity + 1) { 1498 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1499 return (SET_ERROR(EINVAL)); 1500 } 1501 1502 vdev_open_children(vd); 1503 1504 for (c = 0; c < vd->vdev_children; c++) { 1505 cvd = vd->vdev_child[c]; 1506 1507 if (cvd->vdev_open_error != 0) { 1508 lasterror = cvd->vdev_open_error; 1509 numerrors++; 1510 continue; 1511 } 1512 1513 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1514 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1515 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 1516 *physical_ashift = MAX(*physical_ashift, 1517 cvd->vdev_physical_ashift); 1518 } 1519 1520 *asize *= vd->vdev_children; 1521 *max_asize *= vd->vdev_children; 1522 1523 if (numerrors > nparity) { 1524 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1525 return (lasterror); 1526 } 1527 1528 return (0); 1529} 1530 1531static void 1532vdev_raidz_close(vdev_t *vd) 1533{ 1534 int c; 1535 1536 for (c = 0; c < vd->vdev_children; c++) 1537 vdev_close(vd->vdev_child[c]); 1538} 1539 1540#ifdef illumos 1541/* 1542 * Handle a read or write I/O to a RAID-Z dump device. 1543 * 1544 * The dump device is in a unique situation compared to other ZFS datasets: 1545 * writing to this device should be as simple and fast as possible. In 1546 * addition, durability matters much less since the dump will be extracted 1547 * once the machine reboots. For that reason, this function eschews parity for 1548 * performance and simplicity. The dump device uses the checksum setting 1549 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this 1550 * dataset. 1551 * 1552 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than 1553 * 128 KB will not fill an entire block; in addition, they may not be properly 1554 * aligned. In that case, this function uses the preallocated 128 KB block and 1555 * omits reading or writing any "empty" portions of that block, as opposed to 1556 * allocating a fresh appropriately-sized block. 1557 * 1558 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs: 1559 * 1560 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB) 1561 * 1562 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be 1563 * allocated which spans all five child vdevs. 8 KB of data would be written to 1564 * each of four vdevs, with the fifth containing the parity bits. 1565 * 1566 * parity data data data data 1567 * | PP | XX | XX | XX | XX | 1568 * ^ ^ ^ ^ ^ 1569 * | | | | | 1570 * 8 KB parity ------8 KB data blocks------ 1571 * 1572 * However, when writing to the dump device, the behavior is different: 1573 * 1574 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB) 1575 * 1576 * Unlike the normal RAID-Z case in which the block is allocated based on the 1577 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the 1578 * I/O size is less than 128 KB, only the actual portions of data are written. 1579 * In this example the data is written to the third data vdev since that vdev 1580 * contains the offset [64 KB, 96 KB). 1581 * 1582 * parity data data data data 1583 * | | | | XX | | 1584 * ^ 1585 * | 1586 * 32 KB data block 1587 * 1588 * As a result, an individual I/O may not span all child vdevs; moreover, a 1589 * small I/O may only operate on a single child vdev. 1590 * 1591 * Note that since there are no parity bits calculated or written, this format 1592 * remains the same no matter how many parity bits are used in a normal RAID-Z 1593 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above 1594 * would look like: 1595 * 1596 * parity parity parity data data data data 1597 * | | | | | | XX | | 1598 * ^ 1599 * | 1600 * 32 KB data block 1601 */ 1602int 1603vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, 1604 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump) 1605{ 1606 vdev_t *tvd = vd->vdev_top; 1607 vdev_t *cvd; 1608 raidz_map_t *rm; 1609 raidz_col_t *rc; 1610 int c, err = 0; 1611 1612 uint64_t start, end, colstart, colend; 1613 uint64_t coloffset, colsize, colskip; 1614 1615 int flags = doread ? BIO_READ : BIO_WRITE; 1616 1617#ifdef _KERNEL 1618 1619 /* 1620 * Don't write past the end of the block 1621 */ 1622 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE); 1623 1624 start = offset; 1625 end = start + size; 1626 1627 /* 1628 * Allocate a RAID-Z map for this block. Note that this block starts 1629 * from the "original" offset, this is, the offset of the extent which 1630 * contains the requisite offset of the data being read or written. 1631 * 1632 * Even if this I/O operation doesn't span the full block size, let's 1633 * treat the on-disk format as if the only blocks are the complete 128 1634 * KB size. 1635 */ 1636 rm = vdev_raidz_map_alloc(data - (offset - origoffset), 1637 SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, 1638 vd->vdev_children, vd->vdev_nparity); 1639 1640 coloffset = origoffset; 1641 1642 for (c = rm->rm_firstdatacol; c < rm->rm_cols; 1643 c++, coloffset += rc->rc_size) { 1644 rc = &rm->rm_col[c]; 1645 cvd = vd->vdev_child[rc->rc_devidx]; 1646 1647 /* 1648 * Find the start and end of this column in the RAID-Z map, 1649 * keeping in mind that the stated size and offset of the 1650 * operation may not fill the entire column for this vdev. 1651 * 1652 * If any portion of the data spans this column, issue the 1653 * appropriate operation to the vdev. 1654 */ 1655 if (coloffset + rc->rc_size <= start) 1656 continue; 1657 if (coloffset >= end) 1658 continue; 1659 1660 colstart = MAX(coloffset, start); 1661 colend = MIN(end, coloffset + rc->rc_size); 1662 colsize = colend - colstart; 1663 colskip = colstart - coloffset; 1664 1665 VERIFY3U(colsize, <=, rc->rc_size); 1666 VERIFY3U(colskip, <=, rc->rc_size); 1667 1668 /* 1669 * Note that the child vdev will have a vdev label at the start 1670 * of its range of offsets, hence the need for 1671 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another 1672 * example of why this calculation is needed. 1673 */ 1674 if ((err = vdev_disk_physio(cvd, 1675 ((char *)rc->rc_data) + colskip, colsize, 1676 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, 1677 flags, isdump)) != 0) 1678 break; 1679 } 1680 1681 vdev_raidz_map_free(rm); 1682#endif /* KERNEL */ 1683 1684 return (err); 1685} 1686#endif 1687 1688static uint64_t 1689vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1690{ 1691 uint64_t asize; 1692 uint64_t ashift = vd->vdev_top->vdev_ashift; 1693 uint64_t cols = vd->vdev_children; 1694 uint64_t nparity = vd->vdev_nparity; 1695 1696 asize = ((psize - 1) >> ashift) + 1; 1697 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1698 asize = roundup(asize, nparity + 1) << ashift; 1699 1700 return (asize); 1701} 1702 1703static void 1704vdev_raidz_child_done(zio_t *zio) 1705{ 1706 raidz_col_t *rc = zio->io_private; 1707 1708 rc->rc_error = zio->io_error; 1709 rc->rc_tried = 1; 1710 rc->rc_skipped = 0; 1711} 1712 1713/* 1714 * Start an IO operation on a RAIDZ VDev 1715 * 1716 * Outline: 1717 * - For write operations: 1718 * 1. Generate the parity data 1719 * 2. Create child zio write operations to each column's vdev, for both 1720 * data and parity. 1721 * 3. If the column skips any sectors for padding, create optional dummy 1722 * write zio children for those areas to improve aggregation continuity. 1723 * - For read operations: 1724 * 1. Create child zio read operations to each data column's vdev to read 1725 * the range of data required for zio. 1726 * 2. If this is a scrub or resilver operation, or if any of the data 1727 * vdevs have had errors, then create zio read operations to the parity 1728 * columns' VDevs as well. 1729 */ 1730static void 1731vdev_raidz_io_start(zio_t *zio) 1732{ 1733 vdev_t *vd = zio->io_vd; 1734 vdev_t *tvd = vd->vdev_top; 1735 vdev_t *cvd; 1736 raidz_map_t *rm; 1737 raidz_col_t *rc; 1738 int c, i; 1739 1740 rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset, 1741 zio->io_type == ZIO_TYPE_FREE, 1742 tvd->vdev_ashift, vd->vdev_children, 1743 vd->vdev_nparity); 1744 1745 zio->io_vsd = rm; 1746 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 1747 1748 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1749 1750 if (zio->io_type == ZIO_TYPE_FREE) { 1751 for (c = 0; c < rm->rm_cols; c++) { 1752 rc = &rm->rm_col[c]; 1753 cvd = vd->vdev_child[rc->rc_devidx]; 1754 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1755 rc->rc_offset, rc->rc_data, rc->rc_size, 1756 zio->io_type, zio->io_priority, 0, 1757 vdev_raidz_child_done, rc)); 1758 } 1759 1760 zio_execute(zio); 1761 return; 1762 } 1763 1764 if (zio->io_type == ZIO_TYPE_WRITE) { 1765 vdev_raidz_generate_parity(rm); 1766 1767 for (c = 0; c < rm->rm_cols; c++) { 1768 rc = &rm->rm_col[c]; 1769 cvd = vd->vdev_child[rc->rc_devidx]; 1770 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1771 rc->rc_offset, rc->rc_data, rc->rc_size, 1772 zio->io_type, zio->io_priority, 0, 1773 vdev_raidz_child_done, rc)); 1774 } 1775 1776 /* 1777 * Generate optional I/Os for any skipped sectors to improve 1778 * aggregation contiguity. 1779 */ 1780 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 1781 ASSERT(c <= rm->rm_scols); 1782 if (c == rm->rm_scols) 1783 c = 0; 1784 rc = &rm->rm_col[c]; 1785 cvd = vd->vdev_child[rc->rc_devidx]; 1786 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1787 rc->rc_offset + rc->rc_size, NULL, 1788 1 << tvd->vdev_ashift, 1789 zio->io_type, zio->io_priority, 1790 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1791 } 1792 1793 zio_execute(zio); 1794 return; 1795 } 1796 1797 ASSERT(zio->io_type == ZIO_TYPE_READ); 1798 1799 /* 1800 * Iterate over the columns in reverse order so that we hit the parity 1801 * last -- any errors along the way will force us to read the parity. 1802 */ 1803 for (c = rm->rm_cols - 1; c >= 0; c--) { 1804 rc = &rm->rm_col[c]; 1805 cvd = vd->vdev_child[rc->rc_devidx]; 1806 if (!vdev_readable(cvd)) { 1807 if (c >= rm->rm_firstdatacol) 1808 rm->rm_missingdata++; 1809 else 1810 rm->rm_missingparity++; 1811 rc->rc_error = SET_ERROR(ENXIO); 1812 rc->rc_tried = 1; /* don't even try */ 1813 rc->rc_skipped = 1; 1814 continue; 1815 } 1816 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1817 if (c >= rm->rm_firstdatacol) 1818 rm->rm_missingdata++; 1819 else 1820 rm->rm_missingparity++; 1821 rc->rc_error = SET_ERROR(ESTALE); 1822 rc->rc_skipped = 1; 1823 continue; 1824 } 1825 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 1826 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1827 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1828 rc->rc_offset, rc->rc_data, rc->rc_size, 1829 zio->io_type, zio->io_priority, 0, 1830 vdev_raidz_child_done, rc)); 1831 } 1832 } 1833 1834 zio_execute(zio); 1835} 1836 1837 1838/* 1839 * Report a checksum error for a child of a RAID-Z device. 1840 */ 1841static void 1842raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 1843{ 1844 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1845 1846 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1847 zio_bad_cksum_t zbc; 1848 raidz_map_t *rm = zio->io_vsd; 1849 1850 mutex_enter(&vd->vdev_stat_lock); 1851 vd->vdev_stat.vs_checksum_errors++; 1852 mutex_exit(&vd->vdev_stat_lock); 1853 1854 zbc.zbc_has_cksum = 0; 1855 zbc.zbc_injected = rm->rm_ecksuminjected; 1856 1857 zfs_ereport_post_checksum(zio->io_spa, vd, zio, 1858 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, 1859 &zbc); 1860 } 1861} 1862 1863/* 1864 * We keep track of whether or not there were any injected errors, so that 1865 * any ereports we generate can note it. 1866 */ 1867static int 1868raidz_checksum_verify(zio_t *zio) 1869{ 1870 zio_bad_cksum_t zbc; 1871 raidz_map_t *rm = zio->io_vsd; 1872 1873 int ret = zio_checksum_error(zio, &zbc); 1874 if (ret != 0 && zbc.zbc_injected != 0) 1875 rm->rm_ecksuminjected = 1; 1876 1877 return (ret); 1878} 1879 1880/* 1881 * Generate the parity from the data columns. If we tried and were able to 1882 * read the parity without error, verify that the generated parity matches the 1883 * data we read. If it doesn't, we fire off a checksum error. Return the 1884 * number such failures. 1885 */ 1886static int 1887raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 1888{ 1889 void *orig[VDEV_RAIDZ_MAXPARITY]; 1890 int c, ret = 0; 1891 raidz_col_t *rc; 1892 1893 blkptr_t *bp = zio->io_bp; 1894 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 1895 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 1896 1897 if (checksum == ZIO_CHECKSUM_NOPARITY) 1898 return (ret); 1899 1900 for (c = 0; c < rm->rm_firstdatacol; c++) { 1901 rc = &rm->rm_col[c]; 1902 if (!rc->rc_tried || rc->rc_error != 0) 1903 continue; 1904 orig[c] = zio_buf_alloc(rc->rc_size); 1905 bcopy(rc->rc_data, orig[c], rc->rc_size); 1906 } 1907 1908 vdev_raidz_generate_parity(rm); 1909 1910 for (c = 0; c < rm->rm_firstdatacol; c++) { 1911 rc = &rm->rm_col[c]; 1912 if (!rc->rc_tried || rc->rc_error != 0) 1913 continue; 1914 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1915 raidz_checksum_error(zio, rc, orig[c]); 1916 rc->rc_error = SET_ERROR(ECKSUM); 1917 ret++; 1918 } 1919 zio_buf_free(orig[c], rc->rc_size); 1920 } 1921 1922 return (ret); 1923} 1924 1925/* 1926 * Keep statistics on all the ways that we used parity to correct data. 1927 */ 1928static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 1929 1930static int 1931vdev_raidz_worst_error(raidz_map_t *rm) 1932{ 1933 int error = 0; 1934 1935 for (int c = 0; c < rm->rm_cols; c++) 1936 error = zio_worst_error(error, rm->rm_col[c].rc_error); 1937 1938 return (error); 1939} 1940 1941/* 1942 * Iterate over all combinations of bad data and attempt a reconstruction. 1943 * Note that the algorithm below is non-optimal because it doesn't take into 1944 * account how reconstruction is actually performed. For example, with 1945 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1946 * is targeted as invalid as if columns 1 and 4 are targeted since in both 1947 * cases we'd only use parity information in column 0. 1948 */ 1949static int 1950vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 1951{ 1952 raidz_map_t *rm = zio->io_vsd; 1953 raidz_col_t *rc; 1954 void *orig[VDEV_RAIDZ_MAXPARITY]; 1955 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1956 int *tgts = &tstore[1]; 1957 int current, next, i, c, n; 1958 int code, ret = 0; 1959 1960 ASSERT(total_errors < rm->rm_firstdatacol); 1961 1962 /* 1963 * This simplifies one edge condition. 1964 */ 1965 tgts[-1] = -1; 1966 1967 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1968 /* 1969 * Initialize the targets array by finding the first n columns 1970 * that contain no error. 1971 * 1972 * If there were no data errors, we need to ensure that we're 1973 * always explicitly attempting to reconstruct at least one 1974 * data column. To do this, we simply push the highest target 1975 * up into the data columns. 1976 */ 1977 for (c = 0, i = 0; i < n; i++) { 1978 if (i == n - 1 && data_errors == 0 && 1979 c < rm->rm_firstdatacol) { 1980 c = rm->rm_firstdatacol; 1981 } 1982 1983 while (rm->rm_col[c].rc_error != 0) { 1984 c++; 1985 ASSERT3S(c, <, rm->rm_cols); 1986 } 1987 1988 tgts[i] = c++; 1989 } 1990 1991 /* 1992 * Setting tgts[n] simplifies the other edge condition. 1993 */ 1994 tgts[n] = rm->rm_cols; 1995 1996 /* 1997 * These buffers were allocated in previous iterations. 1998 */ 1999 for (i = 0; i < n - 1; i++) { 2000 ASSERT(orig[i] != NULL); 2001 } 2002 2003 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); 2004 2005 current = 0; 2006 next = tgts[current]; 2007 2008 while (current != n) { 2009 tgts[current] = next; 2010 current = 0; 2011 2012 /* 2013 * Save off the original data that we're going to 2014 * attempt to reconstruct. 2015 */ 2016 for (i = 0; i < n; i++) { 2017 ASSERT(orig[i] != NULL); 2018 c = tgts[i]; 2019 ASSERT3S(c, >=, 0); 2020 ASSERT3S(c, <, rm->rm_cols); 2021 rc = &rm->rm_col[c]; 2022 bcopy(rc->rc_data, orig[i], rc->rc_size); 2023 } 2024 2025 /* 2026 * Attempt a reconstruction and exit the outer loop on 2027 * success. 2028 */ 2029 code = vdev_raidz_reconstruct(rm, tgts, n); 2030 if (raidz_checksum_verify(zio) == 0) { 2031 atomic_inc_64(&raidz_corrected[code]); 2032 2033 for (i = 0; i < n; i++) { 2034 c = tgts[i]; 2035 rc = &rm->rm_col[c]; 2036 ASSERT(rc->rc_error == 0); 2037 if (rc->rc_tried) 2038 raidz_checksum_error(zio, rc, 2039 orig[i]); 2040 rc->rc_error = SET_ERROR(ECKSUM); 2041 } 2042 2043 ret = code; 2044 goto done; 2045 } 2046 2047 /* 2048 * Restore the original data. 2049 */ 2050 for (i = 0; i < n; i++) { 2051 c = tgts[i]; 2052 rc = &rm->rm_col[c]; 2053 bcopy(orig[i], rc->rc_data, rc->rc_size); 2054 } 2055 2056 do { 2057 /* 2058 * Find the next valid column after the current 2059 * position.. 2060 */ 2061 for (next = tgts[current] + 1; 2062 next < rm->rm_cols && 2063 rm->rm_col[next].rc_error != 0; next++) 2064 continue; 2065 2066 ASSERT(next <= tgts[current + 1]); 2067 2068 /* 2069 * If that spot is available, we're done here. 2070 */ 2071 if (next != tgts[current + 1]) 2072 break; 2073 2074 /* 2075 * Otherwise, find the next valid column after 2076 * the previous position. 2077 */ 2078 for (c = tgts[current - 1] + 1; 2079 rm->rm_col[c].rc_error != 0; c++) 2080 continue; 2081 2082 tgts[current] = c; 2083 current++; 2084 2085 } while (current != n); 2086 } 2087 } 2088 n--; 2089done: 2090 for (i = 0; i < n; i++) { 2091 zio_buf_free(orig[i], rm->rm_col[0].rc_size); 2092 } 2093 2094 return (ret); 2095} 2096 2097/* 2098 * Complete an IO operation on a RAIDZ VDev 2099 * 2100 * Outline: 2101 * - For write operations: 2102 * 1. Check for errors on the child IOs. 2103 * 2. Return, setting an error code if too few child VDevs were written 2104 * to reconstruct the data later. Note that partial writes are 2105 * considered successful if they can be reconstructed at all. 2106 * - For read operations: 2107 * 1. Check for errors on the child IOs. 2108 * 2. If data errors occurred: 2109 * a. Try to reassemble the data from the parity available. 2110 * b. If we haven't yet read the parity drives, read them now. 2111 * c. If all parity drives have been read but the data still doesn't 2112 * reassemble with a correct checksum, then try combinatorial 2113 * reconstruction. 2114 * d. If that doesn't work, return an error. 2115 * 3. If there were unexpected errors or this is a resilver operation, 2116 * rewrite the vdevs that had errors. 2117 */ 2118static void 2119vdev_raidz_io_done(zio_t *zio) 2120{ 2121 vdev_t *vd = zio->io_vd; 2122 vdev_t *cvd; 2123 raidz_map_t *rm = zio->io_vsd; 2124 raidz_col_t *rc; 2125 int unexpected_errors = 0; 2126 int parity_errors = 0; 2127 int parity_untried = 0; 2128 int data_errors = 0; 2129 int total_errors = 0; 2130 int n, c; 2131 int tgts[VDEV_RAIDZ_MAXPARITY]; 2132 int code; 2133 2134 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 2135 2136 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 2137 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 2138 2139 for (c = 0; c < rm->rm_cols; c++) { 2140 rc = &rm->rm_col[c]; 2141 2142 if (rc->rc_error) { 2143 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 2144 2145 if (c < rm->rm_firstdatacol) 2146 parity_errors++; 2147 else 2148 data_errors++; 2149 2150 if (!rc->rc_skipped) 2151 unexpected_errors++; 2152 2153 total_errors++; 2154 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 2155 parity_untried++; 2156 } 2157 } 2158 2159 if (zio->io_type == ZIO_TYPE_WRITE) { 2160 /* 2161 * XXX -- for now, treat partial writes as a success. 2162 * (If we couldn't write enough columns to reconstruct 2163 * the data, the I/O failed. Otherwise, good enough.) 2164 * 2165 * Now that we support write reallocation, it would be better 2166 * to treat partial failure as real failure unless there are 2167 * no non-degraded top-level vdevs left, and not update DTLs 2168 * if we intend to reallocate. 2169 */ 2170 /* XXPOLICY */ 2171 if (total_errors > rm->rm_firstdatacol) 2172 zio->io_error = vdev_raidz_worst_error(rm); 2173 2174 return; 2175 } else if (zio->io_type == ZIO_TYPE_FREE) { 2176 return; 2177 } 2178 2179 ASSERT(zio->io_type == ZIO_TYPE_READ); 2180 /* 2181 * There are three potential phases for a read: 2182 * 1. produce valid data from the columns read 2183 * 2. read all disks and try again 2184 * 3. perform combinatorial reconstruction 2185 * 2186 * Each phase is progressively both more expensive and less likely to 2187 * occur. If we encounter more errors than we can repair or all phases 2188 * fail, we have no choice but to return an error. 2189 */ 2190 2191 /* 2192 * If the number of errors we saw was correctable -- less than or equal 2193 * to the number of parity disks read -- attempt to produce data that 2194 * has a valid checksum. Naturally, this case applies in the absence of 2195 * any errors. 2196 */ 2197 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 2198 if (data_errors == 0) { 2199 if (raidz_checksum_verify(zio) == 0) { 2200 /* 2201 * If we read parity information (unnecessarily 2202 * as it happens since no reconstruction was 2203 * needed) regenerate and verify the parity. 2204 * We also regenerate parity when resilvering 2205 * so we can write it out to the failed device 2206 * later. 2207 */ 2208 if (parity_errors + parity_untried < 2209 rm->rm_firstdatacol || 2210 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2211 n = raidz_parity_verify(zio, rm); 2212 unexpected_errors += n; 2213 ASSERT(parity_errors + n <= 2214 rm->rm_firstdatacol); 2215 } 2216 goto done; 2217 } 2218 } else { 2219 /* 2220 * We either attempt to read all the parity columns or 2221 * none of them. If we didn't try to read parity, we 2222 * wouldn't be here in the correctable case. There must 2223 * also have been fewer parity errors than parity 2224 * columns or, again, we wouldn't be in this code path. 2225 */ 2226 ASSERT(parity_untried == 0); 2227 ASSERT(parity_errors < rm->rm_firstdatacol); 2228 2229 /* 2230 * Identify the data columns that reported an error. 2231 */ 2232 n = 0; 2233 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 2234 rc = &rm->rm_col[c]; 2235 if (rc->rc_error != 0) { 2236 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 2237 tgts[n++] = c; 2238 } 2239 } 2240 2241 ASSERT(rm->rm_firstdatacol >= n); 2242 2243 code = vdev_raidz_reconstruct(rm, tgts, n); 2244 2245 if (raidz_checksum_verify(zio) == 0) { 2246 atomic_inc_64(&raidz_corrected[code]); 2247 2248 /* 2249 * If we read more parity disks than were used 2250 * for reconstruction, confirm that the other 2251 * parity disks produced correct data. This 2252 * routine is suboptimal in that it regenerates 2253 * the parity that we already used in addition 2254 * to the parity that we're attempting to 2255 * verify, but this should be a relatively 2256 * uncommon case, and can be optimized if it 2257 * becomes a problem. Note that we regenerate 2258 * parity when resilvering so we can write it 2259 * out to failed devices later. 2260 */ 2261 if (parity_errors < rm->rm_firstdatacol - n || 2262 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2263 n = raidz_parity_verify(zio, rm); 2264 unexpected_errors += n; 2265 ASSERT(parity_errors + n <= 2266 rm->rm_firstdatacol); 2267 } 2268 2269 goto done; 2270 } 2271 } 2272 } 2273 2274 /* 2275 * This isn't a typical situation -- either we got a read error or 2276 * a child silently returned bad data. Read every block so we can 2277 * try again with as much data and parity as we can track down. If 2278 * we've already been through once before, all children will be marked 2279 * as tried so we'll proceed to combinatorial reconstruction. 2280 */ 2281 unexpected_errors = 1; 2282 rm->rm_missingdata = 0; 2283 rm->rm_missingparity = 0; 2284 2285 for (c = 0; c < rm->rm_cols; c++) { 2286 if (rm->rm_col[c].rc_tried) 2287 continue; 2288 2289 zio_vdev_io_redone(zio); 2290 do { 2291 rc = &rm->rm_col[c]; 2292 if (rc->rc_tried) 2293 continue; 2294 zio_nowait(zio_vdev_child_io(zio, NULL, 2295 vd->vdev_child[rc->rc_devidx], 2296 rc->rc_offset, rc->rc_data, rc->rc_size, 2297 zio->io_type, zio->io_priority, 0, 2298 vdev_raidz_child_done, rc)); 2299 } while (++c < rm->rm_cols); 2300 2301 return; 2302 } 2303 2304 /* 2305 * At this point we've attempted to reconstruct the data given the 2306 * errors we detected, and we've attempted to read all columns. There 2307 * must, therefore, be one or more additional problems -- silent errors 2308 * resulting in invalid data rather than explicit I/O errors resulting 2309 * in absent data. We check if there is enough additional data to 2310 * possibly reconstruct the data and then perform combinatorial 2311 * reconstruction over all possible combinations. If that fails, 2312 * we're cooked. 2313 */ 2314 if (total_errors > rm->rm_firstdatacol) { 2315 zio->io_error = vdev_raidz_worst_error(rm); 2316 2317 } else if (total_errors < rm->rm_firstdatacol && 2318 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 2319 /* 2320 * If we didn't use all the available parity for the 2321 * combinatorial reconstruction, verify that the remaining 2322 * parity is correct. 2323 */ 2324 if (code != (1 << rm->rm_firstdatacol) - 1) 2325 (void) raidz_parity_verify(zio, rm); 2326 } else { 2327 /* 2328 * We're here because either: 2329 * 2330 * total_errors == rm_first_datacol, or 2331 * vdev_raidz_combrec() failed 2332 * 2333 * In either case, there is enough bad data to prevent 2334 * reconstruction. 2335 * 2336 * Start checksum ereports for all children which haven't 2337 * failed, and the IO wasn't speculative. 2338 */ 2339 zio->io_error = SET_ERROR(ECKSUM); 2340 2341 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2342 for (c = 0; c < rm->rm_cols; c++) { 2343 rc = &rm->rm_col[c]; 2344 if (rc->rc_error == 0) { 2345 zio_bad_cksum_t zbc; 2346 zbc.zbc_has_cksum = 0; 2347 zbc.zbc_injected = 2348 rm->rm_ecksuminjected; 2349 2350 zfs_ereport_start_checksum( 2351 zio->io_spa, 2352 vd->vdev_child[rc->rc_devidx], 2353 zio, rc->rc_offset, rc->rc_size, 2354 (void *)(uintptr_t)c, &zbc); 2355 } 2356 } 2357 } 2358 } 2359 2360done: 2361 zio_checksum_verified(zio); 2362 2363 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2364 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2365 /* 2366 * Use the good data we have in hand to repair damaged children. 2367 */ 2368 for (c = 0; c < rm->rm_cols; c++) { 2369 rc = &rm->rm_col[c]; 2370 cvd = vd->vdev_child[rc->rc_devidx]; 2371 2372 if (rc->rc_error == 0) 2373 continue; 2374 2375 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2376 rc->rc_offset, rc->rc_data, rc->rc_size, 2377 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2378 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2379 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2380 } 2381 } 2382} 2383 2384static void 2385vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2386{ 2387 if (faulted > vd->vdev_nparity) 2388 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2389 VDEV_AUX_NO_REPLICAS); 2390 else if (degraded + faulted != 0) 2391 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2392 else 2393 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2394} 2395 2396vdev_ops_t vdev_raidz_ops = { 2397 vdev_raidz_open, 2398 vdev_raidz_close, 2399 vdev_raidz_asize, 2400 vdev_raidz_io_start, 2401 vdev_raidz_io_done, 2402 vdev_raidz_state_change, 2403 NULL, 2404 NULL, 2405 VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2406 B_FALSE /* not a leaf vdev */ 2407}; 2408