zio.c revision 260750
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
#if defined(__amd64__)
static int zio_use_uma = 1;
#else
static int zio_use_uma = 0;
#endif
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
    "Use uma(9) for ZIO allocations");
static int zio_exclude_metadata = 0;
TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
    "Exclude metadata buffers from dumps as well");

zio_trim_stats_t zio_trim_stats = {
	{ "bytes",		KSTAT_DATA_UINT64,
	    "Number of bytes successfully TRIMmed" },
	{ "success",		KSTAT_DATA_UINT64,
	    "Number of successful TRIM requests" },
	{ "unsupported",	KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed because TRIM is not supported" },
	{ "failed",		KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed for reasons other than not supported" },
};

static kstat_t *zio_trim_ksp;

/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW		*/
	0,	/* ZIO_PRIORITY_SYNC_READ	*/
	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
	1,	/* ZIO_PRIORITY_CACHE_FILL	*/
	1,	/* ZIO_PRIORITY_AGG		*/
	4,	/* ZIO_PRIORITY_FREE		*/
	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
	10,	/* ZIO_PRIORITY_RESILVER	*/
	20,	/* ZIO_PRIORITY_SCRUB		*/
	2,	/* ZIO_PRIORITY_DDT_PREFETCH	*/
	30,	/* ZIO_PRIORITY_TRIM		*/
};

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
extern int zfs_mg_alloc_failures;

/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

void
zio_init(void)
{
	size_t c;
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (!zio_use_uma)
		goto out;

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
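	/*
	 * A worked example of the selection below, assuming
	 * SPA_MINBLOCKSIZE == 512 and PAGESIZE == 4096: a 2048-byte
	 * buffer is <= 4 * SPA_MINBLOCKSIZE, so its cache is aligned
	 * to 512; 8192 is page-aligned, so its cache is aligned to
	 * PAGESIZE; 5120 (10 * 512) is quarter-power-of-2 aligned
	 * (p2 == 4096, p2 >> 2 == 1024), so its cache is aligned to
	 * 1024; 5632 (11 * 512) matches none of the tests, so no cache
	 * is created for it and the fix-up loop after this one points
	 * it at the next larger cache.
	 */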
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ?
		    KMC_NODEBUG : 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

#ifdef illumos
#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
#endif /* illumos */
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
			align = PAGESIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL,
			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}
out:

	/*
	 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
	 * to fail 3 times per txg or 8 failures, whichever is greater.
	 */
	if (zfs_mg_alloc_failures == 0)
		zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
	else if (zfs_mg_alloc_failures < 8)
		zfs_mg_alloc_failures = 8;

	zio_inject_init();

	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof(zio_trim_stats) / sizeof(kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zio_trim_ksp != NULL) {
		zio_trim_ksp->ks_data = &zio_trim_stats;
		kstat_install(zio_trim_ksp);
	}
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();

	if (zio_trim_ksp != NULL) {
		kstat_delete(zio_trim_ksp);
		zio_trim_ksp = NULL;
	}
}
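
/*
 * Observability note: on FreeBSD, virtual kstats such as the one registered
 * in zio_init() are typically exported under the kstat.zfs.misc.* sysctl
 * tree, so the TRIM counters defined above can usually be read with, e.g.,
 * 'sysctl kstat.zfs.misc.zio_trim'.
 */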

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP | flags));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump.  (Thus reducing
 * the amount of kernel heap dumped to disk when the kernel panics.)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_data_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}
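
/*
 * A sketch of how these pieces combine, drawn from the callers later in
 * this file: when zio_read_bp_init() sees a compressed bp it allocates a
 * scratch buffer and calls
 *
 *	zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
 *
 * so the device read lands in cbuf while zt_orig_data still remembers the
 * caller's buffer.  When the I/O completes, zio_pop_transforms() unwinds
 * the stack in LIFO order, invoking each zt_transform (here, decompressing
 * cbuf into the original buffer) and freeing any scratch buffer whose
 * zt_bufsize is nonzero.
 */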

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 *        continue calling these functions until they return NULL.
 *        Otherwise, the next caller will pick up the list walk in
 *        some indeterminate state.  (Otherwise every caller would
 *        have to pass in a cookie to keep the state represented by
 *        io_walk_link, which gets annoying.)
 */
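/*
 * The usual calling pattern is therefore a loop that runs the walk to
 * completion, e.g. (this is the shape zio_reexecute() below uses):
 *
 *	for (cio = zio_walk_children(pio); cio != NULL;
 *	    cio = zio_walk_children(pio))
 *		...
 */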
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);
	if (--*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_prop = *zp;

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    enum zio_flag flags, zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync(), keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}

void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
		    BP_GET_PSIZE(bp), 0)));
	}
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    uint64_t size, enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	dprintf_bp(bp, "freeing in txg %llu, pass %u",
	    (longlong_t)txg, spa->spa_sync_pass);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	if (zfs_trim_enabled)
		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
		    ZIO_STAGE_VDEV_IO_ASSESS;
	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	zio = zio_create(pio, spa, txg, bp, NULL, size,
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}
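
/*
 * To restate the test in zio_free() concretely: with TRIM disabled, a
 * non-gang, non-dedup block freed in the currently-syncing txg during an
 * early sync pass (before zfs_sync_pass_deferred_free) is freed
 * synchronously on the spot; any other free -- a later pass, a foreign
 * txg, a gang or dedup block, or any free at all when TRIM is enabled --
 * is appended to spa_free_bplist[] and processed later by spa_sync().
 */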

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
		    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    offset, size, done, private, priority, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_eck) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}
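
/*
 * The 'labels' flag above asserts that a physical I/O stays within the
 * front (L0/L1) or back (L2/L3) vdev label regions of the device;
 * vdev_label.c, for instance, passes B_TRUE when reading and writing
 * labels.
 */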

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, int priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

zio_t *
zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
{
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	return (zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size,
	    NULL, NULL, ZIO_PRIORITY_TRIM,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

void
zio_shrink(zio_t *zio, uint64_t size)
{
	ASSERT(zio->io_executor == NULL);
	ASSERT(zio->io_orig_size == zio->io_size);
	ASSERT(size <= zio->io_size);

	/*
	 * We don't shrink for raidz because of problems with the
	 * reconstruction when reading back less than the block size.
	 * Note, BP_IS_RAIDZ() assumes no compression.
	 */
	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	if (!BP_IS_RAIDZ(zio->io_bp))
		zio->io_orig_size = zio->io_size = size;
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		metaslab_class_t *mc = spa_normal_class(spa);
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize,
		    (size_t)metaslab_class_get_minblocksize(mc));
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else {
			ASSERT(psize < lsize);
			zio_push_transform(zio, cbuf, psize, lsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
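
/*
 * Taken together with the sync-pass tunables defined near the top of this
 * file (defaults: zfs_sync_pass_rewrite == 2, zfs_sync_pass_dont_compress
 * == 5), the logic above means pass 1 allocates fresh, possibly compressed
 * blocks; passes 2-4 rewrite a block in place whenever its physical size
 * is unchanged; and from pass 5 on compression is disabled, so physical
 * sizes stop changing between passes and rewrites can always be used --
 * which is what lets spa_sync() converge.
 */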

static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = (cutinline ? TQ_FRONT : 0);

	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq if
	 * available.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);

	/*
	 * NB: We are assuming that the zio can only be dispatched
	 * to a single taskq at a time.  It would be a grievous error
	 * to dispatch the zio to another taskq at the same time.
	 */
#if defined(illumos) || !defined(_KERNEL)
	ASSERT(zio->io_tqent.tqent_next == NULL);
#else
	ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
#endif
	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
	    flags, &zio->io_tqent);
}

static boolean_t
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
		uint_t i;
		for (i = 0; i < tqs->stqs_count; i++) {
			if (taskq_member(tqs->stqs_taskq[i], executor))
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}

static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 *
 *	(1) the I/O completes
 *	(2) the pipeline stalls waiting for dependent child I/Os
 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
 *	(4) the I/O is delegated by vdev-level caching or aggregation
 *	(5) the I/O is deferred due to vdev-level queueing
 *	(6) the I/O is handed off to another thread.
 *
 * In all cases, the pipeline stops whenever there's no CPU work; it never
 * burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];
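
/*
 * io_stage and io_pipeline are bitmasks over enum zio_stage, so the shift
 * loop in zio_execute() advances io_stage to the next bit that is set in
 * the pipeline and dispatches through zio_pipeline[highbit(stage) - 1].
 * For example, a zio created with ZIO_INTERLOCK_PIPELINE runs only the
 * READY and DONE stages; everything in between is skipped by the shift
 * loop.
 */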
1345 * 1346 * For VDEV_IO_START, we cut in line so that the io will 1347 * be sent to disk promptly. 1348 */ 1349 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 1350 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 1351 boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 1352 zio_requeue_io_start_cut_in_line : B_FALSE; 1353 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 1354 return; 1355 } 1356 1357 zio->io_stage = stage; 1358 rv = zio_pipeline[highbit(stage) - 1](zio); 1359 1360 if (rv == ZIO_PIPELINE_STOP) 1361 return; 1362 1363 ASSERT(rv == ZIO_PIPELINE_CONTINUE); 1364 } 1365} 1366 1367/* 1368 * ========================================================================== 1369 * Initiate I/O, either sync or async 1370 * ========================================================================== 1371 */ 1372int 1373zio_wait(zio_t *zio) 1374{ 1375 int error; 1376 1377 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 1378 ASSERT(zio->io_executor == NULL); 1379 1380 zio->io_waiter = curthread; 1381 1382 zio_execute(zio); 1383 1384 mutex_enter(&zio->io_lock); 1385 while (zio->io_executor != NULL) 1386 cv_wait(&zio->io_cv, &zio->io_lock); 1387 mutex_exit(&zio->io_lock); 1388 1389 error = zio->io_error; 1390 zio_destroy(zio); 1391 1392 return (error); 1393} 1394 1395void 1396zio_nowait(zio_t *zio) 1397{ 1398 ASSERT(zio->io_executor == NULL); 1399 1400 if (zio->io_child_type == ZIO_CHILD_LOGICAL && 1401 zio_unique_parent(zio) == NULL) { 1402 /* 1403 * This is a logical async I/O with no parent to wait for it. 1404 * We add it to the spa_async_root_zio "Godfather" I/O which 1405 * will ensure they complete prior to unloading the pool. 1406 */ 1407 spa_t *spa = zio->io_spa; 1408 1409 zio_add_child(spa->spa_async_zio_root, zio); 1410 } 1411 1412 zio_execute(zio); 1413} 1414 1415/* 1416 * ========================================================================== 1417 * Reexecute or suspend/resume failed I/O 1418 * ========================================================================== 1419 */ 1420 1421static void 1422zio_reexecute(zio_t *pio) 1423{ 1424 zio_t *cio, *cio_next; 1425 1426 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 1427 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 1428 ASSERT(pio->io_gang_leader == NULL); 1429 ASSERT(pio->io_gang_tree == NULL); 1430 1431 pio->io_flags = pio->io_orig_flags; 1432 pio->io_stage = pio->io_orig_stage; 1433 pio->io_pipeline = pio->io_orig_pipeline; 1434 pio->io_reexecute = 0; 1435 pio->io_flags |= ZIO_FLAG_REEXECUTED; 1436 pio->io_error = 0; 1437 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1438 pio->io_state[w] = 0; 1439 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 1440 pio->io_child_error[c] = 0; 1441 1442 if (IO_IS_ALLOCATING(pio)) 1443 BP_ZERO(pio->io_bp); 1444 1445 /* 1446 * As we reexecute pio's children, new children could be created. 1447 * New children go to the head of pio's io_child_list, however, 1448 * so we will (correctly) not reexecute them. The key is that 1449 * the remainder of pio's io_child_list, from 'cio_next' onward, 1450 * cannot be affected by any side effects of reexecuting 'cio'. 1451 */ 1452 for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { 1453 cio_next = zio_walk_children(pio); 1454 mutex_enter(&pio->io_lock); 1455 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1456 pio->io_children[cio->io_child_type][w]++; 1457 mutex_exit(&pio->io_lock); 1458 zio_reexecute(cio); 1459 } 1460 1461 /* 1462 * Now that all children have been reexecuted, execute the parent. 
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}
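
/*
 * In practice a pool reaches zio_suspend() when I/O errors are deemed
 * unrecoverable and failmode is "wait" or "continue" (the "panic" case is
 * handled above via fm_panic()); zio_resume() is typically reached from
 * "zpool clear" via zfs_ioc_clear(), and zio_resume_wait() lets a thread
 * block until the pool has been resumed.
 */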
1558 * 1559 * To perform any operation (read, rewrite, free, claim) on a gang block, 1560 * zio_gang_assemble() first assembles the gang tree (minus data leaves) 1561 * in the io_gang_tree field of the original logical i/o by recursively 1562 * reading the gang leader and all gang headers below it. This yields 1563 * an in-core tree containing the contents of every gang header and the 1564 * bps for every constituent of the gang block. 1565 * 1566 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 1567 * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 1568 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 1569 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 1570 * zio_read_gang() is a wrapper around zio_read() that omits reading gang 1571 * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 1572 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 1573 * of the gang header plus zio_checksum_compute() of the data to update the 1574 * gang header's blk_cksum as described above. 1575 * 1576 * The two-phase assemble/issue model solves the problem of partial failure -- 1577 * what if you'd freed part of a gang block but then couldn't read the 1578 * gang header for another part? Assembling the entire gang tree first 1579 * ensures that all the necessary gang header I/O has succeeded before 1580 * starting the actual work of free, claim, or write. Once the gang tree 1581 * is assembled, free and claim are in-memory operations that cannot fail. 1582 * 1583 * In the event that a gang write fails, zio_dva_unallocate() walks the 1584 * gang tree to immediately free (i.e. insert back into the space map) 1585 * everything we've allocated. This ensures that we don't get ENOSPC 1586 * errors during repeated suspend/resume cycles due to a flaky device. 1587 * 1588 * Gang rewrites only happen during sync-to-convergence. If we can't assemble 1589 * the gang tree, we won't modify the block, so we can safely defer the free 1590 * (knowing that the block is still intact). If we *can* assemble the gang 1591 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 1592 * each constituent bp and we can allocate a new block on the next sync pass. 1593 * 1594 * In all cases, the gang tree allows complete recovery from partial failure. 1595 * ========================================================================== 1596 */ 1597 1598static zio_t * 1599zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1600{ 1601 if (gn != NULL) 1602 return (pio); 1603 1604 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 1605 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1606 &pio->io_bookmark)); 1607} 1608 1609zio_t * 1610zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1611{ 1612 zio_t *zio; 1613 1614 if (gn != NULL) { 1615 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1616 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 1617 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1618 /* 1619 * As we rewrite each gang header, the pipeline will compute 1620 * a new gang block header checksum for it; but no one will 1621 * compute a new data checksum, so we do that here. The one 1622 * exception is the gang leader: the pipeline already computed 1623 * its data checksum because that stage precedes gang assembly. 

static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	if (gn != NULL)
		return (pio);

	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
	    &pio->io_bookmark));
}

zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here.  The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
		 * (Presently, nothing actually uses interior data checksums;
		 * this is just good hygiene.)
		 */
		if (gn != pio->io_gang_leader->io_gang_tree) {
			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
			    data, BP_GET_PSIZE(bp));
		}
		/*
		 * If we are here to damage data for testing purposes,
		 * leave the GBH alone so that we can detect the damage.
		 */
		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
	} else {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
	}

	return (zio);
}

/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
	    ZIO_GANG_CHILD_FLAGS(pio)));
}

/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}

static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}

static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	if (gn == gio->io_gang_tree && gio->io_data != NULL)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}

static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	zio_t *gio = zio->io_gang_leader;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}

static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;
	uint64_t lsize;
	int copies = gio->io_prop.zp_copies;
	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int error;

	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 1875 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 1876 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 1877 if (error) { 1878 pio->io_error = error; 1879 return (ZIO_PIPELINE_CONTINUE); 1880 } 1881 1882 if (pio == gio) { 1883 gnpp = &gio->io_gang_tree; 1884 } else { 1885 gnpp = pio->io_private; 1886 ASSERT(pio->io_ready == zio_write_gang_member_ready); 1887 } 1888 1889 gn = zio_gang_node_alloc(gnpp); 1890 gbh = gn->gn_gbh; 1891 bzero(gbh, SPA_GANGBLOCKSIZE); 1892 1893 /* 1894 * Create the gang header. 1895 */ 1896 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 1897 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1898 1899 /* 1900 * Create and nowait the gang children. 1901 */ 1902 for (int g = 0; resid != 0; resid -= lsize, g++) { 1903 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 1904 SPA_MINBLOCKSIZE); 1905 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 1906 1907 zp.zp_checksum = gio->io_prop.zp_checksum; 1908 zp.zp_compress = ZIO_COMPRESS_OFF; 1909 zp.zp_type = DMU_OT_NONE; 1910 zp.zp_level = 0; 1911 zp.zp_copies = gio->io_prop.zp_copies; 1912 zp.zp_dedup = B_FALSE; 1913 zp.zp_dedup_verify = B_FALSE; 1914 zp.zp_nopwrite = B_FALSE; 1915 1916 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 1917 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 1918 zio_write_gang_member_ready, NULL, &gn->gn_child[g], 1919 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1920 &pio->io_bookmark)); 1921 } 1922 1923 /* 1924 * Set pio's pipeline to just wait for zio to finish. 1925 */ 1926 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1927 1928 zio_nowait(zio); 1929 1930 return (ZIO_PIPELINE_CONTINUE); 1931} 1932 1933/* 1934 * The zio_nop_write stage in the pipeline determines if allocating 1935 * a new bp is necessary. By leveraging a cryptographically secure checksum, 1936 * such as SHA256, we can compare the checksums of the new data and the old 1937 * to determine if allocating a new block is required. The nopwrite 1938 * feature can handle writes in either syncing or open context (i.e. zil 1939 * writes) and as a result is mutually exclusive with dedup. 1940 */ 1941static int 1942zio_nop_write(zio_t *zio) 1943{ 1944 blkptr_t *bp = zio->io_bp; 1945 blkptr_t *bp_orig = &zio->io_bp_orig; 1946 zio_prop_t *zp = &zio->io_prop; 1947 1948 ASSERT(BP_GET_LEVEL(bp) == 0); 1949 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1950 ASSERT(zp->zp_nopwrite); 1951 ASSERT(!zp->zp_dedup); 1952 ASSERT(zio->io_bp_override == NULL); 1953 ASSERT(IO_IS_ALLOCATING(zio)); 1954 1955 /* 1956 * Check to see if the original bp and the new bp have matching 1957 * characteristics (i.e. same checksum, compression algorithms, etc). 1958 * If they don't then just continue with the pipeline which will 1959 * allocate a new bp. 1960 */ 1961 if (BP_IS_HOLE(bp_orig) || 1962 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || 1963 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 1964 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 1965 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 1966 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 1967 return (ZIO_PIPELINE_CONTINUE); 1968 1969 /* 1970 * If the checksums match then reset the pipeline so that we 1971 * avoid allocating a new bp and issuing any I/O. 
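	 *
	 * Worked example (annotation, not upstream code): a block that is
	 * rewritten with identical contents, with checksum=sha256 on both
	 * the old and the new bp and matching compression and zp_copies,
	 * passes the property checks above and satisfies
	 * ZIO_CHECKSUM_EQUAL() below; the old bp is then copied over the
	 * new one, ZIO_FLAG_NOPWRITE is set, and the pipeline collapses to
	 * ZIO_INTERLOCK_PIPELINE, so no DVA is allocated and no vdev I/O
	 * is issued.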
1972 */ 1973 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 1974 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); 1975 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 1976 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 1977 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 1978 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 1979 sizeof (uint64_t)) == 0); 1980 1981 *bp = *bp_orig; 1982 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1983 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1984 } 1985 1986 return (ZIO_PIPELINE_CONTINUE); 1987} 1988 1989/* 1990 * ========================================================================== 1991 * Dedup 1992 * ========================================================================== 1993 */ 1994static void 1995zio_ddt_child_read_done(zio_t *zio) 1996{ 1997 blkptr_t *bp = zio->io_bp; 1998 ddt_entry_t *dde = zio->io_private; 1999 ddt_phys_t *ddp; 2000 zio_t *pio = zio_unique_parent(zio); 2001 2002 mutex_enter(&pio->io_lock); 2003 ddp = ddt_phys_select(dde, bp); 2004 if (zio->io_error == 0) 2005 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 2006 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 2007 dde->dde_repair_data = zio->io_data; 2008 else 2009 zio_buf_free(zio->io_data, zio->io_size); 2010 mutex_exit(&pio->io_lock); 2011} 2012 2013static int 2014zio_ddt_read_start(zio_t *zio) 2015{ 2016 blkptr_t *bp = zio->io_bp; 2017 2018 ASSERT(BP_GET_DEDUP(bp)); 2019 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2020 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2021 2022 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2023 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2024 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 2025 ddt_phys_t *ddp = dde->dde_phys; 2026 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 2027 blkptr_t blk; 2028 2029 ASSERT(zio->io_vsd == NULL); 2030 zio->io_vsd = dde; 2031 2032 if (ddp_self == NULL) 2033 return (ZIO_PIPELINE_CONTINUE); 2034 2035 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2036 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 2037 continue; 2038 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2039 &blk); 2040 zio_nowait(zio_read(zio, zio->io_spa, &blk, 2041 zio_buf_alloc(zio->io_size), zio->io_size, 2042 zio_ddt_child_read_done, dde, zio->io_priority, 2043 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 2044 &zio->io_bookmark)); 2045 } 2046 return (ZIO_PIPELINE_CONTINUE); 2047 } 2048 2049 zio_nowait(zio_read(zio, zio->io_spa, bp, 2050 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 2051 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2052 2053 return (ZIO_PIPELINE_CONTINUE); 2054} 2055 2056static int 2057zio_ddt_read_done(zio_t *zio) 2058{ 2059 blkptr_t *bp = zio->io_bp; 2060 2061 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 2062 return (ZIO_PIPELINE_STOP); 2063 2064 ASSERT(BP_GET_DEDUP(bp)); 2065 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2066 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2067 2068 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2069 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2070 ddt_entry_t *dde = zio->io_vsd; 2071 if (ddt == NULL) { 2072 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2073 return (ZIO_PIPELINE_CONTINUE); 2074 } 2075 if (dde == NULL) { 2076 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2077 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2078 return (ZIO_PIPELINE_STOP); 2079 } 2080 if (dde->dde_repair_data != NULL) { 2081 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 2082 zio->io_child_error[ZIO_CHILD_DDT] = 0; 
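			/*
			 * Annotation (not upstream code): dde_repair_data
			 * was filled in by zio_ddt_child_read_done() from a
			 * duplicate DVA that read back cleanly, so after the
			 * bcopy() above the caller's buffer holds good data;
			 * clearing the DDT child error lets the read
			 * complete as a success.
			 */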
2083 } 2084 ddt_repair_done(ddt, dde); 2085 zio->io_vsd = NULL; 2086 } 2087 2088 ASSERT(zio->io_vsd == NULL); 2089 2090 return (ZIO_PIPELINE_CONTINUE); 2091} 2092 2093static boolean_t 2094zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2095{ 2096 spa_t *spa = zio->io_spa; 2097 2098 /* 2099 * Note: we compare the original data, not the transformed data, 2100 * because when zio->io_bp is an override bp, we will not have 2101 * pushed the I/O transforms. That's an important optimization 2102 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2103 */ 2104 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2105 zio_t *lio = dde->dde_lead_zio[p]; 2106 2107 if (lio != NULL) { 2108 return (lio->io_orig_size != zio->io_orig_size || 2109 bcmp(zio->io_orig_data, lio->io_orig_data, 2110 zio->io_orig_size) != 0); 2111 } 2112 } 2113 2114 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2115 ddt_phys_t *ddp = &dde->dde_phys[p]; 2116 2117 if (ddp->ddp_phys_birth != 0) { 2118 arc_buf_t *abuf = NULL; 2119 uint32_t aflags = ARC_WAIT; 2120 blkptr_t blk = *zio->io_bp; 2121 int error; 2122 2123 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2124 2125 ddt_exit(ddt); 2126 2127 error = arc_read(NULL, spa, &blk, 2128 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2129 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2130 &aflags, &zio->io_bookmark); 2131 2132 if (error == 0) { 2133 if (arc_buf_size(abuf) != zio->io_orig_size || 2134 bcmp(abuf->b_data, zio->io_orig_data, 2135 zio->io_orig_size) != 0) 2136 error = SET_ERROR(EEXIST); 2137 VERIFY(arc_buf_remove_ref(abuf, &abuf)); 2138 } 2139 2140 ddt_enter(ddt); 2141 return (error != 0); 2142 } 2143 } 2144 2145 return (B_FALSE); 2146} 2147 2148static void 2149zio_ddt_child_write_ready(zio_t *zio) 2150{ 2151 int p = zio->io_prop.zp_copies; 2152 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2153 ddt_entry_t *dde = zio->io_private; 2154 ddt_phys_t *ddp = &dde->dde_phys[p]; 2155 zio_t *pio; 2156 2157 if (zio->io_error) 2158 return; 2159 2160 ddt_enter(ddt); 2161 2162 ASSERT(dde->dde_lead_zio[p] == zio); 2163 2164 ddt_phys_fill(ddp, zio->io_bp); 2165 2166 while ((pio = zio_walk_parents(zio)) != NULL) 2167 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2168 2169 ddt_exit(ddt); 2170} 2171 2172static void 2173zio_ddt_child_write_done(zio_t *zio) 2174{ 2175 int p = zio->io_prop.zp_copies; 2176 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2177 ddt_entry_t *dde = zio->io_private; 2178 ddt_phys_t *ddp = &dde->dde_phys[p]; 2179 2180 ddt_enter(ddt); 2181 2182 ASSERT(ddp->ddp_refcnt == 0); 2183 ASSERT(dde->dde_lead_zio[p] == zio); 2184 dde->dde_lead_zio[p] = NULL; 2185 2186 if (zio->io_error == 0) { 2187 while (zio_walk_parents(zio) != NULL) 2188 ddt_phys_addref(ddp); 2189 } else { 2190 ddt_phys_clear(ddp); 2191 } 2192 2193 ddt_exit(ddt); 2194} 2195 2196static void 2197zio_ddt_ditto_write_done(zio_t *zio) 2198{ 2199 int p = DDT_PHYS_DITTO; 2200 zio_prop_t *zp = &zio->io_prop; 2201 blkptr_t *bp = zio->io_bp; 2202 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2203 ddt_entry_t *dde = zio->io_private; 2204 ddt_phys_t *ddp = &dde->dde_phys[p]; 2205 ddt_key_t *ddk = &dde->dde_key; 2206 2207 ddt_enter(ddt); 2208 2209 ASSERT(ddp->ddp_refcnt == 0); 2210 ASSERT(dde->dde_lead_zio[p] == zio); 2211 dde->dde_lead_zio[p] = NULL; 2212 2213 if (zio->io_error == 0) { 2214 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2215 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2216 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2217 if 
(ddp->ddp_phys_birth != 0) 2218 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2219 ddt_phys_fill(ddp, bp); 2220 } 2221 2222 ddt_exit(ddt); 2223} 2224 2225static int 2226zio_ddt_write(zio_t *zio) 2227{ 2228 spa_t *spa = zio->io_spa; 2229 blkptr_t *bp = zio->io_bp; 2230 uint64_t txg = zio->io_txg; 2231 zio_prop_t *zp = &zio->io_prop; 2232 int p = zp->zp_copies; 2233 int ditto_copies; 2234 zio_t *cio = NULL; 2235 zio_t *dio = NULL; 2236 ddt_t *ddt = ddt_select(spa, bp); 2237 ddt_entry_t *dde; 2238 ddt_phys_t *ddp; 2239 2240 ASSERT(BP_GET_DEDUP(bp)); 2241 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2242 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2243 2244 ddt_enter(ddt); 2245 dde = ddt_lookup(ddt, bp, B_TRUE); 2246 ddp = &dde->dde_phys[p]; 2247 2248 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2249 /* 2250 * If we're using a weak checksum, upgrade to a strong checksum 2251 * and try again. If we're already using a strong checksum, 2252 * we can't resolve it, so just convert to an ordinary write. 2253 * (And automatically e-mail a paper to Nature?) 2254 */ 2255 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 2256 zp->zp_checksum = spa_dedup_checksum(spa); 2257 zio_pop_transforms(zio); 2258 zio->io_stage = ZIO_STAGE_OPEN; 2259 BP_ZERO(bp); 2260 } else { 2261 zp->zp_dedup = B_FALSE; 2262 } 2263 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2264 ddt_exit(ddt); 2265 return (ZIO_PIPELINE_CONTINUE); 2266 } 2267 2268 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2269 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2270 2271 if (ditto_copies > ddt_ditto_copies_present(dde) && 2272 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2273 zio_prop_t czp = *zp; 2274 2275 czp.zp_copies = ditto_copies; 2276 2277 /* 2278 * If we arrived here with an override bp, we won't have run 2279 * the transform stack, so we won't have the data we need to 2280 * generate a child i/o. So, toss the override bp and restart. 2281 * This is safe, because using the override bp is just an 2282 * optimization; and it's rare, so the cost doesn't matter. 
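	 *
	 * (Annotation: after the reset below, the zio re-enters the
	 * pipeline at ZIO_STAGE_OPEN and runs the full ZIO_WRITE_PIPELINE,
	 * so its io_orig_data is transformed again and the ditto child can
	 * be generated on the next pass, this time without an override bp.)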
2283 */ 2284 if (zio->io_bp_override) { 2285 zio_pop_transforms(zio); 2286 zio->io_stage = ZIO_STAGE_OPEN; 2287 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2288 zio->io_bp_override = NULL; 2289 BP_ZERO(bp); 2290 ddt_exit(ddt); 2291 return (ZIO_PIPELINE_CONTINUE); 2292 } 2293 2294 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2295 zio->io_orig_size, &czp, NULL, 2296 zio_ddt_ditto_write_done, dde, zio->io_priority, 2297 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2298 2299 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2300 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2301 } 2302 2303 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2304 if (ddp->ddp_phys_birth != 0) 2305 ddt_bp_fill(ddp, bp, txg); 2306 if (dde->dde_lead_zio[p] != NULL) 2307 zio_add_child(zio, dde->dde_lead_zio[p]); 2308 else 2309 ddt_phys_addref(ddp); 2310 } else if (zio->io_bp_override) { 2311 ASSERT(bp->blk_birth == txg); 2312 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2313 ddt_phys_fill(ddp, bp); 2314 ddt_phys_addref(ddp); 2315 } else { 2316 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2317 zio->io_orig_size, zp, zio_ddt_child_write_ready, 2318 zio_ddt_child_write_done, dde, zio->io_priority, 2319 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2320 2321 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2322 dde->dde_lead_zio[p] = cio; 2323 } 2324 2325 ddt_exit(ddt); 2326 2327 if (cio) 2328 zio_nowait(cio); 2329 if (dio) 2330 zio_nowait(dio); 2331 2332 return (ZIO_PIPELINE_CONTINUE); 2333} 2334 2335ddt_entry_t *freedde; /* for debugging */ 2336 2337static int 2338zio_ddt_free(zio_t *zio) 2339{ 2340 spa_t *spa = zio->io_spa; 2341 blkptr_t *bp = zio->io_bp; 2342 ddt_t *ddt = ddt_select(spa, bp); 2343 ddt_entry_t *dde; 2344 ddt_phys_t *ddp; 2345 2346 ASSERT(BP_GET_DEDUP(bp)); 2347 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2348 2349 ddt_enter(ddt); 2350 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2351 ddp = ddt_phys_select(dde, bp); 2352 ddt_phys_decref(ddp); 2353 ddt_exit(ddt); 2354 2355 return (ZIO_PIPELINE_CONTINUE); 2356} 2357 2358/* 2359 * ========================================================================== 2360 * Allocate and free blocks 2361 * ========================================================================== 2362 */ 2363static int 2364zio_dva_allocate(zio_t *zio) 2365{ 2366 spa_t *spa = zio->io_spa; 2367 metaslab_class_t *mc = spa_normal_class(spa); 2368 blkptr_t *bp = zio->io_bp; 2369 int error; 2370 int flags = 0; 2371 2372 if (zio->io_gang_leader == NULL) { 2373 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2374 zio->io_gang_leader = zio; 2375 } 2376 2377 ASSERT(BP_IS_HOLE(bp)); 2378 ASSERT0(BP_GET_NDVAS(bp)); 2379 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2380 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2381 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2382 2383 /* 2384 * The dump device does not support gang blocks so allocation on 2385 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2386 * the "fast" gang feature. 2387 */ 2388 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2389 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
2390 METASLAB_GANG_CHILD : 0; 2391 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2392 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2393 2394 if (error) { 2395 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2396 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2397 error); 2398 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2399 return (zio_write_gang_block(zio)); 2400 zio->io_error = error; 2401 } 2402 2403 return (ZIO_PIPELINE_CONTINUE); 2404} 2405 2406static int 2407zio_dva_free(zio_t *zio) 2408{ 2409 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2410 2411 return (ZIO_PIPELINE_CONTINUE); 2412} 2413 2414static int 2415zio_dva_claim(zio_t *zio) 2416{ 2417 int error; 2418 2419 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2420 if (error) 2421 zio->io_error = error; 2422 2423 return (ZIO_PIPELINE_CONTINUE); 2424} 2425 2426/* 2427 * Undo an allocation. This is used by zio_done() when an I/O fails 2428 * and we want to give back the block we just allocated. 2429 * This handles both normal blocks and gang blocks. 2430 */ 2431static void 2432zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2433{ 2434 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2435 ASSERT(zio->io_bp_override == NULL); 2436 2437 if (!BP_IS_HOLE(bp)) 2438 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2439 2440 if (gn != NULL) { 2441 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2442 zio_dva_unallocate(zio, gn->gn_child[g], 2443 &gn->gn_gbh->zg_blkptr[g]); 2444 } 2445 } 2446} 2447 2448/* 2449 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2450 */ 2451int 2452zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2453 uint64_t size, boolean_t use_slog) 2454{ 2455 int error = 1; 2456 2457 ASSERT(txg > spa_syncing_txg(spa)); 2458 2459 /* 2460 * ZIL blocks are always contiguous (i.e. not gang blocks) so we 2461 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2462 * when allocating them. 2463 */ 2464 if (use_slog) { 2465 error = metaslab_alloc(spa, spa_log_class(spa), size, 2466 new_bp, 1, txg, old_bp, 2467 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2468 } 2469 2470 if (error) { 2471 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2472 new_bp, 1, txg, old_bp, 2473 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2474 } 2475 2476 if (error == 0) { 2477 BP_SET_LSIZE(new_bp, size); 2478 BP_SET_PSIZE(new_bp, size); 2479 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2480 BP_SET_CHECKSUM(new_bp, 2481 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2482 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2483 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2484 BP_SET_LEVEL(new_bp, 0); 2485 BP_SET_DEDUP(new_bp, 0); 2486 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2487 } 2488 2489 return (error); 2490} 2491 2492/* 2493 * Free an intent log block. 
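 *
 * (Illustrative pairing, an annotation rather than upstream code: a caller
 * that obtained a ZIL block with zio_alloc_zil(spa, txg, &new_bp, &old_bp,
 * size, B_TRUE) and no longer needs it would release it here as
 * zio_free_zil(spa, txg, &new_bp).)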
2494 */ 2495void 2496zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2497{ 2498 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2499 ASSERT(!BP_IS_GANG(bp)); 2500 2501 zio_free(spa, txg, bp); 2502} 2503 2504/* 2505 * ========================================================================== 2506 * Read, write and delete to physical devices 2507 * ========================================================================== 2508 */ 2509static int 2510zio_vdev_io_start(zio_t *zio) 2511{ 2512 vdev_t *vd = zio->io_vd; 2513 uint64_t align; 2514 spa_t *spa = zio->io_spa; 2515 2516 ASSERT(zio->io_error == 0); 2517 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2518 2519 if (vd == NULL) { 2520 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2521 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2522 2523 /* 2524 * The mirror_ops handle multiple DVAs in a single BP. 2525 */ 2526 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2527 } 2528 2529 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) { 2530 trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); 2531 return (ZIO_PIPELINE_CONTINUE); 2532 } 2533 2534 /* 2535 * We keep track of time-sensitive I/Os so that the scan thread 2536 * can quickly react to certain workloads. In particular, we care 2537 * about non-scrubbing, top-level reads and writes with the following 2538 * characteristics: 2539 * - synchronous writes of user data to non-slog devices 2540 * - any reads of user data 2541 * When these conditions are met, adjust the timestamp of spa_last_io 2542 * which allows the scan thread to adjust its workload accordingly. 2543 */ 2544 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2545 vd == vd->vdev_top && !vd->vdev_islog && 2546 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2547 zio->io_txg != spa_syncing_txg(spa)) { 2548 uint64_t old = spa->spa_last_io; 2549 uint64_t new = ddi_get_lbolt64(); 2550 if (old != new) 2551 (void) atomic_cas_64(&spa->spa_last_io, old, new); 2552 } 2553 2554 align = 1ULL << vd->vdev_top->vdev_ashift; 2555 2556 if (P2PHASE(zio->io_size, align) != 0) { 2557 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2558 char *abuf = NULL; 2559 if (zio->io_type == ZIO_TYPE_READ || 2560 zio->io_type == ZIO_TYPE_WRITE) 2561 abuf = zio_buf_alloc(asize); 2562 ASSERT(vd == vd->vdev_top); 2563 if (zio->io_type == ZIO_TYPE_WRITE) { 2564 bcopy(zio->io_data, abuf, zio->io_size); 2565 bzero(abuf + zio->io_size, asize - zio->io_size); 2566 } 2567 zio_push_transform(zio, abuf, asize, abuf ? asize : 0, 2568 zio_subblock); 2569 } 2570 2571 ASSERT(P2PHASE(zio->io_offset, align) == 0); 2572 ASSERT(P2PHASE(zio->io_size, align) == 0); 2573 VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); 2574 2575 /* 2576 * If this is a repair I/O, and there's no self-healing involved -- 2577 * that is, we're just resilvering what we expect to resilver -- 2578 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2579 * This prevents spurious resilvering with nested replication. 2580 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2581 * A is out of date, we'll read from C+D, then use the data to 2582 * resilver A+B -- but we don't actually want to resilver B, just A. 2583 * The top-level mirror has no way to know this, so instead we just 2584 * discard unnecessary repairs as we work our way down the vdev tree. 2585 * The same logic applies to any form of nested replication: 2586 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 
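	 *
	 * (Annotation: concretely, the check below discards a plain repair
	 * write unless zio->io_txg falls inside this vdev's DTL_PARTIAL,
	 * i.e. unless vd might genuinely be missing data from that txg.)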
2587 */ 2588 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2589 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2590 zio->io_txg != 0 && /* not a delegated i/o */ 2591 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2592 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2593 zio_vdev_io_bypass(zio); 2594 return (ZIO_PIPELINE_CONTINUE); 2595 } 2596 2597 if (vd->vdev_ops->vdev_op_leaf && 2598 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2599 2600 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 2601 return (ZIO_PIPELINE_CONTINUE); 2602 2603 if ((zio = vdev_queue_io(zio)) == NULL) 2604 return (ZIO_PIPELINE_STOP); 2605 2606 if (!vdev_accessible(vd, zio)) { 2607 zio->io_error = SET_ERROR(ENXIO); 2608 zio_interrupt(zio); 2609 return (ZIO_PIPELINE_STOP); 2610 } 2611 } 2612 2613 /* 2614 * Note that we ignore repair writes for TRIM because they can conflict 2615 * with normal writes. This isn't an issue because, by definition, we 2616 * only repair blocks that aren't freed. 2617 */ 2618 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE && 2619 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2620 if (!trim_map_write_start(zio)) 2621 return (ZIO_PIPELINE_STOP); 2622 } 2623 2624 return (vd->vdev_ops->vdev_op_io_start(zio)); 2625} 2626 2627static int 2628zio_vdev_io_done(zio_t *zio) 2629{ 2630 vdev_t *vd = zio->io_vd; 2631 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 2632 boolean_t unexpected_error = B_FALSE; 2633 2634 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2635 return (ZIO_PIPELINE_STOP); 2636 2637 ASSERT(zio->io_type == ZIO_TYPE_READ || 2638 zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); 2639 2640 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2641 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2642 2643 if (zio->io_type == ZIO_TYPE_WRITE && 2644 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 2645 trim_map_write_done(zio); 2646 2647 vdev_queue_io_done(zio); 2648 2649 if (zio->io_type == ZIO_TYPE_WRITE) 2650 vdev_cache_write(zio); 2651 2652 if (zio_injection_enabled && zio->io_error == 0) 2653 zio->io_error = zio_handle_device_injection(vd, 2654 zio, EIO); 2655 2656 if (zio_injection_enabled && zio->io_error == 0) 2657 zio->io_error = zio_handle_label_injection(zio, EIO); 2658 2659 if (zio->io_error) { 2660 if (!vdev_accessible(vd, zio)) { 2661 zio->io_error = SET_ERROR(ENXIO); 2662 } else { 2663 unexpected_error = B_TRUE; 2664 } 2665 } 2666 } 2667 2668 ops->vdev_op_io_done(zio); 2669 2670 if (unexpected_error) 2671 VERIFY(vdev_probe(vd, zio) == NULL); 2672 2673 return (ZIO_PIPELINE_CONTINUE); 2674} 2675 2676/* 2677 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2678 * disk, and use that to finish the checksum ereport later. 
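 *
 * (Annotation: zio_vsd_default_cksum_report() stashes a copy of the bad
 * read in zcr->zcr_cbdata; when the report is later finished,
 * zio_vsd_default_cksum_finish() hands that copy, together with the
 * reconstructed good data, to zfs_ereport_finish_checksum() so the
 * ereport can describe how the two differ.)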
2679 */ 2680static void 2681zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2682 const void *good_buf) 2683{ 2684 /* no processing needed */ 2685 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2686} 2687 2688/*ARGSUSED*/ 2689void 2690zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2691{ 2692 void *buf = zio_buf_alloc(zio->io_size); 2693 2694 bcopy(zio->io_data, buf, zio->io_size); 2695 2696 zcr->zcr_cbinfo = zio->io_size; 2697 zcr->zcr_cbdata = buf; 2698 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2699 zcr->zcr_free = zio_buf_free; 2700} 2701 2702static int 2703zio_vdev_io_assess(zio_t *zio) 2704{ 2705 vdev_t *vd = zio->io_vd; 2706 2707 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2708 return (ZIO_PIPELINE_STOP); 2709 2710 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2711 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2712 2713 if (zio->io_vsd != NULL) { 2714 zio->io_vsd_ops->vsd_free(zio); 2715 zio->io_vsd = NULL; 2716 } 2717 2718 if (zio_injection_enabled && zio->io_error == 0) 2719 zio->io_error = zio_handle_fault_injection(zio, EIO); 2720 2721 if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM) 2722 switch (zio->io_error) { 2723 case 0: 2724 ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 2725 ZIO_TRIM_STAT_BUMP(success); 2726 break; 2727 case EOPNOTSUPP: 2728 ZIO_TRIM_STAT_BUMP(unsupported); 2729 break; 2730 default: 2731 ZIO_TRIM_STAT_BUMP(failed); 2732 break; 2733 } 2734 2735 /* 2736 * If the I/O failed, determine whether we should attempt to retry it. 2737 * 2738 * On retry, we cut in line in the issue queue, since we don't want 2739 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 2740 */ 2741 if (zio->io_error && vd == NULL && 2742 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2743 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2744 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2745 zio->io_error = 0; 2746 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2747 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2748 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2749 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2750 zio_requeue_io_start_cut_in_line); 2751 return (ZIO_PIPELINE_STOP); 2752 } 2753 2754 /* 2755 * If we got an error on a leaf device, convert it to ENXIO 2756 * if the device is not accessible at all. 2757 */ 2758 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2759 !vdev_accessible(vd, zio)) 2760 zio->io_error = SET_ERROR(ENXIO); 2761 2762 /* 2763 * If we can't write to an interior vdev (mirror or RAID-Z), 2764 * set vdev_cant_write so that we stop trying to allocate from it. 
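	 *
	 * (Annotation: an ENXIO write error from a mirror or RAID-Z vdev
	 * means enough of its children failed that the vdev as a whole
	 * could not complete the write, so the allocator should steer new
	 * allocations elsewhere until the device recovers.)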
2765 */ 2766 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2767 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 2768 vd->vdev_cant_write = B_TRUE; 2769 } 2770 2771 if (zio->io_error) 2772 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2773 2774 return (ZIO_PIPELINE_CONTINUE); 2775} 2776 2777void 2778zio_vdev_io_reissue(zio_t *zio) 2779{ 2780 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2781 ASSERT(zio->io_error == 0); 2782 2783 zio->io_stage >>= 1; 2784} 2785 2786void 2787zio_vdev_io_redone(zio_t *zio) 2788{ 2789 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2790 2791 zio->io_stage >>= 1; 2792} 2793 2794void 2795zio_vdev_io_bypass(zio_t *zio) 2796{ 2797 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2798 ASSERT(zio->io_error == 0); 2799 2800 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2801 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2802} 2803 2804/* 2805 * ========================================================================== 2806 * Generate and verify checksums 2807 * ========================================================================== 2808 */ 2809static int 2810zio_checksum_generate(zio_t *zio) 2811{ 2812 blkptr_t *bp = zio->io_bp; 2813 enum zio_checksum checksum; 2814 2815 if (bp == NULL) { 2816 /* 2817 * This is zio_write_phys(). 2818 * We're either generating a label checksum, or none at all. 2819 */ 2820 checksum = zio->io_prop.zp_checksum; 2821 2822 if (checksum == ZIO_CHECKSUM_OFF) 2823 return (ZIO_PIPELINE_CONTINUE); 2824 2825 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 2826 } else { 2827 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 2828 ASSERT(!IO_IS_ALLOCATING(zio)); 2829 checksum = ZIO_CHECKSUM_GANG_HEADER; 2830 } else { 2831 checksum = BP_GET_CHECKSUM(bp); 2832 } 2833 } 2834 2835 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2836 2837 return (ZIO_PIPELINE_CONTINUE); 2838} 2839 2840static int 2841zio_checksum_verify(zio_t *zio) 2842{ 2843 zio_bad_cksum_t info; 2844 blkptr_t *bp = zio->io_bp; 2845 int error; 2846 2847 ASSERT(zio->io_vd != NULL); 2848 2849 if (bp == NULL) { 2850 /* 2851 * This is zio_read_phys(). 2852 * We're either verifying a label checksum, or nothing at all. 2853 */ 2854 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 2855 return (ZIO_PIPELINE_CONTINUE); 2856 2857 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 2858 } 2859 2860 if ((error = zio_checksum_error(zio, &info)) != 0) { 2861 zio->io_error = error; 2862 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2863 zfs_ereport_start_checksum(zio->io_spa, 2864 zio->io_vd, zio, zio->io_offset, 2865 zio->io_size, NULL, &info); 2866 } 2867 } 2868 2869 return (ZIO_PIPELINE_CONTINUE); 2870} 2871 2872/* 2873 * Called by RAID-Z to ensure we don't compute the checksum twice. 2874 */ 2875void 2876zio_checksum_verified(zio_t *zio) 2877{ 2878 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2879} 2880 2881/* 2882 * ========================================================================== 2883 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 2884 * An error of 0 indicates success. ENXIO indicates whole-device failure, 2885 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO 2886 * indicate errors that are specific to one I/O, and most likely permanent. 2887 * Any other error is presumed to be worse because we weren't expecting it. 
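 *
 * Worked examples (annotation, not upstream code): zio_worst_error(EIO,
 * ENXIO) returns EIO, since EIO ranks above ENXIO; zio_worst_error(0,
 * ECKSUM) returns ECKSUM; and an errno absent from the table below, such
 * as EINVAL, runs past the end of zio_error_rank and outranks all four
 * listed values.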
2888 * ========================================================================== 2889 */ 2890int 2891zio_worst_error(int e1, int e2) 2892{ 2893 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 2894 int r1, r2; 2895 2896 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 2897 if (e1 == zio_error_rank[r1]) 2898 break; 2899 2900 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 2901 if (e2 == zio_error_rank[r2]) 2902 break; 2903 2904 return (r1 > r2 ? e1 : e2); 2905} 2906 2907/* 2908 * ========================================================================== 2909 * I/O completion 2910 * ========================================================================== 2911 */ 2912static int 2913zio_ready(zio_t *zio) 2914{ 2915 blkptr_t *bp = zio->io_bp; 2916 zio_t *pio, *pio_next; 2917 2918 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 2919 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 2920 return (ZIO_PIPELINE_STOP); 2921 2922 if (zio->io_ready) { 2923 ASSERT(IO_IS_ALLOCATING(zio)); 2924 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 2925 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 2926 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 2927 2928 zio->io_ready(zio); 2929 } 2930 2931 if (bp != NULL && bp != &zio->io_bp_copy) 2932 zio->io_bp_copy = *bp; 2933 2934 if (zio->io_error) 2935 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2936 2937 mutex_enter(&zio->io_lock); 2938 zio->io_state[ZIO_WAIT_READY] = 1; 2939 pio = zio_walk_parents(zio); 2940 mutex_exit(&zio->io_lock); 2941 2942 /* 2943 * As we notify zio's parents, new parents could be added. 2944 * New parents go to the head of zio's io_parent_list, however, 2945 * so we will (correctly) not notify them. The remainder of zio's 2946 * io_parent_list, from 'pio_next' onward, cannot change because 2947 * all parents must wait for us to be done before they can be done. 2948 */ 2949 for (; pio != NULL; pio = pio_next) { 2950 pio_next = zio_walk_parents(zio); 2951 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 2952 } 2953 2954 if (zio->io_flags & ZIO_FLAG_NODATA) { 2955 if (BP_IS_GANG(bp)) { 2956 zio->io_flags &= ~ZIO_FLAG_NODATA; 2957 } else { 2958 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 2959 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2960 } 2961 } 2962 2963 if (zio_injection_enabled && 2964 zio->io_spa->spa_syncing_txg == zio->io_txg) 2965 zio_handle_ignored_writes(zio); 2966 2967 return (ZIO_PIPELINE_CONTINUE); 2968} 2969 2970static int 2971zio_done(zio_t *zio) 2972{ 2973 spa_t *spa = zio->io_spa; 2974 zio_t *lio = zio->io_logical; 2975 blkptr_t *bp = zio->io_bp; 2976 vdev_t *vd = zio->io_vd; 2977 uint64_t psize = zio->io_size; 2978 zio_t *pio, *pio_next; 2979 2980 /* 2981 * If our children haven't all completed, 2982 * wait for them and then repeat this pipeline stage. 
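	 *
	 * (Annotation: each zio_wait_for_children() call below returns
	 * B_TRUE while children of that type are still outstanding; the
	 * stage then returns ZIO_PIPELINE_STOP and the last completing
	 * child re-dispatches this zio at the same stage, so zio_done()
	 * simply runs again once the whole tree has quiesced.)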
2983 */ 2984 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 2985 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 2986 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 2987 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 2988 return (ZIO_PIPELINE_STOP); 2989 2990 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 2991 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 2992 ASSERT(zio->io_children[c][w] == 0); 2993 2994 if (bp != NULL) { 2995 ASSERT(bp->blk_pad[0] == 0); 2996 ASSERT(bp->blk_pad[1] == 0); 2997 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 2998 (bp == zio_unique_parent(zio)->io_bp)); 2999 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 3000 zio->io_bp_override == NULL && 3001 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 3002 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 3003 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 3004 ASSERT(BP_COUNT_GANG(bp) == 0 || 3005 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 3006 } 3007 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 3008 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 3009 } 3010 3011 /* 3012 * If there were child vdev/gang/ddt errors, they apply to us now. 3013 */ 3014 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 3015 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 3016 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 3017 3018 /* 3019 * If the I/O on the transformed data was successful, generate any 3020 * checksum reports now while we still have the transformed data. 3021 */ 3022 if (zio->io_error == 0) { 3023 while (zio->io_cksum_report != NULL) { 3024 zio_cksum_report_t *zcr = zio->io_cksum_report; 3025 uint64_t align = zcr->zcr_align; 3026 uint64_t asize = P2ROUNDUP(psize, align); 3027 char *abuf = zio->io_data; 3028 3029 if (asize != psize) { 3030 abuf = zio_buf_alloc(asize); 3031 bcopy(zio->io_data, abuf, psize); 3032 bzero(abuf + psize, asize - psize); 3033 } 3034 3035 zio->io_cksum_report = zcr->zcr_next; 3036 zcr->zcr_next = NULL; 3037 zcr->zcr_finish(zcr, abuf); 3038 zfs_ereport_free_checksum(zcr); 3039 3040 if (asize != psize) 3041 zio_buf_free(abuf, asize); 3042 } 3043 } 3044 3045 zio_pop_transforms(zio); /* note: may set zio->io_error */ 3046 3047 vdev_stat_update(zio, psize); 3048 3049 if (zio->io_error) { 3050 /* 3051 * If this I/O is attached to a particular vdev, 3052 * generate an error message describing the I/O failure 3053 * at the block level. We ignore these errors if the 3054 * device is currently unavailable. 3055 */ 3056 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3057 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3058 3059 if ((zio->io_error == EIO || !(zio->io_flags & 3060 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3061 zio == lio) { 3062 /* 3063 * For logical I/O requests, tell the SPA to log the 3064 * error and generate a logical data ereport. 3065 */ 3066 spa_log_error(spa, zio); 3067 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3068 0, 0); 3069 } 3070 } 3071 3072 if (zio->io_error && zio == lio) { 3073 /* 3074 * Determine whether zio should be reexecuted. This will 3075 * propagate all the way to the root via zio_notify_parent(). 
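	 *
	 * (Annotation, summarizing the policy coded below: an allocating,
	 * must-succeed write that fails with ENOSPC suspends the pool,
	 * while any other error on it triggers immediate reexecution; an
	 * ENXIO read or free during normal operation, with failmode not
	 * set to continue, suspends so the zio can be retried once the
	 * device returns; and any remaining must-succeed failure suspends
	 * as well.)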
3076 */ 3077 ASSERT(vd == NULL && bp != NULL); 3078 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3079 3080 if (IO_IS_ALLOCATING(zio) && 3081 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3082 if (zio->io_error != ENOSPC) 3083 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3084 else 3085 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3086 } 3087 3088 if ((zio->io_type == ZIO_TYPE_READ || 3089 zio->io_type == ZIO_TYPE_FREE) && 3090 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3091 zio->io_error == ENXIO && 3092 spa_load_state(spa) == SPA_LOAD_NONE && 3093 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3094 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3095 3096 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3097 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3098 3099 /* 3100 * Here is a possibly good place to attempt to do 3101 * either combinatorial reconstruction or error correction 3102 * based on checksums. It also might be a good place 3103 * to send out preliminary ereports before we suspend 3104 * processing. 3105 */ 3106 } 3107 3108 /* 3109 * If there were logical child errors, they apply to us now. 3110 * We defer this until now to avoid conflating logical child 3111 * errors with errors that happened to the zio itself when 3112 * updating vdev stats and reporting FMA events above. 3113 */ 3114 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3115 3116 if ((zio->io_error || zio->io_reexecute) && 3117 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3118 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3119 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3120 3121 zio_gang_tree_free(&zio->io_gang_tree); 3122 3123 /* 3124 * Godfather I/Os should never suspend. 3125 */ 3126 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3127 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3128 zio->io_reexecute = 0; 3129 3130 if (zio->io_reexecute) { 3131 /* 3132 * This is a logical I/O that wants to reexecute. 3133 * 3134 * Reexecute is top-down. When an i/o fails, if it's not 3135 * the root, it simply notifies its parent and sticks around. 3136 * The parent, seeing that it still has children in zio_done(), 3137 * does the same. This percolates all the way up to the root. 3138 * The root i/o will reexecute or suspend the entire tree. 3139 * 3140 * This approach ensures that zio_reexecute() honors 3141 * all the original i/o dependency relationships, e.g. 3142 * parents not executing until children are ready. 3143 */ 3144 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3145 3146 zio->io_gang_leader = NULL; 3147 3148 mutex_enter(&zio->io_lock); 3149 zio->io_state[ZIO_WAIT_DONE] = 1; 3150 mutex_exit(&zio->io_lock); 3151 3152 /* 3153 * "The Godfather" I/O monitors its children but is 3154 * not a true parent to them. It will track them through 3155 * the pipeline but severs its ties whenever they get into 3156 * trouble (e.g. suspended). This allows "The Godfather" 3157 * I/O to return status without blocking. 3158 */ 3159 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3160 zio_link_t *zl = zio->io_walk_link; 3161 pio_next = zio_walk_parents(zio); 3162 3163 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3164 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3165 zio_remove_child(pio, zio, zl); 3166 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3167 } 3168 } 3169 3170 if ((pio = zio_unique_parent(zio)) != NULL) { 3171 /* 3172 * We're not a root i/o, so there's nothing to do 3173 * but notify our parent. 
Don't propagate errors 3174 * upward since we haven't permanently failed yet. 3175 */ 3176 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3177 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3178 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3179 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3180 /* 3181 * We'd fail again if we reexecuted now, so suspend 3182 * until conditions improve (e.g. device comes online). 3183 */ 3184 zio_suspend(spa, zio); 3185 } else { 3186 /* 3187 * Reexecution is potentially a huge amount of work. 3188 * Hand it off to the otherwise-unused claim taskq. 3189 */ 3190#if defined(illumos) || !defined(_KERNEL) 3191 ASSERT(zio->io_tqent.tqent_next == NULL); 3192#else 3193 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 3194#endif 3195 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3196 ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3197 0, &zio->io_tqent); 3198 } 3199 return (ZIO_PIPELINE_STOP); 3200 } 3201 3202 ASSERT(zio->io_child_count == 0); 3203 ASSERT(zio->io_reexecute == 0); 3204 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3205 3206 /* 3207 * Report any checksum errors, since the I/O is complete. 3208 */ 3209 while (zio->io_cksum_report != NULL) { 3210 zio_cksum_report_t *zcr = zio->io_cksum_report; 3211 zio->io_cksum_report = zcr->zcr_next; 3212 zcr->zcr_next = NULL; 3213 zcr->zcr_finish(zcr, NULL); 3214 zfs_ereport_free_checksum(zcr); 3215 } 3216 3217 /* 3218 * It is the responsibility of the done callback to ensure that this 3219 * particular zio is no longer discoverable for adoption, and as 3220 * such, cannot acquire any new parents. 3221 */ 3222 if (zio->io_done) 3223 zio->io_done(zio); 3224 3225 mutex_enter(&zio->io_lock); 3226 zio->io_state[ZIO_WAIT_DONE] = 1; 3227 mutex_exit(&zio->io_lock); 3228 3229 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3230 zio_link_t *zl = zio->io_walk_link; 3231 pio_next = zio_walk_parents(zio); 3232 zio_remove_child(pio, zio, zl); 3233 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3234 } 3235 3236 if (zio->io_waiter != NULL) { 3237 mutex_enter(&zio->io_lock); 3238 zio->io_executor = NULL; 3239 cv_broadcast(&zio->io_cv); 3240 mutex_exit(&zio->io_lock); 3241 } else { 3242 zio_destroy(zio); 3243 } 3244 3245 return (ZIO_PIPELINE_STOP); 3246} 3247 3248/* 3249 * ========================================================================== 3250 * I/O pipeline definition 3251 * ========================================================================== 3252 */ 3253static zio_pipe_stage_t *zio_pipeline[] = { 3254 NULL, 3255 zio_read_bp_init, 3256 zio_free_bp_init, 3257 zio_issue_async, 3258 zio_write_bp_init, 3259 zio_checksum_generate, 3260 zio_nop_write, 3261 zio_ddt_read_start, 3262 zio_ddt_read_done, 3263 zio_ddt_write, 3264 zio_ddt_free, 3265 zio_gang_assemble, 3266 zio_gang_issue, 3267 zio_dva_allocate, 3268 zio_dva_free, 3269 zio_dva_claim, 3270 zio_ready, 3271 zio_vdev_io_start, 3272 zio_vdev_io_done, 3273 zio_vdev_io_assess, 3274 zio_checksum_verify, 3275 zio_done 3276}; 3277 3278/* dnp is the dnode for zb1->zb_object */ 3279boolean_t 3280zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, 3281 const zbookmark_t *zb2) 3282{ 3283 uint64_t zb1nextL0, zb2thisobj; 3284 3285 ASSERT(zb1->zb_objset == zb2->zb_objset); 3286 ASSERT(zb2->zb_level == 0); 3287 3288 /* 3289 * A bookmark in the deadlist is considered to be after 3290 * everything else. 
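	 *
	 * Worked example (annotation, not upstream code): with
	 * dn_indblkshift = 14 and SPA_BLKPTRSHIFT = 7, each level-1 block
	 * pointer spans 1 << 7 = 128 level-0 blocks, so for zb1 at
	 * zb_level = 1, zb_blkid = 3 the computation below gives
	 * zb1nextL0 = (3 + 1) << (1 * 7) = 512, the first level-0 block
	 * not covered by zb1.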
3291 */ 3292 if (zb2->zb_object == DMU_DEADLIST_OBJECT) 3293 return (B_TRUE); 3294 3295 /* The objset_phys_t isn't before anything. */ 3296 if (dnp == NULL) 3297 return (B_FALSE); 3298 3299 zb1nextL0 = (zb1->zb_blkid + 1) << 3300 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 3301 3302 zb2thisobj = zb2->zb_object ? zb2->zb_object : 3303 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 3304 3305 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3306 uint64_t nextobj = zb1nextL0 * 3307 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 3308 return (nextobj <= zb2thisobj); 3309 } 3310 3311 if (zb1->zb_object < zb2thisobj) 3312 return (B_TRUE); 3313 if (zb1->zb_object > zb2thisobj) 3314 return (B_FALSE); 3315 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 3316 return (B_FALSE); 3317 return (zb1nextL0 <= zb2->zb_blkid); 3318} 3319
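/*
 * Annotation (not upstream code): a hedged usage sketch of
 * zbookmark_is_before() above, along the lines of how scrub/resilver
 * resume logic might consult it; dnp is the dnode for candidate_zb's
 * object and resume_zb must be a level-0 bookmark:
 *
 *	if (zbookmark_is_before(dnp, &candidate_zb, &resume_zb)) {
 *		// candidate_zb precedes the saved resume point,
 *		// so it was covered on an earlier pass; skip it.
 *	}
 */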