zio.c revision 250149
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
static int zio_use_uma = 0;
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
    "Use uma(9) for ZIO allocations");
static int zio_exclude_metadata = 0;
TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN,
    &zio_exclude_metadata, 0, "Exclude metadata buffers from dumps as well");

zio_trim_stats_t zio_trim_stats = {
	{ "bytes",		KSTAT_DATA_UINT64,
	    "Number of bytes successfully TRIMmed" },
	{ "success",		KSTAT_DATA_UINT64,
	    "Number of successful TRIM requests" },
	{ "unsupported",	KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed because TRIM is not supported" },
	{ "failed",		KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed for reasons other than not supported" },
};

static kstat_t *zio_trim_ksp;

/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW */
	0,	/* ZIO_PRIORITY_SYNC_READ */
	0,	/* ZIO_PRIORITY_SYNC_WRITE */
	0,	/* ZIO_PRIORITY_LOG_WRITE */
	1,	/* ZIO_PRIORITY_CACHE_FILL */
	1,	/* ZIO_PRIORITY_AGG */
	4,	/* ZIO_PRIORITY_FREE */
	4,	/* ZIO_PRIORITY_ASYNC_WRITE */
	6,	/* ZIO_PRIORITY_ASYNC_READ */
	10,	/* ZIO_PRIORITY_RESILVER */
	20,	/* ZIO_PRIORITY_SCRUB */
	2,	/* ZIO_PRIORITY_DDT_PREFETCH */
	30,	/* ZIO_PRIORITY_TRIM */
};

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};
/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
extern int zfs_mg_alloc_failures;

/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

void
zio_init(void)
{
	size_t c;

	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (!zio_use_uma)
		goto out;

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

#ifdef illumos
#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
#endif /* illumos */
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
			align = PAGESIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL,
			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}
out:

	/*
	 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
	 * to fail 3 times per txg or 8 failures, whichever is greater.
	 */
	if (zfs_mg_alloc_failures == 0)
		zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
	else if (zfs_mg_alloc_failures < 8)
		zfs_mg_alloc_failures = 8;

	zio_inject_init();

	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof(zio_trim_stats) / sizeof(kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zio_trim_ksp != NULL) {
		zio_trim_ksp->ks_data = &zio_trim_stats;
		kstat_install(zio_trim_ksp);
	}
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();

	if (zio_trim_ksp != NULL) {
		kstat_delete(zio_trim_ksp);
		zio_trim_ksp = NULL;
	}
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP|flags));
}
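
/*
 * Worked example (illustrative only, not used by the code): with
 * SPA_MINBLOCKSHIFT == 9, a 20K (20480-byte) allocation maps to cache
 * index c = (20480 - 1) >> 9 = 39, i.e. the "zio_buf_20480" cache.
 * Sizes that did not get a cache of their own in zio_init() are served
 * by the next-larger cache, thanks to the fill loop at the end of
 * zio_init().
 */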
/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump, thus reducing the
 * amount of kernel heap dumped to disk when the kernel panics.
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_data_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}
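
/*
 * How a transform pairs up in practice (illustrative sketch): for a
 * compressed logical read, zio_read_bp_init() below pushes a psize-sized
 * temporary buffer with zio_decompress as the callback.  On completion,
 * zio_pop_transforms() calls zio_decompress() to inflate the temporary
 * buffer back into the caller's original io_data, then frees the
 * temporary buffer and restores io_data/io_size.
 */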
/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 *        continue calling these functions until they return NULL.
 *        Otherwise, the next caller will pick up the list walk in
 *        some indeterminate state.  (Otherwise every caller would
 *        have to pass in a cookie to keep the state represented by
 *        io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

/*
 * If the zio still has outstanding children of the given type in the given
 * wait class, roll io_stage back one stage, record where we stalled, and
 * return B_TRUE so the caller stops; the last completing child will restart
 * the pipeline via zio_notify_parent().
 */
static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

/*
 * Propagate a completing child's error and reexecute state into the parent;
 * if the parent was stalled waiting on this child and it was the last one,
 * resume the parent's pipeline.
 */
static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);
	if (--*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}
static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */

/*
 * Allocate and initialize a new zio: determine its child type, wire it into
 * the parent's child list (if any), and set up its initial stage and
 * pipeline.
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}
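
/*
 * Typical usage sketch (illustrative): callers that need to issue several
 * I/Os and wait for all of them usually hang the children off a root zio,
 * then wait on the root, which completes only after every child does:
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	for each block pointer bp:
 *		zio_nowait(zio_read(rio, spa, bp, buf, size, NULL, NULL,
 *		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, zb));
 *	error = zio_wait(rio);
 */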
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_prop = *zp;

	return (zio);
}
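
/*
 * Example property block (illustrative) satisfying every condition in the
 * ASSERT in zio_write() above -- a plain, single-copy data write with a
 * fletcher4 checksum and no compression:
 *
 *	zio_prop_t zp = { 0 };
 *	zp.zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
 *	zp.zp_compress = ZIO_COMPRESS_OFF;
 *	zp.zp_type = DMU_OT_PLAIN_FILE_CONTENTS;
 *	zp.zp_level = 0;
 *	zp.zp_copies = 1;
 */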
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    enum zio_flag flags, zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync(), keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}

void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
	metaslab_check_free(spa, bp);
	bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    uint64_t size, enum zio_flag flags)
{
	zio_t *zio;

	dprintf_bp(bp, "freeing in txg %llu, pass %u",
	    (longlong_t)txg, spa->spa_sync_pass);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	metaslab_check_free(spa, bp);

	zio = zio_create(pio, spa, txg, bp, NULL, size,
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
		    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    offset, size, done, private, priority, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_eck) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}
/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, int priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

zio_t *
zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
{

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	return (zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size,
	    NULL, NULL, ZIO_PRIORITY_TRIM,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

void
zio_shrink(zio_t *zio, uint64_t size)
{
	ASSERT(zio->io_executor == NULL);
	ASSERT(zio->io_orig_size == zio->io_size);
	ASSERT(size <= zio->io_size);

	/*
	 * We don't shrink for raidz because of problems with the
	 * reconstruction when reading back less than the block size.
	 * Note, BP_IS_RAIDZ() assumes no compression.
	 */
	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	if (!BP_IS_RAIDZ(zio->io_bp))
		zio->io_orig_size = zio->io_size = size;
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else {
			ASSERT(psize < lsize);
			zio_push_transform(zio, cbuf, psize, lsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
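
/*
 * Worked example (illustrative) of the sync-pass policy implemented above,
 * using the default tunables: in pass 1, writes may compress and allocate
 * new blocks; from pass 2 on (zfs_sync_pass_rewrite), a block whose
 * physical size is unchanged is rewritten in place instead of being
 * reallocated; from pass 5 on (zfs_sync_pass_dont_compress), compression
 * is disabled entirely, so block sizes -- and therefore allocations --
 * stabilize and spa_sync() can converge.
 */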
static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */
static void
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);

	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1] != NULL)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
#ifdef _KERNEL
	(void) taskq_dispatch_safe(spa->spa_zio_taskq[t][q],
	    (task_func_t *)zio_execute, zio, flags, &zio->io_task);
#else
	(void) taskq_dispatch(spa->spa_zio_taskq[t][q],
	    (task_func_t *)zio_execute, zio, flags);
#endif
}

static boolean_t
zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++)
		if (taskq_member(spa->spa_zio_taskq[t][q], executor))
			return (B_TRUE);

	return (B_FALSE);
}

static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}
/*
 * Execute the I/O pipeline until one of the following occurs:
 * (1) the I/O completes; (2) the pipeline stalls waiting for
 * dependent child I/Os; (3) the I/O issues, so we're waiting
 * for an I/O completion interrupt; (4) the I/O is delegated by
 * vdev-level caching or aggregation; (5) the I/O is deferred
 * due to vdev-level queueing; (6) the I/O is handed off to
 * another thread.  In all cases, the pipeline stops whenever
 * there's no CPU work; it never burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[highbit(stage) - 1](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_root_zio "Godfather" I/O which
		 * will ensure it completes prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root, zio);
	}

	zio_execute(zio);
}
/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_flags |= ZIO_FLAG_REEXECUTED;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */
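
/*
 * Shape of a small gang tree (illustrative):
 *
 *	gang leader (logical zio)
 *	  io_gang_tree --> gang header
 *			     bp[0]: data block (leaf)
 *			     bp[1]: gang header (nested)
 *			       bp[0]: data block (leaf)
 *			       bp[1]: data block (leaf)
 *			     bp[2]: data block (leaf)
 *
 * Root and interior nodes are gang headers; only the leaves carry user
 * data.
 */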
static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	if (gn != NULL)
		return (pio);

	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
	    &pio->io_bookmark));
}

zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here.  The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
		 * (Presently, nothing actually uses interior data checksums;
		 * this is just good hygiene.)
		 */
		if (gn != pio->io_gang_leader->io_gang_tree) {
			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
			    data, BP_GET_PSIZE(bp));
		}
		/*
		 * If we are here to damage data for testing purposes,
		 * leave the GBH alone so that we can detect the damage.
		 */
		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
	} else {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
	}

	return (zio);
}
/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
	    ZIO_GANG_CHILD_FLAGS(pio)));
}

/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}
static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}

static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	if (gn == gio->io_gang_tree && gio->io_data != NULL)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}

static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	zio_t *gio = zio->io_gang_leader;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}
metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 1826 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 1827 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 1828 if (error) { 1829 pio->io_error = error; 1830 return (ZIO_PIPELINE_CONTINUE); 1831 } 1832 1833 if (pio == gio) { 1834 gnpp = &gio->io_gang_tree; 1835 } else { 1836 gnpp = pio->io_private; 1837 ASSERT(pio->io_ready == zio_write_gang_member_ready); 1838 } 1839 1840 gn = zio_gang_node_alloc(gnpp); 1841 gbh = gn->gn_gbh; 1842 bzero(gbh, SPA_GANGBLOCKSIZE); 1843 1844 /* 1845 * Create the gang header. 1846 */ 1847 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 1848 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1849 1850 /* 1851 * Create and nowait the gang children. 1852 */ 1853 for (int g = 0; resid != 0; resid -= lsize, g++) { 1854 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 1855 SPA_MINBLOCKSIZE); 1856 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 1857 1858 zp.zp_checksum = gio->io_prop.zp_checksum; 1859 zp.zp_compress = ZIO_COMPRESS_OFF; 1860 zp.zp_type = DMU_OT_NONE; 1861 zp.zp_level = 0; 1862 zp.zp_copies = gio->io_prop.zp_copies; 1863 zp.zp_dedup = B_FALSE; 1864 zp.zp_dedup_verify = B_FALSE; 1865 zp.zp_nopwrite = B_FALSE; 1866 1867 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 1868 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 1869 zio_write_gang_member_ready, NULL, &gn->gn_child[g], 1870 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1871 &pio->io_bookmark)); 1872 } 1873 1874 /* 1875 * Set pio's pipeline to just wait for zio to finish. 1876 */ 1877 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1878 1879 zio_nowait(zio); 1880 1881 return (ZIO_PIPELINE_CONTINUE); 1882} 1883 1884/* 1885 * The zio_nop_write stage in the pipeline determines if allocating 1886 * a new bp is necessary. By leveraging a cryptographically secure checksum, 1887 * such as SHA256, we can compare the checksums of the new data and the old 1888 * to determine if allocating a new block is required. The nopwrite 1889 * feature can handle writes in either syncing or open context (i.e. zil 1890 * writes) and as a result is mutually exclusive with dedup. 1891 */ 1892static int 1893zio_nop_write(zio_t *zio) 1894{ 1895 blkptr_t *bp = zio->io_bp; 1896 blkptr_t *bp_orig = &zio->io_bp_orig; 1897 zio_prop_t *zp = &zio->io_prop; 1898 1899 ASSERT(BP_GET_LEVEL(bp) == 0); 1900 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1901 ASSERT(zp->zp_nopwrite); 1902 ASSERT(!zp->zp_dedup); 1903 ASSERT(zio->io_bp_override == NULL); 1904 ASSERT(IO_IS_ALLOCATING(zio)); 1905 1906 /* 1907 * Check to see if the original bp and the new bp have matching 1908 * characteristics (i.e. same checksum, compression algorithms, etc). 1909 * If they don't then just continue with the pipeline which will 1910 * allocate a new bp. 1911 */ 1912 if (BP_IS_HOLE(bp_orig) || 1913 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || 1914 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 1915 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 1916 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 1917 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 1918 return (ZIO_PIPELINE_CONTINUE); 1919 1920 /* 1921 * If the checksums match then reset the pipeline so that we 1922 * avoid allocating a new bp and issuing any I/O. 
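 * ("Reset the pipeline" here means collapsing io_pipeline down to
 * ZIO_INTERLOCK_PIPELINE, exactly as the code below does, so only the
 * ready/done interlock stages still run; the DVA-allocate and vdev I/O
 * stages are skipped entirely for this write.)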
1923 */ 1924 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 1925 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); 1926 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 1927 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 1928 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 1929 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 1930 sizeof (uint64_t)) == 0); 1931 1932 *bp = *bp_orig; 1933 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1934 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1935 } 1936 1937 return (ZIO_PIPELINE_CONTINUE); 1938} 1939 1940/* 1941 * ========================================================================== 1942 * Dedup 1943 * ========================================================================== 1944 */ 1945static void 1946zio_ddt_child_read_done(zio_t *zio) 1947{ 1948 blkptr_t *bp = zio->io_bp; 1949 ddt_entry_t *dde = zio->io_private; 1950 ddt_phys_t *ddp; 1951 zio_t *pio = zio_unique_parent(zio); 1952 1953 mutex_enter(&pio->io_lock); 1954 ddp = ddt_phys_select(dde, bp); 1955 if (zio->io_error == 0) 1956 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 1957 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 1958 dde->dde_repair_data = zio->io_data; 1959 else 1960 zio_buf_free(zio->io_data, zio->io_size); 1961 mutex_exit(&pio->io_lock); 1962} 1963 1964static int 1965zio_ddt_read_start(zio_t *zio) 1966{ 1967 blkptr_t *bp = zio->io_bp; 1968 1969 ASSERT(BP_GET_DEDUP(bp)); 1970 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 1971 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1972 1973 if (zio->io_child_error[ZIO_CHILD_DDT]) { 1974 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1975 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 1976 ddt_phys_t *ddp = dde->dde_phys; 1977 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 1978 blkptr_t blk; 1979 1980 ASSERT(zio->io_vsd == NULL); 1981 zio->io_vsd = dde; 1982 1983 if (ddp_self == NULL) 1984 return (ZIO_PIPELINE_CONTINUE); 1985 1986 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 1987 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 1988 continue; 1989 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 1990 &blk); 1991 zio_nowait(zio_read(zio, zio->io_spa, &blk, 1992 zio_buf_alloc(zio->io_size), zio->io_size, 1993 zio_ddt_child_read_done, dde, zio->io_priority, 1994 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 1995 &zio->io_bookmark)); 1996 } 1997 return (ZIO_PIPELINE_CONTINUE); 1998 } 1999 2000 zio_nowait(zio_read(zio, zio->io_spa, bp, 2001 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 2002 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2003 2004 return (ZIO_PIPELINE_CONTINUE); 2005} 2006 2007static int 2008zio_ddt_read_done(zio_t *zio) 2009{ 2010 blkptr_t *bp = zio->io_bp; 2011 2012 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 2013 return (ZIO_PIPELINE_STOP); 2014 2015 ASSERT(BP_GET_DEDUP(bp)); 2016 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2017 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2018 2019 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2020 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2021 ddt_entry_t *dde = zio->io_vsd; 2022 if (ddt == NULL) { 2023 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2024 return (ZIO_PIPELINE_CONTINUE); 2025 } 2026 if (dde == NULL) { 2027 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2028 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2029 return (ZIO_PIPELINE_STOP); 2030 } 2031 if (dde->dde_repair_data != NULL) { 2032 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 2033 zio->io_child_error[ZIO_CHILD_DDT] = 0; 
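			/*
			 * Note: dde_repair_data was just copied over io_data
			 * above, so clearing the DDT child error lets this
			 * read complete successfully from the repaired copy.
			 */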
2034 } 2035 ddt_repair_done(ddt, dde); 2036 zio->io_vsd = NULL; 2037 } 2038 2039 ASSERT(zio->io_vsd == NULL); 2040 2041 return (ZIO_PIPELINE_CONTINUE); 2042} 2043 2044static boolean_t 2045zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2046{ 2047 spa_t *spa = zio->io_spa; 2048 2049 /* 2050 * Note: we compare the original data, not the transformed data, 2051 * because when zio->io_bp is an override bp, we will not have 2052 * pushed the I/O transforms. That's an important optimization 2053 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2054 */ 2055 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2056 zio_t *lio = dde->dde_lead_zio[p]; 2057 2058 if (lio != NULL) { 2059 return (lio->io_orig_size != zio->io_orig_size || 2060 bcmp(zio->io_orig_data, lio->io_orig_data, 2061 zio->io_orig_size) != 0); 2062 } 2063 } 2064 2065 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2066 ddt_phys_t *ddp = &dde->dde_phys[p]; 2067 2068 if (ddp->ddp_phys_birth != 0) { 2069 arc_buf_t *abuf = NULL; 2070 uint32_t aflags = ARC_WAIT; 2071 blkptr_t blk = *zio->io_bp; 2072 int error; 2073 2074 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2075 2076 ddt_exit(ddt); 2077 2078 error = arc_read(NULL, spa, &blk, 2079 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2080 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2081 &aflags, &zio->io_bookmark); 2082 2083 if (error == 0) { 2084 if (arc_buf_size(abuf) != zio->io_orig_size || 2085 bcmp(abuf->b_data, zio->io_orig_data, 2086 zio->io_orig_size) != 0) 2087 error = SET_ERROR(EEXIST); 2088 VERIFY(arc_buf_remove_ref(abuf, &abuf)); 2089 } 2090 2091 ddt_enter(ddt); 2092 return (error != 0); 2093 } 2094 } 2095 2096 return (B_FALSE); 2097} 2098 2099static void 2100zio_ddt_child_write_ready(zio_t *zio) 2101{ 2102 int p = zio->io_prop.zp_copies; 2103 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2104 ddt_entry_t *dde = zio->io_private; 2105 ddt_phys_t *ddp = &dde->dde_phys[p]; 2106 zio_t *pio; 2107 2108 if (zio->io_error) 2109 return; 2110 2111 ddt_enter(ddt); 2112 2113 ASSERT(dde->dde_lead_zio[p] == zio); 2114 2115 ddt_phys_fill(ddp, zio->io_bp); 2116 2117 while ((pio = zio_walk_parents(zio)) != NULL) 2118 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2119 2120 ddt_exit(ddt); 2121} 2122 2123static void 2124zio_ddt_child_write_done(zio_t *zio) 2125{ 2126 int p = zio->io_prop.zp_copies; 2127 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2128 ddt_entry_t *dde = zio->io_private; 2129 ddt_phys_t *ddp = &dde->dde_phys[p]; 2130 2131 ddt_enter(ddt); 2132 2133 ASSERT(ddp->ddp_refcnt == 0); 2134 ASSERT(dde->dde_lead_zio[p] == zio); 2135 dde->dde_lead_zio[p] = NULL; 2136 2137 if (zio->io_error == 0) { 2138 while (zio_walk_parents(zio) != NULL) 2139 ddt_phys_addref(ddp); 2140 } else { 2141 ddt_phys_clear(ddp); 2142 } 2143 2144 ddt_exit(ddt); 2145} 2146 2147static void 2148zio_ddt_ditto_write_done(zio_t *zio) 2149{ 2150 int p = DDT_PHYS_DITTO; 2151 zio_prop_t *zp = &zio->io_prop; 2152 blkptr_t *bp = zio->io_bp; 2153 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2154 ddt_entry_t *dde = zio->io_private; 2155 ddt_phys_t *ddp = &dde->dde_phys[p]; 2156 ddt_key_t *ddk = &dde->dde_key; 2157 2158 ddt_enter(ddt); 2159 2160 ASSERT(ddp->ddp_refcnt == 0); 2161 ASSERT(dde->dde_lead_zio[p] == zio); 2162 dde->dde_lead_zio[p] = NULL; 2163 2164 if (zio->io_error == 0) { 2165 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2166 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2167 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2168 if 
(ddp->ddp_phys_birth != 0) 2169 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2170 ddt_phys_fill(ddp, bp); 2171 } 2172 2173 ddt_exit(ddt); 2174} 2175 2176static int 2177zio_ddt_write(zio_t *zio) 2178{ 2179 spa_t *spa = zio->io_spa; 2180 blkptr_t *bp = zio->io_bp; 2181 uint64_t txg = zio->io_txg; 2182 zio_prop_t *zp = &zio->io_prop; 2183 int p = zp->zp_copies; 2184 int ditto_copies; 2185 zio_t *cio = NULL; 2186 zio_t *dio = NULL; 2187 ddt_t *ddt = ddt_select(spa, bp); 2188 ddt_entry_t *dde; 2189 ddt_phys_t *ddp; 2190 2191 ASSERT(BP_GET_DEDUP(bp)); 2192 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2193 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2194 2195 ddt_enter(ddt); 2196 dde = ddt_lookup(ddt, bp, B_TRUE); 2197 ddp = &dde->dde_phys[p]; 2198 2199 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2200 /* 2201 * If we're using a weak checksum, upgrade to a strong checksum 2202 * and try again. If we're already using a strong checksum, 2203 * we can't resolve it, so just convert to an ordinary write. 2204 * (And automatically e-mail a paper to Nature?) 2205 */ 2206 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 2207 zp->zp_checksum = spa_dedup_checksum(spa); 2208 zio_pop_transforms(zio); 2209 zio->io_stage = ZIO_STAGE_OPEN; 2210 BP_ZERO(bp); 2211 } else { 2212 zp->zp_dedup = B_FALSE; 2213 } 2214 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2215 ddt_exit(ddt); 2216 return (ZIO_PIPELINE_CONTINUE); 2217 } 2218 2219 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2220 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2221 2222 if (ditto_copies > ddt_ditto_copies_present(dde) && 2223 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2224 zio_prop_t czp = *zp; 2225 2226 czp.zp_copies = ditto_copies; 2227 2228 /* 2229 * If we arrived here with an override bp, we won't have run 2230 * the transform stack, so we won't have the data we need to 2231 * generate a child i/o. So, toss the override bp and restart. 2232 * This is safe, because using the override bp is just an 2233 * optimization; and it's rare, so the cost doesn't matter. 
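 * (Restarting means rewinding io_stage to ZIO_STAGE_OPEN and switching
 * back to ZIO_WRITE_PIPELINE, as done below, so the data passes through
 * the compress/checksum transform stages again on the next pass.)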
2234 */ 2235 if (zio->io_bp_override) { 2236 zio_pop_transforms(zio); 2237 zio->io_stage = ZIO_STAGE_OPEN; 2238 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2239 zio->io_bp_override = NULL; 2240 BP_ZERO(bp); 2241 ddt_exit(ddt); 2242 return (ZIO_PIPELINE_CONTINUE); 2243 } 2244 2245 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2246 zio->io_orig_size, &czp, NULL, 2247 zio_ddt_ditto_write_done, dde, zio->io_priority, 2248 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2249 2250 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2251 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2252 } 2253 2254 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2255 if (ddp->ddp_phys_birth != 0) 2256 ddt_bp_fill(ddp, bp, txg); 2257 if (dde->dde_lead_zio[p] != NULL) 2258 zio_add_child(zio, dde->dde_lead_zio[p]); 2259 else 2260 ddt_phys_addref(ddp); 2261 } else if (zio->io_bp_override) { 2262 ASSERT(bp->blk_birth == txg); 2263 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2264 ddt_phys_fill(ddp, bp); 2265 ddt_phys_addref(ddp); 2266 } else { 2267 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2268 zio->io_orig_size, zp, zio_ddt_child_write_ready, 2269 zio_ddt_child_write_done, dde, zio->io_priority, 2270 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2271 2272 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2273 dde->dde_lead_zio[p] = cio; 2274 } 2275 2276 ddt_exit(ddt); 2277 2278 if (cio) 2279 zio_nowait(cio); 2280 if (dio) 2281 zio_nowait(dio); 2282 2283 return (ZIO_PIPELINE_CONTINUE); 2284} 2285 2286ddt_entry_t *freedde; /* for debugging */ 2287 2288static int 2289zio_ddt_free(zio_t *zio) 2290{ 2291 spa_t *spa = zio->io_spa; 2292 blkptr_t *bp = zio->io_bp; 2293 ddt_t *ddt = ddt_select(spa, bp); 2294 ddt_entry_t *dde; 2295 ddt_phys_t *ddp; 2296 2297 ASSERT(BP_GET_DEDUP(bp)); 2298 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2299 2300 ddt_enter(ddt); 2301 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2302 ddp = ddt_phys_select(dde, bp); 2303 ddt_phys_decref(ddp); 2304 ddt_exit(ddt); 2305 2306 return (ZIO_PIPELINE_CONTINUE); 2307} 2308 2309/* 2310 * ========================================================================== 2311 * Allocate and free blocks 2312 * ========================================================================== 2313 */ 2314static int 2315zio_dva_allocate(zio_t *zio) 2316{ 2317 spa_t *spa = zio->io_spa; 2318 metaslab_class_t *mc = spa_normal_class(spa); 2319 blkptr_t *bp = zio->io_bp; 2320 int error; 2321 int flags = 0; 2322 2323 if (zio->io_gang_leader == NULL) { 2324 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2325 zio->io_gang_leader = zio; 2326 } 2327 2328 ASSERT(BP_IS_HOLE(bp)); 2329 ASSERT0(BP_GET_NDVAS(bp)); 2330 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2331 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2332 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2333 2334 /* 2335 * The dump device does not support gang blocks so allocation on 2336 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2337 * the "fast" gang feature. 2338 */ 2339 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2340 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
2341 METASLAB_GANG_CHILD : 0; 2342 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2343 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2344 2345 if (error) { 2346 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2347 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2348 error); 2349 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2350 return (zio_write_gang_block(zio)); 2351 zio->io_error = error; 2352 } 2353 2354 return (ZIO_PIPELINE_CONTINUE); 2355} 2356 2357static int 2358zio_dva_free(zio_t *zio) 2359{ 2360 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2361 2362 return (ZIO_PIPELINE_CONTINUE); 2363} 2364 2365static int 2366zio_dva_claim(zio_t *zio) 2367{ 2368 int error; 2369 2370 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2371 if (error) 2372 zio->io_error = error; 2373 2374 return (ZIO_PIPELINE_CONTINUE); 2375} 2376 2377/* 2378 * Undo an allocation. This is used by zio_done() when an I/O fails 2379 * and we want to give back the block we just allocated. 2380 * This handles both normal blocks and gang blocks. 2381 */ 2382static void 2383zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2384{ 2385 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2386 ASSERT(zio->io_bp_override == NULL); 2387 2388 if (!BP_IS_HOLE(bp)) 2389 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2390 2391 if (gn != NULL) { 2392 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2393 zio_dva_unallocate(zio, gn->gn_child[g], 2394 &gn->gn_gbh->zg_blkptr[g]); 2395 } 2396 } 2397} 2398 2399/* 2400 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2401 */ 2402int 2403zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2404 uint64_t size, boolean_t use_slog) 2405{ 2406 int error = 1; 2407 2408 ASSERT(txg > spa_syncing_txg(spa)); 2409 2410 /* 2411 * ZIL blocks are always contiguous (i.e. not gang blocks) so we 2412 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2413 * when allocating them. 2414 */ 2415 if (use_slog) { 2416 error = metaslab_alloc(spa, spa_log_class(spa), size, 2417 new_bp, 1, txg, old_bp, 2418 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2419 } 2420 2421 if (error) { 2422 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2423 new_bp, 1, txg, old_bp, 2424 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2425 } 2426 2427 if (error == 0) { 2428 BP_SET_LSIZE(new_bp, size); 2429 BP_SET_PSIZE(new_bp, size); 2430 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2431 BP_SET_CHECKSUM(new_bp, 2432 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2433 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2434 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2435 BP_SET_LEVEL(new_bp, 0); 2436 BP_SET_DEDUP(new_bp, 0); 2437 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2438 } 2439 2440 return (error); 2441} 2442 2443/* 2444 * Free an intent log block. 
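 * The block must be an intent log block, and ZIL blocks are never gang
 * blocks (see zio_alloc_zil() above), so this is just a checked wrapper
 * around zio_free().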
2445 */ 2446 void 2447 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2448 { 2449 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2450 ASSERT(!BP_IS_GANG(bp)); 2451 2452 zio_free(spa, txg, bp); 2453 } 2454 2455 /* 2456 * ========================================================================== 2457 * Read, write, and delete operations on physical devices 2458 * ========================================================================== 2459 */ 2460 static int 2461 zio_vdev_io_start(zio_t *zio) 2462 { 2463 vdev_t *vd = zio->io_vd; 2464 uint64_t align; 2465 spa_t *spa = zio->io_spa; 2466 2467 ASSERT(zio->io_error == 0); 2468 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2469 2470 if (vd == NULL) { 2471 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2472 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2473 2474 /* 2475 * The mirror_ops handle multiple DVAs in a single BP. 2476 */ 2477 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2478 } 2479 2480 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) { 2481 trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); 2482 return (ZIO_PIPELINE_CONTINUE); 2483 } 2484 2485 /* 2486 * We keep track of time-sensitive I/Os so that the scan thread 2487 * can quickly react to certain workloads. In particular, we care 2488 * about non-scrubbing, top-level reads and writes with the following 2489 * characteristics: 2490 * - synchronous writes of user data to non-slog devices 2491 * - any reads of user data 2492 * When these conditions are met, adjust the timestamp of spa_last_io, 2493 * which allows the scan thread to adjust its workload accordingly. 2494 */ 2495 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2496 vd == vd->vdev_top && !vd->vdev_islog && 2497 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2498 zio->io_txg != spa_syncing_txg(spa)) { 2499 uint64_t old = spa->spa_last_io; 2500 uint64_t new = ddi_get_lbolt64(); 2501 if (old != new) 2502 (void) atomic_cas_64(&spa->spa_last_io, old, new); 2503 } 2504 2505 align = 1ULL << vd->vdev_top->vdev_ashift; 2506 2507 if (P2PHASE(zio->io_size, align) != 0) { 2508 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2509 char *abuf = NULL; 2510 if (zio->io_type == ZIO_TYPE_READ || 2511 zio->io_type == ZIO_TYPE_WRITE) 2512 abuf = zio_buf_alloc(asize); 2513 ASSERT(vd == vd->vdev_top); 2514 if (zio->io_type == ZIO_TYPE_WRITE) { 2515 bcopy(zio->io_data, abuf, zio->io_size); 2516 bzero(abuf + zio->io_size, asize - zio->io_size); 2517 } 2518 zio_push_transform(zio, abuf, asize, abuf ? asize : 0, 2519 zio_subblock); 2520 } 2521 2522 ASSERT(P2PHASE(zio->io_offset, align) == 0); 2523 ASSERT(P2PHASE(zio->io_size, align) == 0); 2524 VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); 2525 2526 /* 2527 * If this is a repair I/O, and there's no self-healing involved -- 2528 * that is, we're just resilvering what we expect to resilver -- 2529 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2530 * This prevents spurious resilvering with nested replication. 2531 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2532 * A is out of date, we'll read from C+D, then use the data to 2533 * resilver A+B -- but we don't actually want to resilver B, just A. 2534 * The top-level mirror has no way to know this, so instead we just 2535 * discard unnecessary repairs as we work our way down the vdev tree. 2536 * The same logic applies to any form of nested replication: 2537 * ditto + mirror, RAID-Z + replacing, etc. This covers them all.
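 * In the (A+B)+(C+D) example above, the repair write headed for B would
 * arrive here with ZIO_FLAG_IO_REPAIR set but with a txg that is not in
 * B's DTL_PARTIAL, so it is bypassed below rather than issued.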
2538 */ 2539 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2540 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2541 zio->io_txg != 0 && /* not a delegated i/o */ 2542 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2543 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2544 zio_vdev_io_bypass(zio); 2545 return (ZIO_PIPELINE_CONTINUE); 2546 } 2547 2548 if (vd->vdev_ops->vdev_op_leaf && 2549 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2550 2551 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 2552 return (ZIO_PIPELINE_CONTINUE); 2553 2554 if ((zio = vdev_queue_io(zio)) == NULL) 2555 return (ZIO_PIPELINE_STOP); 2556 2557 if (!vdev_accessible(vd, zio)) { 2558 zio->io_error = SET_ERROR(ENXIO); 2559 zio_interrupt(zio); 2560 return (ZIO_PIPELINE_STOP); 2561 } 2562 } 2563 2564 /* 2565 * Note that we ignore repair writes for TRIM because they can conflict 2566 * with normal writes. This isn't an issue because, by definition, we 2567 * only repair blocks that aren't freed. 2568 */ 2569 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE && 2570 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2571 if (!trim_map_write_start(zio)) 2572 return (ZIO_PIPELINE_STOP); 2573 } 2574 2575 return (vd->vdev_ops->vdev_op_io_start(zio)); 2576} 2577 2578static int 2579zio_vdev_io_done(zio_t *zio) 2580{ 2581 vdev_t *vd = zio->io_vd; 2582 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 2583 boolean_t unexpected_error = B_FALSE; 2584 2585 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2586 return (ZIO_PIPELINE_STOP); 2587 2588 ASSERT(zio->io_type == ZIO_TYPE_READ || 2589 zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); 2590 2591 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2592 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2593 2594 if (zio->io_type == ZIO_TYPE_WRITE && 2595 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 2596 trim_map_write_done(zio); 2597 2598 vdev_queue_io_done(zio); 2599 2600 if (zio->io_type == ZIO_TYPE_WRITE) 2601 vdev_cache_write(zio); 2602 2603 if (zio_injection_enabled && zio->io_error == 0) 2604 zio->io_error = zio_handle_device_injection(vd, 2605 zio, EIO); 2606 2607 if (zio_injection_enabled && zio->io_error == 0) 2608 zio->io_error = zio_handle_label_injection(zio, EIO); 2609 2610 if (zio->io_error) { 2611 if (!vdev_accessible(vd, zio)) { 2612 zio->io_error = SET_ERROR(ENXIO); 2613 } else { 2614 unexpected_error = B_TRUE; 2615 } 2616 } 2617 } 2618 2619 ops->vdev_op_io_done(zio); 2620 2621 if (unexpected_error) 2622 VERIFY(vdev_probe(vd, zio) == NULL); 2623 2624 return (ZIO_PIPELINE_CONTINUE); 2625} 2626 2627/* 2628 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2629 * disk, and use that to finish the checksum ereport later. 
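 * (RAID-Z instead supplies its own io_vsd_ops with a cksum_report
 * callback, since reconstructing the expected data there requires the
 * column layout that only the raidz code knows.)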
2630 */ 2631static void 2632zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2633 const void *good_buf) 2634{ 2635 /* no processing needed */ 2636 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2637} 2638 2639/*ARGSUSED*/ 2640void 2641zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2642{ 2643 void *buf = zio_buf_alloc(zio->io_size); 2644 2645 bcopy(zio->io_data, buf, zio->io_size); 2646 2647 zcr->zcr_cbinfo = zio->io_size; 2648 zcr->zcr_cbdata = buf; 2649 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2650 zcr->zcr_free = zio_buf_free; 2651} 2652 2653static int 2654zio_vdev_io_assess(zio_t *zio) 2655{ 2656 vdev_t *vd = zio->io_vd; 2657 2658 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2659 return (ZIO_PIPELINE_STOP); 2660 2661 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2662 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2663 2664 if (zio->io_vsd != NULL) { 2665 zio->io_vsd_ops->vsd_free(zio); 2666 zio->io_vsd = NULL; 2667 } 2668 2669 if (zio_injection_enabled && zio->io_error == 0) 2670 zio->io_error = zio_handle_fault_injection(zio, EIO); 2671 2672 if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM) 2673 switch (zio->io_error) { 2674 case 0: 2675 ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 2676 ZIO_TRIM_STAT_BUMP(success); 2677 break; 2678 case EOPNOTSUPP: 2679 ZIO_TRIM_STAT_BUMP(unsupported); 2680 break; 2681 default: 2682 ZIO_TRIM_STAT_BUMP(failed); 2683 break; 2684 } 2685 2686 /* 2687 * If the I/O failed, determine whether we should attempt to retry it. 2688 * 2689 * On retry, we cut in line in the issue queue, since we don't want 2690 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 2691 */ 2692 if (zio->io_error && vd == NULL && 2693 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2694 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2695 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2696 zio->io_error = 0; 2697 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2698 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2699 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2700 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2701 zio_requeue_io_start_cut_in_line); 2702 return (ZIO_PIPELINE_STOP); 2703 } 2704 2705 /* 2706 * If we got an error on a leaf device, convert it to ENXIO 2707 * if the device is not accessible at all. 2708 */ 2709 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2710 !vdev_accessible(vd, zio)) 2711 zio->io_error = SET_ERROR(ENXIO); 2712 2713 /* 2714 * If we can't write to an interior vdev (mirror or RAID-Z), 2715 * set vdev_cant_write so that we stop trying to allocate from it. 
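 * (vdev_allocatable() checks vdev_cant_write, so the metaslab allocator
 * steers new allocations away from this vdev until a successful reopen
 * clears the flag.)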
2716 */ 2717 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2718 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 2719 vd->vdev_cant_write = B_TRUE; 2720 } 2721 2722 if (zio->io_error) 2723 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2724 2725 return (ZIO_PIPELINE_CONTINUE); 2726 } 2727 2728 void 2729 zio_vdev_io_reissue(zio_t *zio) 2730 { 2731 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2732 ASSERT(zio->io_error == 0); 2733 2734 zio->io_stage >>= 1; 2735 } 2736 2737 void 2738 zio_vdev_io_redone(zio_t *zio) 2739 { 2740 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2741 2742 zio->io_stage >>= 1; 2743 } 2744 2745 void 2746 zio_vdev_io_bypass(zio_t *zio) 2747 { 2748 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2749 ASSERT(zio->io_error == 0); 2750 2751 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2752 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2753 } 2754 2755 /* 2756 * ========================================================================== 2757 * Generate and verify checksums 2758 * ========================================================================== 2759 */ 2760 static int 2761 zio_checksum_generate(zio_t *zio) 2762 { 2763 blkptr_t *bp = zio->io_bp; 2764 enum zio_checksum checksum; 2765 2766 if (bp == NULL) { 2767 /* 2768 * This is zio_write_phys(). 2769 * We're either generating a label checksum, or none at all. 2770 */ 2771 checksum = zio->io_prop.zp_checksum; 2772 2773 if (checksum == ZIO_CHECKSUM_OFF) 2774 return (ZIO_PIPELINE_CONTINUE); 2775 2776 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 2777 } else { 2778 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 2779 ASSERT(!IO_IS_ALLOCATING(zio)); 2780 checksum = ZIO_CHECKSUM_GANG_HEADER; 2781 } else { 2782 checksum = BP_GET_CHECKSUM(bp); 2783 } 2784 } 2785 2786 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2787 2788 return (ZIO_PIPELINE_CONTINUE); 2789 } 2790 2791 static int 2792 zio_checksum_verify(zio_t *zio) 2793 { 2794 zio_bad_cksum_t info; 2795 blkptr_t *bp = zio->io_bp; 2796 int error; 2797 2798 ASSERT(zio->io_vd != NULL); 2799 2800 if (bp == NULL) { 2801 /* 2802 * This is zio_read_phys(). 2803 * We're either verifying a label checksum, or nothing at all. 2804 */ 2805 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 2806 return (ZIO_PIPELINE_CONTINUE); 2807 2808 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 2809 } 2810 2811 if ((error = zio_checksum_error(zio, &info)) != 0) { 2812 zio->io_error = error; 2813 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2814 zfs_ereport_start_checksum(zio->io_spa, 2815 zio->io_vd, zio, zio->io_offset, 2816 zio->io_size, NULL, &info); 2817 } 2818 } 2819 2820 return (ZIO_PIPELINE_CONTINUE); 2821 } 2822 2823 /* 2824 * Called by RAID-Z to ensure we don't compute the checksum twice. 2825 */ 2826 void 2827 zio_checksum_verified(zio_t *zio) 2828 { 2829 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2830 } 2831 2832 /* 2833 * ========================================================================== 2834 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 2835 * An error of 0 indicates success. ENXIO indicates whole-device failure, 2836 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO 2837 * indicate errors that are specific to one I/O, and most likely permanent. 2838 * Any other error is presumed to be worse because we weren't expecting it.
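 * For example, zio_worst_error(ENXIO, ECKSUM) returns ECKSUM, and an
 * errno missing from the table entirely (e.g. EINVAL) outranks them all.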
2839 * ========================================================================== 2840 */ 2841int 2842zio_worst_error(int e1, int e2) 2843{ 2844 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 2845 int r1, r2; 2846 2847 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 2848 if (e1 == zio_error_rank[r1]) 2849 break; 2850 2851 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 2852 if (e2 == zio_error_rank[r2]) 2853 break; 2854 2855 return (r1 > r2 ? e1 : e2); 2856} 2857 2858/* 2859 * ========================================================================== 2860 * I/O completion 2861 * ========================================================================== 2862 */ 2863static int 2864zio_ready(zio_t *zio) 2865{ 2866 blkptr_t *bp = zio->io_bp; 2867 zio_t *pio, *pio_next; 2868 2869 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 2870 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 2871 return (ZIO_PIPELINE_STOP); 2872 2873 if (zio->io_ready) { 2874 ASSERT(IO_IS_ALLOCATING(zio)); 2875 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 2876 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 2877 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 2878 2879 zio->io_ready(zio); 2880 } 2881 2882 if (bp != NULL && bp != &zio->io_bp_copy) 2883 zio->io_bp_copy = *bp; 2884 2885 if (zio->io_error) 2886 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2887 2888 mutex_enter(&zio->io_lock); 2889 zio->io_state[ZIO_WAIT_READY] = 1; 2890 pio = zio_walk_parents(zio); 2891 mutex_exit(&zio->io_lock); 2892 2893 /* 2894 * As we notify zio's parents, new parents could be added. 2895 * New parents go to the head of zio's io_parent_list, however, 2896 * so we will (correctly) not notify them. The remainder of zio's 2897 * io_parent_list, from 'pio_next' onward, cannot change because 2898 * all parents must wait for us to be done before they can be done. 2899 */ 2900 for (; pio != NULL; pio = pio_next) { 2901 pio_next = zio_walk_parents(zio); 2902 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 2903 } 2904 2905 if (zio->io_flags & ZIO_FLAG_NODATA) { 2906 if (BP_IS_GANG(bp)) { 2907 zio->io_flags &= ~ZIO_FLAG_NODATA; 2908 } else { 2909 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 2910 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2911 } 2912 } 2913 2914 if (zio_injection_enabled && 2915 zio->io_spa->spa_syncing_txg == zio->io_txg) 2916 zio_handle_ignored_writes(zio); 2917 2918 return (ZIO_PIPELINE_CONTINUE); 2919} 2920 2921static int 2922zio_done(zio_t *zio) 2923{ 2924 spa_t *spa = zio->io_spa; 2925 zio_t *lio = zio->io_logical; 2926 blkptr_t *bp = zio->io_bp; 2927 vdev_t *vd = zio->io_vd; 2928 uint64_t psize = zio->io_size; 2929 zio_t *pio, *pio_next; 2930 2931 /* 2932 * If our children haven't all completed, 2933 * wait for them and then repeat this pipeline stage. 
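 * (If zio_wait_for_children() reports children outstanding, we stop
 * here; the last child's zio_notify_parent() will re-dispatch this zio,
 * re-entering zio_done() from the top.)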
2934 */ 2935 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 2936 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 2937 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 2938 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 2939 return (ZIO_PIPELINE_STOP); 2940 2941 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 2942 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 2943 ASSERT(zio->io_children[c][w] == 0); 2944 2945 if (bp != NULL) { 2946 ASSERT(bp->blk_pad[0] == 0); 2947 ASSERT(bp->blk_pad[1] == 0); 2948 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 2949 (bp == zio_unique_parent(zio)->io_bp)); 2950 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 2951 zio->io_bp_override == NULL && 2952 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2953 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 2954 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 2955 ASSERT(BP_COUNT_GANG(bp) == 0 || 2956 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 2957 } 2958 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 2959 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 2960 } 2961 2962 /* 2963 * If there were child vdev/gang/ddt errors, they apply to us now. 2964 */ 2965 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 2966 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 2967 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 2968 2969 /* 2970 * If the I/O on the transformed data was successful, generate any 2971 * checksum reports now while we still have the transformed data. 2972 */ 2973 if (zio->io_error == 0) { 2974 while (zio->io_cksum_report != NULL) { 2975 zio_cksum_report_t *zcr = zio->io_cksum_report; 2976 uint64_t align = zcr->zcr_align; 2977 uint64_t asize = P2ROUNDUP(psize, align); 2978 char *abuf = zio->io_data; 2979 2980 if (asize != psize) { 2981 abuf = zio_buf_alloc(asize); 2982 bcopy(zio->io_data, abuf, psize); 2983 bzero(abuf + psize, asize - psize); 2984 } 2985 2986 zio->io_cksum_report = zcr->zcr_next; 2987 zcr->zcr_next = NULL; 2988 zcr->zcr_finish(zcr, abuf); 2989 zfs_ereport_free_checksum(zcr); 2990 2991 if (asize != psize) 2992 zio_buf_free(abuf, asize); 2993 } 2994 } 2995 2996 zio_pop_transforms(zio); /* note: may set zio->io_error */ 2997 2998 vdev_stat_update(zio, psize); 2999 3000 if (zio->io_error) { 3001 /* 3002 * If this I/O is attached to a particular vdev, 3003 * generate an error message describing the I/O failure 3004 * at the block level. We ignore these errors if the 3005 * device is currently unavailable. 3006 */ 3007 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3008 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3009 3010 if ((zio->io_error == EIO || !(zio->io_flags & 3011 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3012 zio == lio) { 3013 /* 3014 * For logical I/O requests, tell the SPA to log the 3015 * error and generate a logical data ereport. 3016 */ 3017 spa_log_error(spa, zio); 3018 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3019 0, 0); 3020 } 3021 } 3022 3023 if (zio->io_error && zio == lio) { 3024 /* 3025 * Determine whether zio should be reexecuted. This will 3026 * propagate all the way to the root via zio_notify_parent(). 
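 * (ZIO_REEXECUTE_NOW retries the I/O immediately, while
 * ZIO_REEXECUTE_SUSPEND parks it via zio_suspend() until the pool
 * resumes; the checks below choose between them based on the error
 * and the pool's failmode setting.)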
3027 */ 3028 ASSERT(vd == NULL && bp != NULL); 3029 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3030 3031 if (IO_IS_ALLOCATING(zio) && 3032 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3033 if (zio->io_error != ENOSPC) 3034 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3035 else 3036 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3037 } 3038 3039 if ((zio->io_type == ZIO_TYPE_READ || 3040 zio->io_type == ZIO_TYPE_FREE) && 3041 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3042 zio->io_error == ENXIO && 3043 spa_load_state(spa) == SPA_LOAD_NONE && 3044 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3045 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3046 3047 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3048 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3049 3050 /* 3051 * Here is a possibly good place to attempt to do 3052 * either combinatorial reconstruction or error correction 3053 * based on checksums. It also might be a good place 3054 * to send out preliminary ereports before we suspend 3055 * processing. 3056 */ 3057 } 3058 3059 /* 3060 * If there were logical child errors, they apply to us now. 3061 * We defer this until now to avoid conflating logical child 3062 * errors with errors that happened to the zio itself when 3063 * updating vdev stats and reporting FMA events above. 3064 */ 3065 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3066 3067 if ((zio->io_error || zio->io_reexecute) && 3068 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3069 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3070 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3071 3072 zio_gang_tree_free(&zio->io_gang_tree); 3073 3074 /* 3075 * Godfather I/Os should never suspend. 3076 */ 3077 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3078 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3079 zio->io_reexecute = 0; 3080 3081 if (zio->io_reexecute) { 3082 /* 3083 * This is a logical I/O that wants to reexecute. 3084 * 3085 * Reexecute is top-down. When an i/o fails, if it's not 3086 * the root, it simply notifies its parent and sticks around. 3087 * The parent, seeing that it still has children in zio_done(), 3088 * does the same. This percolates all the way up to the root. 3089 * The root i/o will reexecute or suspend the entire tree. 3090 * 3091 * This approach ensures that zio_reexecute() honors 3092 * all the original i/o dependency relationships, e.g. 3093 * parents not executing until children are ready. 3094 */ 3095 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3096 3097 zio->io_gang_leader = NULL; 3098 3099 mutex_enter(&zio->io_lock); 3100 zio->io_state[ZIO_WAIT_DONE] = 1; 3101 mutex_exit(&zio->io_lock); 3102 3103 /* 3104 * "The Godfather" I/O monitors its children but is 3105 * not a true parent to them. It will track them through 3106 * the pipeline but severs its ties whenever they get into 3107 * trouble (e.g. suspended). This allows "The Godfather" 3108 * I/O to return status without blocking. 3109 */ 3110 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3111 zio_link_t *zl = zio->io_walk_link; 3112 pio_next = zio_walk_parents(zio); 3113 3114 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3115 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3116 zio_remove_child(pio, zio, zl); 3117 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3118 } 3119 } 3120 3121 if ((pio = zio_unique_parent(zio)) != NULL) { 3122 /* 3123 * We're not a root i/o, so there's nothing to do 3124 * but notify our parent. 
Don't propagate errors 3125 * upward since we haven't permanently failed yet. 3126 */ 3127 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3128 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3129 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3130 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3131 /* 3132 * We'd fail again if we reexecuted now, so suspend 3133 * until conditions improve (e.g. device comes online). 3134 */ 3135 zio_suspend(spa, zio); 3136 } else { 3137 /* 3138 * Reexecution is potentially a huge amount of work. 3139 * Hand it off to the otherwise-unused claim taskq. 3140 */ 3141#ifdef _KERNEL 3142 (void) taskq_dispatch_safe( 3143 spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], 3144 (task_func_t *)zio_reexecute, zio, TQ_SLEEP, 3145 &zio->io_task); 3146#else 3147 (void) taskq_dispatch( 3148 spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], 3149 (task_func_t *)zio_reexecute, zio, TQ_SLEEP); 3150#endif 3151 } 3152 return (ZIO_PIPELINE_STOP); 3153 } 3154 3155 ASSERT(zio->io_child_count == 0); 3156 ASSERT(zio->io_reexecute == 0); 3157 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3158 3159 /* 3160 * Report any checksum errors, since the I/O is complete. 3161 */ 3162 while (zio->io_cksum_report != NULL) { 3163 zio_cksum_report_t *zcr = zio->io_cksum_report; 3164 zio->io_cksum_report = zcr->zcr_next; 3165 zcr->zcr_next = NULL; 3166 zcr->zcr_finish(zcr, NULL); 3167 zfs_ereport_free_checksum(zcr); 3168 } 3169 3170 /* 3171 * It is the responsibility of the done callback to ensure that this 3172 * particular zio is no longer discoverable for adoption, and as 3173 * such, cannot acquire any new parents. 3174 */ 3175 if (zio->io_done) 3176 zio->io_done(zio); 3177 3178 mutex_enter(&zio->io_lock); 3179 zio->io_state[ZIO_WAIT_DONE] = 1; 3180 mutex_exit(&zio->io_lock); 3181 3182 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3183 zio_link_t *zl = zio->io_walk_link; 3184 pio_next = zio_walk_parents(zio); 3185 zio_remove_child(pio, zio, zl); 3186 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3187 } 3188 3189 if (zio->io_waiter != NULL) { 3190 mutex_enter(&zio->io_lock); 3191 zio->io_executor = NULL; 3192 cv_broadcast(&zio->io_cv); 3193 mutex_exit(&zio->io_lock); 3194 } else { 3195 zio_destroy(zio); 3196 } 3197 3198 return (ZIO_PIPELINE_STOP); 3199} 3200 3201/* 3202 * ========================================================================== 3203 * I/O pipeline definition 3204 * ========================================================================== 3205 */ 3206static zio_pipe_stage_t *zio_pipeline[] = { 3207 NULL, 3208 zio_read_bp_init, 3209 zio_free_bp_init, 3210 zio_issue_async, 3211 zio_write_bp_init, 3212 zio_checksum_generate, 3213 zio_nop_write, 3214 zio_ddt_read_start, 3215 zio_ddt_read_done, 3216 zio_ddt_write, 3217 zio_ddt_free, 3218 zio_gang_assemble, 3219 zio_gang_issue, 3220 zio_dva_allocate, 3221 zio_dva_free, 3222 zio_dva_claim, 3223 zio_ready, 3224 zio_vdev_io_start, 3225 zio_vdev_io_done, 3226 zio_vdev_io_assess, 3227 zio_checksum_verify, 3228 zio_done 3229}; 3230 3231/* dnp is the dnode for zb1->zb_object */ 3232boolean_t 3233zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, 3234 const zbookmark_t *zb2) 3235{ 3236 uint64_t zb1nextL0, zb2thisobj; 3237 3238 ASSERT(zb1->zb_objset == zb2->zb_objset); 3239 ASSERT(zb2->zb_level == 0); 3240 3241 /* 3242 * A bookmark in the deadlist is considered to be after 3243 * everything else. 
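 * (That is, when zb2 names DMU_DEADLIST_OBJECT, any zb1 in the same
 * objset compares as being before it; the first test below implements
 * exactly that.)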
3244 */ 3245 if (zb2->zb_object == DMU_DEADLIST_OBJECT) 3246 return (B_TRUE); 3247 3248 /* The objset_phys_t isn't before anything. */ 3249 if (dnp == NULL) 3250 return (B_FALSE); 3251 3252 zb1nextL0 = (zb1->zb_blkid + 1) << 3253 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 3254 3255 zb2thisobj = zb2->zb_object ? zb2->zb_object : 3256 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 3257 3258 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3259 uint64_t nextobj = zb1nextL0 * 3260 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 3261 return (nextobj <= zb2thisobj); 3262 } 3263 3264 if (zb1->zb_object < zb2thisobj) 3265 return (B_TRUE); 3266 if (zb1->zb_object > zb2thisobj) 3267 return (B_FALSE); 3268 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 3269 return (B_FALSE); 3270 return (zb1nextL0 <= zb2->zb_blkid); 3271} 3272
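/*
 * Worked example for zbookmark_is_before(): with 16K indirect blocks
 * (dn_indblkshift == 14), each indirect level fans out by
 * 2^(14 - SPA_BLKPTRSHIFT) == 128 block pointers.  A zb1 at level 1,
 * blkid 2 therefore covers L0 blkids [256, 384), so zb1nextL0 is
 * (2 + 1) << 7 == 384, and zb1 is "before" any level-0 bookmark zb2 in
 * the same object whose zb_blkid is >= 384.
 */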