zio.c revision 253992
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
static int zio_use_uma = 0;
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
    "Use uma(9) for ZIO allocations");
static int zio_exclude_metadata = 0;
TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN,
    &zio_exclude_metadata, 0,
    "Exclude metadata buffers from dumps as well");

zio_trim_stats_t zio_trim_stats = {
	{ "bytes",		KSTAT_DATA_UINT64,
	    "Number of bytes successfully TRIMmed" },
	{ "success",		KSTAT_DATA_UINT64,
	    "Number of successful TRIM requests" },
	{ "unsupported",	KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed because TRIM is not supported" },
	{ "failed",		KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed for reasons other than not supported" },
};

static kstat_t *zio_trim_ksp;

/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW */
	0,	/* ZIO_PRIORITY_SYNC_READ */
	0,	/* ZIO_PRIORITY_SYNC_WRITE */
	0,	/* ZIO_PRIORITY_LOG_WRITE */
	1,	/* ZIO_PRIORITY_CACHE_FILL */
	1,	/* ZIO_PRIORITY_AGG */
	4,	/* ZIO_PRIORITY_FREE */
	4,	/* ZIO_PRIORITY_ASYNC_WRITE */
	6,	/* ZIO_PRIORITY_ASYNC_READ */
	10,	/* ZIO_PRIORITY_RESILVER */
	20,	/* ZIO_PRIORITY_SCRUB */
	2,	/* ZIO_PRIORITY_DDT_PREFETCH */
	30,	/* ZIO_PRIORITY_TRIM */
};
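/*
 * Illustrative note (an assumption about the vdev queue, not stated in
 * this file): these values act as deadline offsets, so a smaller number
 * means "schedule sooner".  A SYNC_READ (0) issued at the same instant
 * as a SCRUB (20) sorts ahead of it in the deadline-ordered vdev queue.
 */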
/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
extern int zfs_mg_alloc_failures;

/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

void
zio_init(void)
{
	size_t c;

	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (!zio_use_uma)
		goto out;

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

#ifdef illumos
#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
#endif /* illumos */
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
			align = PAGESIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = p2 >> 2;
		}
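		/*
		 * Worked example (illustrative): for size = 10752 (21 blocks
		 * of 512 bytes), p2 rounds down to 8192, so the quarter-power
		 * step is p2 >> 2 = 2048.  10752 is a multiple of neither
		 * PAGESIZE nor 2048, so align stays 0 and no cache is created
		 * for this size; the back-fill loop after this one maps its
		 * index to the next larger cache that does exist (here
		 * zio_buf_12288, assuming a 4K PAGESIZE).
		 */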
		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL,
			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}
out:

	/*
	 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
	 * to fail 3 times per txg or 8 failures, whichever is greater.
	 */
	if (zfs_mg_alloc_failures == 0)
		zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
	else if (zfs_mg_alloc_failures < 8)
		zfs_mg_alloc_failures = 8;

	zio_inject_init();

	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (zio_trim_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zio_trim_ksp != NULL) {
		zio_trim_ksp->ks_data = &zio_trim_stats;
		kstat_install(zio_trim_ksp);
	}
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();

	if (zio_trim_ksp != NULL) {
		kstat_delete(zio_trim_ksp);
		zio_trim_ksp = NULL;
	}
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP|flags));
}
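/*
 * Illustrative sizing example: a request for 12K maps to cache index
 * c = (12288 - 1) >> SPA_MINBLOCKSHIFT = 23, i.e. the "zio_buf_12288"
 * cache created (or back-filled) in zio_init().  Callers must free with
 * zio_buf_free() using the same size, since the size selects the cache:
 *
 *	void *buf = zio_buf_alloc(12288);
 *	...
 *	zio_buf_free(buf, 12288);
 */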
/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump.  (Thus reducing
 * the amount of kernel heap dumped to disk when the kernel panics.)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_data_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}
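/*
 * Usage sketch: a compressed read pushes a scratch buffer so the device
 * I/O lands in 'cbuf', while the logical buffer is restored (and filled
 * by zio_decompress) when the transform stack is popped at done time:
 *
 *	void *cbuf = zio_buf_alloc(psize);
 *	zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
 *
 * This mirrors zio_read_bp_init() below; a NULL transform (as used by
 * zio_write_phys()) just substitutes the buffer without a callback.
 */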
/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 *        continue calling these functions until they return NULL.
 *        Otherwise, the next caller will pick up the list walk in
 *        some indeterminate state.  (Otherwise every caller would
 *        have to pass in a cookie to keep the state represented by
 *        io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);
	if (--*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}
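/*
 * The wait/notify pair above is the pipeline's interlock.  A stage that
 * needs its children finished follows this pattern (see zio_gang_issue()
 * and zio_ddt_read_done() below):
 *
 *	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
 *		return (ZIO_PIPELINE_STOP);
 *
 * zio_wait_for_children() rewinds io_stage by one bit and records the
 * counter in io_stall; when the last child calls zio_notify_parent(),
 * the count hits zero and zio_execute() re-runs the stalled stage.
 */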
static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_prop = *zp;

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    enum zio_flag flags, zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}
void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync(), keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}

void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
		    BP_GET_PSIZE(bp), 0)));
	}
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    uint64_t size, enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	dprintf_bp(bp, "freeing in txg %llu, pass %u",
	    (longlong_t)txg, spa->spa_sync_pass);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	if (zfs_trim_enabled)
		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
		    ZIO_STAGE_VDEV_IO_ASSESS;
	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	zio = zio_create(pio, spa, txg, bp, NULL, size,
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}
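/*
 * To summarize the dispatch above: with default tunables a plain
 * (non-gang, non-dedup) block freed in the currently-syncing txg during
 * pass 1 is freed synchronously via zio_free_sync(); anything freed in
 * pass >= zfs_sync_pass_deferred_free (2), from a different txg, or that
 * would first require a read lands on the per-txg spa_free_bplist
 * instead (as does everything when TRIM is enabled).
 */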
zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
		    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    offset, size, done, private, priority, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_eck) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	return (zio);
}
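/*
 * Note on ZIO_STAGE_VDEV_IO_START >> 1 (here and in
 * zio_vdev_delegated_io() below): zio_execute() advances by shifting
 * io_stage left until it hits the next bit set in the pipeline, so
 * starting a child one bit *before* VDEV_IO_START makes that the first
 * stage it executes, skipping the open/issue stages entirely.
 */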
This pushes error 944 * detection as close to the leaves as possible and 945 * eliminates redundant checksums in the interior nodes. 946 */ 947 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 948 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 949 } 950 951 if (vd->vdev_children == 0) 952 offset += VDEV_LABEL_START_SIZE; 953 954 flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; 955 956 /* 957 * If we've decided to do a repair, the write is not speculative -- 958 * even if the original read was. 959 */ 960 if (flags & ZIO_FLAG_IO_REPAIR) 961 flags &= ~ZIO_FLAG_SPECULATIVE; 962 963 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, 964 done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 965 ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 966 967 return (zio); 968} 969 970zio_t * 971zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, 972 int type, int priority, enum zio_flag flags, 973 zio_done_func_t *done, void *private) 974{ 975 zio_t *zio; 976 977 ASSERT(vd->vdev_ops->vdev_op_leaf); 978 979 zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 980 data, size, done, private, type, priority, 981 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, 982 vd, offset, NULL, 983 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 984 985 return (zio); 986} 987 988void 989zio_flush(zio_t *zio, vdev_t *vd) 990{ 991 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, 992 NULL, NULL, ZIO_PRIORITY_NOW, 993 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 994} 995 996zio_t * 997zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) 998{ 999 1000 ASSERT(vd->vdev_ops->vdev_op_leaf); 1001 1002 return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size, 1003 NULL, NULL, ZIO_PRIORITY_TRIM, 1004 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); 1005} 1006 1007void 1008zio_shrink(zio_t *zio, uint64_t size) 1009{ 1010 ASSERT(zio->io_executor == NULL); 1011 ASSERT(zio->io_orig_size == zio->io_size); 1012 ASSERT(size <= zio->io_size); 1013 1014 /* 1015 * We don't shrink for raidz because of problems with the 1016 * reconstruction when reading back less than the block size. 1017 * Note, BP_IS_RAIDZ() assumes no compression. 
/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else {
			ASSERT(psize < lsize);
			zio_push_transform(zio, cbuf, psize, lsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
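/*
 * Putting the sync-pass tunables together with a worked example: with the
 * defaults (rewrite pass 2, dont_compress pass 5), a block rewritten in
 * pass 1 allocates fresh, compressed blocks; in passes 2-4 it is rewritten
 * in place if the compressed psize is unchanged; from pass 5 on it is
 * written uncompressed, so psize stops changing and blocks converge to
 * in-place rewrites.
 */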
static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);

	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1] != NULL)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
#ifdef _KERNEL
	(void) taskq_dispatch_safe(spa->spa_zio_taskq[t][q],
	    (task_func_t *)zio_execute, zio, flags, &zio->io_task);
#else
	(void) taskq_dispatch(spa->spa_zio_taskq[t][q],
	    (task_func_t *)zio_execute, zio, flags);
#endif
}

static boolean_t
zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++)
		if (taskq_member(spa->spa_zio_taskq[t][q], executor))
			return (B_TRUE);

	return (B_FALSE);
}

static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 *
 *	(1) the I/O completes
 *	(2) the pipeline stalls waiting for dependent child I/Os
 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
 *	(4) the I/O is delegated by vdev-level caching or aggregation
 *	(5) the I/O is deferred due to vdev-level queueing
 *	(6) the I/O is handed off to another thread.
 *
 * In all cases, the pipeline stops whenever there's no CPU work; it never
 * burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[highbit(stage) - 1](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}
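/*
 * Stage selection in zio_execute() is pure bit math: io_stage holds a
 * single stage bit and io_pipeline a mask of stage bits.  Shifting stage
 * left until (stage & pipeline) != 0 finds the next enabled stage, and
 * zio_pipeline[highbit(stage) - 1] is that stage's handler.  This is also
 * why zio_wait_for_children() can "rewind" a stage with io_stage >>= 1,
 * and why child I/Os start at ZIO_STAGE_VDEV_IO_START >> 1.
 */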
/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_zio_root "Godfather" I/O which
		 * will ensure it completes prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root, zio);
	}

	zio_execute(zio);
}
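/*
 * Typical driver of these two entry points (illustrative sketch, not
 * lifted from this file): hang several async reads off a root zio, then
 * wait once for the whole tree:
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	for (...)
 *		zio_nowait(zio_read(rio, spa, bp, buf, size, NULL, NULL,
 *		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, zb));
 *	error = zio_wait(rio);
 */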
/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_flags |= ZIO_FLAG_REEXECUTED;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */
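/*
 * Illustrative shape of a two-level gang tree (leader header with three
 * members, one of which ganged again):
 *
 *	gang leader (header, 1 sector)
 *	  +- data block
 *	  +- data block
 *	  +- gang header (1 sector)
 *	       +- data block
 *	       +- data block
 *	       +- data block
 */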
static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	if (gn != NULL)
		return (pio);

	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
	    &pio->io_bookmark));
}

zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here.  The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
		 * (Presently, nothing actually uses interior data checksums;
		 * this is just good hygiene.)
		 */
		if (gn != pio->io_gang_leader->io_gang_tree) {
			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
			    data, BP_GET_PSIZE(bp));
		}
		/*
		 * If we are here to damage data for testing purposes,
		 * leave the GBH alone so that we can detect the damage.
		 */
		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
	} else {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
	}

	return (zio);
}
/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
	    ZIO_GANG_CHILD_FLAGS(pio)));
}

/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}

static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	if (gn == gio->io_gang_tree && gio->io_data != NULL)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}

static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	zio_t *gio = zio->io_gang_leader;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}
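/*
 * The ready callback above accumulates member asizes into the parent's
 * gang DVAs so the leader's bp accounts for everything allocated under
 * it.  For example (illustrative, single copy on a plain vdev): a header
 * of one 512-byte sector plus three 32K members leaves the parent DVA
 * with an asize of 512 + 3 * 32768 bytes once all members are ready.
 */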
metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 1856 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 1857 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 1858 if (error) { 1859 pio->io_error = error; 1860 return (ZIO_PIPELINE_CONTINUE); 1861 } 1862 1863 if (pio == gio) { 1864 gnpp = &gio->io_gang_tree; 1865 } else { 1866 gnpp = pio->io_private; 1867 ASSERT(pio->io_ready == zio_write_gang_member_ready); 1868 } 1869 1870 gn = zio_gang_node_alloc(gnpp); 1871 gbh = gn->gn_gbh; 1872 bzero(gbh, SPA_GANGBLOCKSIZE); 1873 1874 /* 1875 * Create the gang header. 1876 */ 1877 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 1878 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1879 1880 /* 1881 * Create and nowait the gang children. 1882 */ 1883 for (int g = 0; resid != 0; resid -= lsize, g++) { 1884 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 1885 SPA_MINBLOCKSIZE); 1886 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 1887 1888 zp.zp_checksum = gio->io_prop.zp_checksum; 1889 zp.zp_compress = ZIO_COMPRESS_OFF; 1890 zp.zp_type = DMU_OT_NONE; 1891 zp.zp_level = 0; 1892 zp.zp_copies = gio->io_prop.zp_copies; 1893 zp.zp_dedup = B_FALSE; 1894 zp.zp_dedup_verify = B_FALSE; 1895 zp.zp_nopwrite = B_FALSE; 1896 1897 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 1898 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 1899 zio_write_gang_member_ready, NULL, &gn->gn_child[g], 1900 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1901 &pio->io_bookmark)); 1902 } 1903 1904 /* 1905 * Set pio's pipeline to just wait for zio to finish. 1906 */ 1907 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1908 1909 zio_nowait(zio); 1910 1911 return (ZIO_PIPELINE_CONTINUE); 1912} 1913 1914/* 1915 * The zio_nop_write stage in the pipeline determines if allocating 1916 * a new bp is necessary. By leveraging a cryptographically secure checksum, 1917 * such as SHA256, we can compare the checksums of the new data and the old 1918 * to determine if allocating a new block is required. The nopwrite 1919 * feature can handle writes in either syncing or open context (i.e. zil 1920 * writes) and as a result is mutually exclusive with dedup. 1921 */ 1922static int 1923zio_nop_write(zio_t *zio) 1924{ 1925 blkptr_t *bp = zio->io_bp; 1926 blkptr_t *bp_orig = &zio->io_bp_orig; 1927 zio_prop_t *zp = &zio->io_prop; 1928 1929 ASSERT(BP_GET_LEVEL(bp) == 0); 1930 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1931 ASSERT(zp->zp_nopwrite); 1932 ASSERT(!zp->zp_dedup); 1933 ASSERT(zio->io_bp_override == NULL); 1934 ASSERT(IO_IS_ALLOCATING(zio)); 1935 1936 /* 1937 * Check to see if the original bp and the new bp have matching 1938 * characteristics (i.e. same checksum, compression algorithms, etc). 1939 * If they don't then just continue with the pipeline which will 1940 * allocate a new bp. 1941 */ 1942 if (BP_IS_HOLE(bp_orig) || 1943 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || 1944 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 1945 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 1946 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 1947 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 1948 return (ZIO_PIPELINE_CONTINUE); 1949 1950 /* 1951 * If the checksums match then reset the pipeline so that we 1952 * avoid allocating a new bp and issuing any I/O. 
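 * For example, an overwrite that produces byte-identical data under a
 * dedup-strength checksum (e.g. sha256) takes this path: bp_orig is
 * copied back into bp, and the DVA-allocate and vdev I/O stages never
 * run for this zio.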
1953 */ 1954 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 1955 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); 1956 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 1957 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 1958 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 1959 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 1960 sizeof (uint64_t)) == 0); 1961 1962 *bp = *bp_orig; 1963 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1964 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1965 } 1966 1967 return (ZIO_PIPELINE_CONTINUE); 1968} 1969 1970/* 1971 * ========================================================================== 1972 * Dedup 1973 * ========================================================================== 1974 */ 1975static void 1976zio_ddt_child_read_done(zio_t *zio) 1977{ 1978 blkptr_t *bp = zio->io_bp; 1979 ddt_entry_t *dde = zio->io_private; 1980 ddt_phys_t *ddp; 1981 zio_t *pio = zio_unique_parent(zio); 1982 1983 mutex_enter(&pio->io_lock); 1984 ddp = ddt_phys_select(dde, bp); 1985 if (zio->io_error == 0) 1986 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 1987 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 1988 dde->dde_repair_data = zio->io_data; 1989 else 1990 zio_buf_free(zio->io_data, zio->io_size); 1991 mutex_exit(&pio->io_lock); 1992} 1993 1994static int 1995zio_ddt_read_start(zio_t *zio) 1996{ 1997 blkptr_t *bp = zio->io_bp; 1998 1999 ASSERT(BP_GET_DEDUP(bp)); 2000 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2001 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2002 2003 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2004 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2005 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 2006 ddt_phys_t *ddp = dde->dde_phys; 2007 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 2008 blkptr_t blk; 2009 2010 ASSERT(zio->io_vsd == NULL); 2011 zio->io_vsd = dde; 2012 2013 if (ddp_self == NULL) 2014 return (ZIO_PIPELINE_CONTINUE); 2015 2016 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2017 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 2018 continue; 2019 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2020 &blk); 2021 zio_nowait(zio_read(zio, zio->io_spa, &blk, 2022 zio_buf_alloc(zio->io_size), zio->io_size, 2023 zio_ddt_child_read_done, dde, zio->io_priority, 2024 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 2025 &zio->io_bookmark)); 2026 } 2027 return (ZIO_PIPELINE_CONTINUE); 2028 } 2029 2030 zio_nowait(zio_read(zio, zio->io_spa, bp, 2031 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 2032 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2033 2034 return (ZIO_PIPELINE_CONTINUE); 2035} 2036 2037static int 2038zio_ddt_read_done(zio_t *zio) 2039{ 2040 blkptr_t *bp = zio->io_bp; 2041 2042 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 2043 return (ZIO_PIPELINE_STOP); 2044 2045 ASSERT(BP_GET_DEDUP(bp)); 2046 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2047 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2048 2049 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2050 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2051 ddt_entry_t *dde = zio->io_vsd; 2052 if (ddt == NULL) { 2053 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2054 return (ZIO_PIPELINE_CONTINUE); 2055 } 2056 if (dde == NULL) { 2057 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2058 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2059 return (ZIO_PIPELINE_STOP); 2060 } 2061 if (dde->dde_repair_data != NULL) { 2062 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 2063 zio->io_child_error[ZIO_CHILD_DDT] = 0; 
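			/* The repaired copy was filled in by zio_ddt_child_read_done(). */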
2064 } 2065 ddt_repair_done(ddt, dde); 2066 zio->io_vsd = NULL; 2067 } 2068 2069 ASSERT(zio->io_vsd == NULL); 2070 2071 return (ZIO_PIPELINE_CONTINUE); 2072} 2073 2074static boolean_t 2075zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2076{ 2077 spa_t *spa = zio->io_spa; 2078 2079 /* 2080 * Note: we compare the original data, not the transformed data, 2081 * because when zio->io_bp is an override bp, we will not have 2082 * pushed the I/O transforms. That's an important optimization 2083 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2084 */ 2085 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2086 zio_t *lio = dde->dde_lead_zio[p]; 2087 2088 if (lio != NULL) { 2089 return (lio->io_orig_size != zio->io_orig_size || 2090 bcmp(zio->io_orig_data, lio->io_orig_data, 2091 zio->io_orig_size) != 0); 2092 } 2093 } 2094 2095 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2096 ddt_phys_t *ddp = &dde->dde_phys[p]; 2097 2098 if (ddp->ddp_phys_birth != 0) { 2099 arc_buf_t *abuf = NULL; 2100 uint32_t aflags = ARC_WAIT; 2101 blkptr_t blk = *zio->io_bp; 2102 int error; 2103 2104 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2105 2106 ddt_exit(ddt); 2107 2108 error = arc_read(NULL, spa, &blk, 2109 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2110 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2111 &aflags, &zio->io_bookmark); 2112 2113 if (error == 0) { 2114 if (arc_buf_size(abuf) != zio->io_orig_size || 2115 bcmp(abuf->b_data, zio->io_orig_data, 2116 zio->io_orig_size) != 0) 2117 error = SET_ERROR(EEXIST); 2118 VERIFY(arc_buf_remove_ref(abuf, &abuf)); 2119 } 2120 2121 ddt_enter(ddt); 2122 return (error != 0); 2123 } 2124 } 2125 2126 return (B_FALSE); 2127} 2128 2129static void 2130zio_ddt_child_write_ready(zio_t *zio) 2131{ 2132 int p = zio->io_prop.zp_copies; 2133 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2134 ddt_entry_t *dde = zio->io_private; 2135 ddt_phys_t *ddp = &dde->dde_phys[p]; 2136 zio_t *pio; 2137 2138 if (zio->io_error) 2139 return; 2140 2141 ddt_enter(ddt); 2142 2143 ASSERT(dde->dde_lead_zio[p] == zio); 2144 2145 ddt_phys_fill(ddp, zio->io_bp); 2146 2147 while ((pio = zio_walk_parents(zio)) != NULL) 2148 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2149 2150 ddt_exit(ddt); 2151} 2152 2153static void 2154zio_ddt_child_write_done(zio_t *zio) 2155{ 2156 int p = zio->io_prop.zp_copies; 2157 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2158 ddt_entry_t *dde = zio->io_private; 2159 ddt_phys_t *ddp = &dde->dde_phys[p]; 2160 2161 ddt_enter(ddt); 2162 2163 ASSERT(ddp->ddp_refcnt == 0); 2164 ASSERT(dde->dde_lead_zio[p] == zio); 2165 dde->dde_lead_zio[p] = NULL; 2166 2167 if (zio->io_error == 0) { 2168 while (zio_walk_parents(zio) != NULL) 2169 ddt_phys_addref(ddp); 2170 } else { 2171 ddt_phys_clear(ddp); 2172 } 2173 2174 ddt_exit(ddt); 2175} 2176 2177static void 2178zio_ddt_ditto_write_done(zio_t *zio) 2179{ 2180 int p = DDT_PHYS_DITTO; 2181 zio_prop_t *zp = &zio->io_prop; 2182 blkptr_t *bp = zio->io_bp; 2183 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2184 ddt_entry_t *dde = zio->io_private; 2185 ddt_phys_t *ddp = &dde->dde_phys[p]; 2186 ddt_key_t *ddk = &dde->dde_key; 2187 2188 ddt_enter(ddt); 2189 2190 ASSERT(ddp->ddp_refcnt == 0); 2191 ASSERT(dde->dde_lead_zio[p] == zio); 2192 dde->dde_lead_zio[p] = NULL; 2193 2194 if (zio->io_error == 0) { 2195 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2196 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2197 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2198 if 
(ddp->ddp_phys_birth != 0) 2199 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2200 ddt_phys_fill(ddp, bp); 2201 } 2202 2203 ddt_exit(ddt); 2204} 2205 2206static int 2207zio_ddt_write(zio_t *zio) 2208{ 2209 spa_t *spa = zio->io_spa; 2210 blkptr_t *bp = zio->io_bp; 2211 uint64_t txg = zio->io_txg; 2212 zio_prop_t *zp = &zio->io_prop; 2213 int p = zp->zp_copies; 2214 int ditto_copies; 2215 zio_t *cio = NULL; 2216 zio_t *dio = NULL; 2217 ddt_t *ddt = ddt_select(spa, bp); 2218 ddt_entry_t *dde; 2219 ddt_phys_t *ddp; 2220 2221 ASSERT(BP_GET_DEDUP(bp)); 2222 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2223 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2224 2225 ddt_enter(ddt); 2226 dde = ddt_lookup(ddt, bp, B_TRUE); 2227 ddp = &dde->dde_phys[p]; 2228 2229 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2230 /* 2231 * If we're using a weak checksum, upgrade to a strong checksum 2232 * and try again. If we're already using a strong checksum, 2233 * we can't resolve it, so just convert to an ordinary write. 2234 * (And automatically e-mail a paper to Nature?) 2235 */ 2236 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 2237 zp->zp_checksum = spa_dedup_checksum(spa); 2238 zio_pop_transforms(zio); 2239 zio->io_stage = ZIO_STAGE_OPEN; 2240 BP_ZERO(bp); 2241 } else { 2242 zp->zp_dedup = B_FALSE; 2243 } 2244 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2245 ddt_exit(ddt); 2246 return (ZIO_PIPELINE_CONTINUE); 2247 } 2248 2249 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2250 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2251 2252 if (ditto_copies > ddt_ditto_copies_present(dde) && 2253 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2254 zio_prop_t czp = *zp; 2255 2256 czp.zp_copies = ditto_copies; 2257 2258 /* 2259 * If we arrived here with an override bp, we won't have run 2260 * the transform stack, so we won't have the data we need to 2261 * generate a child i/o. So, toss the override bp and restart. 2262 * This is safe, because using the override bp is just an 2263 * optimization; and it's rare, so the cost doesn't matter. 
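		 * (After the restart, the ordinary write pipeline recompresses
		 * and re-checksums the data, rebuilding the transform stack we
		 * are missing here.)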
2264 */ 2265 if (zio->io_bp_override) { 2266 zio_pop_transforms(zio); 2267 zio->io_stage = ZIO_STAGE_OPEN; 2268 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2269 zio->io_bp_override = NULL; 2270 BP_ZERO(bp); 2271 ddt_exit(ddt); 2272 return (ZIO_PIPELINE_CONTINUE); 2273 } 2274 2275 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2276 zio->io_orig_size, &czp, NULL, 2277 zio_ddt_ditto_write_done, dde, zio->io_priority, 2278 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2279 2280 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2281 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2282 } 2283 2284 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2285 if (ddp->ddp_phys_birth != 0) 2286 ddt_bp_fill(ddp, bp, txg); 2287 if (dde->dde_lead_zio[p] != NULL) 2288 zio_add_child(zio, dde->dde_lead_zio[p]); 2289 else 2290 ddt_phys_addref(ddp); 2291 } else if (zio->io_bp_override) { 2292 ASSERT(bp->blk_birth == txg); 2293 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2294 ddt_phys_fill(ddp, bp); 2295 ddt_phys_addref(ddp); 2296 } else { 2297 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2298 zio->io_orig_size, zp, zio_ddt_child_write_ready, 2299 zio_ddt_child_write_done, dde, zio->io_priority, 2300 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2301 2302 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2303 dde->dde_lead_zio[p] = cio; 2304 } 2305 2306 ddt_exit(ddt); 2307 2308 if (cio) 2309 zio_nowait(cio); 2310 if (dio) 2311 zio_nowait(dio); 2312 2313 return (ZIO_PIPELINE_CONTINUE); 2314} 2315 2316ddt_entry_t *freedde; /* for debugging */ 2317 2318static int 2319zio_ddt_free(zio_t *zio) 2320{ 2321 spa_t *spa = zio->io_spa; 2322 blkptr_t *bp = zio->io_bp; 2323 ddt_t *ddt = ddt_select(spa, bp); 2324 ddt_entry_t *dde; 2325 ddt_phys_t *ddp; 2326 2327 ASSERT(BP_GET_DEDUP(bp)); 2328 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2329 2330 ddt_enter(ddt); 2331 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2332 ddp = ddt_phys_select(dde, bp); 2333 ddt_phys_decref(ddp); 2334 ddt_exit(ddt); 2335 2336 return (ZIO_PIPELINE_CONTINUE); 2337} 2338 2339/* 2340 * ========================================================================== 2341 * Allocate and free blocks 2342 * ========================================================================== 2343 */ 2344static int 2345zio_dva_allocate(zio_t *zio) 2346{ 2347 spa_t *spa = zio->io_spa; 2348 metaslab_class_t *mc = spa_normal_class(spa); 2349 blkptr_t *bp = zio->io_bp; 2350 int error; 2351 int flags = 0; 2352 2353 if (zio->io_gang_leader == NULL) { 2354 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2355 zio->io_gang_leader = zio; 2356 } 2357 2358 ASSERT(BP_IS_HOLE(bp)); 2359 ASSERT0(BP_GET_NDVAS(bp)); 2360 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2361 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2362 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2363 2364 /* 2365 * The dump device does not support gang blocks so allocation on 2366 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2367 * the "fast" gang feature. 2368 */ 2369 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2370 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
2371 METASLAB_GANG_CHILD : 0; 2372 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2373 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2374 2375 if (error) { 2376 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2377 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2378 error); 2379 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2380 return (zio_write_gang_block(zio)); 2381 zio->io_error = error; 2382 } 2383 2384 return (ZIO_PIPELINE_CONTINUE); 2385} 2386 2387static int 2388zio_dva_free(zio_t *zio) 2389{ 2390 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2391 2392 return (ZIO_PIPELINE_CONTINUE); 2393} 2394 2395static int 2396zio_dva_claim(zio_t *zio) 2397{ 2398 int error; 2399 2400 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2401 if (error) 2402 zio->io_error = error; 2403 2404 return (ZIO_PIPELINE_CONTINUE); 2405} 2406 2407/* 2408 * Undo an allocation. This is used by zio_done() when an I/O fails 2409 * and we want to give back the block we just allocated. 2410 * This handles both normal blocks and gang blocks. 2411 */ 2412static void 2413zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2414{ 2415 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2416 ASSERT(zio->io_bp_override == NULL); 2417 2418 if (!BP_IS_HOLE(bp)) 2419 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2420 2421 if (gn != NULL) { 2422 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2423 zio_dva_unallocate(zio, gn->gn_child[g], 2424 &gn->gn_gbh->zg_blkptr[g]); 2425 } 2426 } 2427} 2428 2429/* 2430 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2431 */ 2432int 2433zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2434 uint64_t size, boolean_t use_slog) 2435{ 2436 int error = 1; 2437 2438 ASSERT(txg > spa_syncing_txg(spa)); 2439 2440 /* 2441 * ZIL blocks are always contiguous (i.e. not gang blocks) so we 2442 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2443 * when allocating them. 2444 */ 2445 if (use_slog) { 2446 error = metaslab_alloc(spa, spa_log_class(spa), size, 2447 new_bp, 1, txg, old_bp, 2448 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2449 } 2450 2451 if (error) { 2452 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2453 new_bp, 1, txg, old_bp, 2454 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2455 } 2456 2457 if (error == 0) { 2458 BP_SET_LSIZE(new_bp, size); 2459 BP_SET_PSIZE(new_bp, size); 2460 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2461 BP_SET_CHECKSUM(new_bp, 2462 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2463 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2464 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2465 BP_SET_LEVEL(new_bp, 0); 2466 BP_SET_DEDUP(new_bp, 0); 2467 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2468 } 2469 2470 return (error); 2471} 2472 2473/* 2474 * Free an intent log block. 
2475 */ 2476void 2477zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2478{ 2479 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2480 ASSERT(!BP_IS_GANG(bp)); 2481 2482 zio_free(spa, txg, bp); 2483} 2484 2485/* 2486 * ========================================================================== 2487 * Read, write and delete to physical devices 2488 * ========================================================================== 2489 */ 2490static int 2491zio_vdev_io_start(zio_t *zio) 2492{ 2493 vdev_t *vd = zio->io_vd; 2494 uint64_t align; 2495 spa_t *spa = zio->io_spa; 2496 2497 ASSERT(zio->io_error == 0); 2498 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2499 2500 if (vd == NULL) { 2501 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2502 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2503 2504 /* 2505 * The mirror_ops handle multiple DVAs in a single BP. 2506 */ 2507 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2508 } 2509 2510 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) { 2511 trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); 2512 return (ZIO_PIPELINE_CONTINUE); 2513 } 2514 2515 /* 2516 * We keep track of time-sensitive I/Os so that the scan thread 2517 * can quickly react to certain workloads. In particular, we care 2518 * about non-scrubbing, top-level reads and writes with the following 2519 * characteristics: 2520 * - synchronous writes of user data to non-slog devices 2521 * - any reads of user data 2522 * When these conditions are met, adjust the timestamp of spa_last_io 2523 * which allows the scan thread to adjust its workload accordingly. 2524 */ 2525 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2526 vd == vd->vdev_top && !vd->vdev_islog && 2527 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2528 zio->io_txg != spa_syncing_txg(spa)) { 2529 uint64_t old = spa->spa_last_io; 2530 uint64_t new = ddi_get_lbolt64(); 2531 if (old != new) 2532 (void) atomic_cas_64(&spa->spa_last_io, old, new); 2533 } 2534 2535 align = 1ULL << vd->vdev_top->vdev_ashift; 2536 2537 if (P2PHASE(zio->io_size, align) != 0) { 2538 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2539 char *abuf = NULL; 2540 if (zio->io_type == ZIO_TYPE_READ || 2541 zio->io_type == ZIO_TYPE_WRITE) 2542 abuf = zio_buf_alloc(asize); 2543 ASSERT(vd == vd->vdev_top); 2544 if (zio->io_type == ZIO_TYPE_WRITE) { 2545 bcopy(zio->io_data, abuf, zio->io_size); 2546 bzero(abuf + zio->io_size, asize - zio->io_size); 2547 } 2548 zio_push_transform(zio, abuf, asize, abuf ? asize : 0, 2549 zio_subblock); 2550 } 2551 2552 ASSERT(P2PHASE(zio->io_offset, align) == 0); 2553 ASSERT(P2PHASE(zio->io_size, align) == 0); 2554 VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); 2555 2556 /* 2557 * If this is a repair I/O, and there's no self-healing involved -- 2558 * that is, we're just resilvering what we expect to resilver -- 2559 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2560 * This prevents spurious resilvering with nested replication. 2561 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2562 * A is out of date, we'll read from C+D, then use the data to 2563 * resilver A+B -- but we don't actually want to resilver B, just A. 2564 * The top-level mirror has no way to know this, so instead we just 2565 * discard unnecessary repairs as we work our way down the vdev tree. 2566 * The same logic applies to any form of nested replication: 2567 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 
2568 */ 2569 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2570 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2571 zio->io_txg != 0 && /* not a delegated i/o */ 2572 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2573 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2574 zio_vdev_io_bypass(zio); 2575 return (ZIO_PIPELINE_CONTINUE); 2576 } 2577 2578 if (vd->vdev_ops->vdev_op_leaf && 2579 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2580 2581 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 2582 return (ZIO_PIPELINE_CONTINUE); 2583 2584 if ((zio = vdev_queue_io(zio)) == NULL) 2585 return (ZIO_PIPELINE_STOP); 2586 2587 if (!vdev_accessible(vd, zio)) { 2588 zio->io_error = SET_ERROR(ENXIO); 2589 zio_interrupt(zio); 2590 return (ZIO_PIPELINE_STOP); 2591 } 2592 } 2593 2594 /* 2595 * Note that we ignore repair writes for TRIM because they can conflict 2596 * with normal writes. This isn't an issue because, by definition, we 2597 * only repair blocks that aren't freed. 2598 */ 2599 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE && 2600 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2601 if (!trim_map_write_start(zio)) 2602 return (ZIO_PIPELINE_STOP); 2603 } 2604 2605 return (vd->vdev_ops->vdev_op_io_start(zio)); 2606} 2607 2608static int 2609zio_vdev_io_done(zio_t *zio) 2610{ 2611 vdev_t *vd = zio->io_vd; 2612 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 2613 boolean_t unexpected_error = B_FALSE; 2614 2615 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2616 return (ZIO_PIPELINE_STOP); 2617 2618 ASSERT(zio->io_type == ZIO_TYPE_READ || 2619 zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); 2620 2621 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2622 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2623 2624 if (zio->io_type == ZIO_TYPE_WRITE && 2625 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 2626 trim_map_write_done(zio); 2627 2628 vdev_queue_io_done(zio); 2629 2630 if (zio->io_type == ZIO_TYPE_WRITE) 2631 vdev_cache_write(zio); 2632 2633 if (zio_injection_enabled && zio->io_error == 0) 2634 zio->io_error = zio_handle_device_injection(vd, 2635 zio, EIO); 2636 2637 if (zio_injection_enabled && zio->io_error == 0) 2638 zio->io_error = zio_handle_label_injection(zio, EIO); 2639 2640 if (zio->io_error) { 2641 if (!vdev_accessible(vd, zio)) { 2642 zio->io_error = SET_ERROR(ENXIO); 2643 } else { 2644 unexpected_error = B_TRUE; 2645 } 2646 } 2647 } 2648 2649 ops->vdev_op_io_done(zio); 2650 2651 if (unexpected_error) 2652 VERIFY(vdev_probe(vd, zio) == NULL); 2653 2654 return (ZIO_PIPELINE_CONTINUE); 2655} 2656 2657/* 2658 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2659 * disk, and use that to finish the checksum ereport later. 
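 * (RAID-Z needs more context to reconstruct the expected data, so it
 * installs its own vsd ops instead of this default.)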
2660 */ 2661static void 2662zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2663 const void *good_buf) 2664{ 2665 /* no processing needed */ 2666 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2667} 2668 2669/*ARGSUSED*/ 2670void 2671zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2672{ 2673 void *buf = zio_buf_alloc(zio->io_size); 2674 2675 bcopy(zio->io_data, buf, zio->io_size); 2676 2677 zcr->zcr_cbinfo = zio->io_size; 2678 zcr->zcr_cbdata = buf; 2679 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2680 zcr->zcr_free = zio_buf_free; 2681} 2682 2683static int 2684zio_vdev_io_assess(zio_t *zio) 2685{ 2686 vdev_t *vd = zio->io_vd; 2687 2688 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2689 return (ZIO_PIPELINE_STOP); 2690 2691 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2692 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2693 2694 if (zio->io_vsd != NULL) { 2695 zio->io_vsd_ops->vsd_free(zio); 2696 zio->io_vsd = NULL; 2697 } 2698 2699 if (zio_injection_enabled && zio->io_error == 0) 2700 zio->io_error = zio_handle_fault_injection(zio, EIO); 2701 2702 if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM) 2703 switch (zio->io_error) { 2704 case 0: 2705 ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 2706 ZIO_TRIM_STAT_BUMP(success); 2707 break; 2708 case EOPNOTSUPP: 2709 ZIO_TRIM_STAT_BUMP(unsupported); 2710 break; 2711 default: 2712 ZIO_TRIM_STAT_BUMP(failed); 2713 break; 2714 } 2715 2716 /* 2717 * If the I/O failed, determine whether we should attempt to retry it. 2718 * 2719 * On retry, we cut in line in the issue queue, since we don't want 2720 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 2721 */ 2722 if (zio->io_error && vd == NULL && 2723 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2724 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2725 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2726 zio->io_error = 0; 2727 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2728 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2729 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2730 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2731 zio_requeue_io_start_cut_in_line); 2732 return (ZIO_PIPELINE_STOP); 2733 } 2734 2735 /* 2736 * If we got an error on a leaf device, convert it to ENXIO 2737 * if the device is not accessible at all. 2738 */ 2739 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2740 !vdev_accessible(vd, zio)) 2741 zio->io_error = SET_ERROR(ENXIO); 2742 2743 /* 2744 * If we can't write to an interior vdev (mirror or RAID-Z), 2745 * set vdev_cant_write so that we stop trying to allocate from it. 
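	 * (Writes to a mirror or RAID-Z fail with ENXIO only when too many
	 * children are unavailable for the write to succeed, so further
	 * allocations there would fail the same way.)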
2746 */ 2747 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2748 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 2749 vd->vdev_cant_write = B_TRUE; 2750 } 2751 2752 if (zio->io_error) 2753 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2754 2755 return (ZIO_PIPELINE_CONTINUE); 2756} 2757 2758void 2759zio_vdev_io_reissue(zio_t *zio) 2760{ 2761 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2762 ASSERT(zio->io_error == 0); 2763 2764 zio->io_stage >>= 1; 2765} 2766 2767void 2768zio_vdev_io_redone(zio_t *zio) 2769{ 2770 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2771 2772 zio->io_stage >>= 1; 2773} 2774 2775void 2776zio_vdev_io_bypass(zio_t *zio) 2777{ 2778 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2779 ASSERT(zio->io_error == 0); 2780 2781 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2782 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2783} 2784 2785/* 2786 * ========================================================================== 2787 * Generate and verify checksums 2788 * ========================================================================== 2789 */ 2790static int 2791zio_checksum_generate(zio_t *zio) 2792{ 2793 blkptr_t *bp = zio->io_bp; 2794 enum zio_checksum checksum; 2795 2796 if (bp == NULL) { 2797 /* 2798 * This is zio_write_phys(). 2799 * We're either generating a label checksum, or none at all. 2800 */ 2801 checksum = zio->io_prop.zp_checksum; 2802 2803 if (checksum == ZIO_CHECKSUM_OFF) 2804 return (ZIO_PIPELINE_CONTINUE); 2805 2806 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 2807 } else { 2808 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 2809 ASSERT(!IO_IS_ALLOCATING(zio)); 2810 checksum = ZIO_CHECKSUM_GANG_HEADER; 2811 } else { 2812 checksum = BP_GET_CHECKSUM(bp); 2813 } 2814 } 2815 2816 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2817 2818 return (ZIO_PIPELINE_CONTINUE); 2819} 2820 2821static int 2822zio_checksum_verify(zio_t *zio) 2823{ 2824 zio_bad_cksum_t info; 2825 blkptr_t *bp = zio->io_bp; 2826 int error; 2827 2828 ASSERT(zio->io_vd != NULL); 2829 2830 if (bp == NULL) { 2831 /* 2832 * This is zio_read_phys(). 2833 * We're either verifying a label checksum, or nothing at all. 2834 */ 2835 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 2836 return (ZIO_PIPELINE_CONTINUE); 2837 2838 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 2839 } 2840 2841 if ((error = zio_checksum_error(zio, &info)) != 0) { 2842 zio->io_error = error; 2843 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2844 zfs_ereport_start_checksum(zio->io_spa, 2845 zio->io_vd, zio, zio->io_offset, 2846 zio->io_size, NULL, &info); 2847 } 2848 } 2849 2850 return (ZIO_PIPELINE_CONTINUE); 2851} 2852 2853/* 2854 * Called by RAID-Z to ensure we don't compute the checksum twice. 2855 */ 2856void 2857zio_checksum_verified(zio_t *zio) 2858{ 2859 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2860} 2861 2862/* 2863 * ========================================================================== 2864 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 2865 * An error of 0 indicates success. ENXIO indicates whole-device failure, 2866 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO 2867 * indicate errors that are specific to one I/O, and most likely permanent. 2868 * Any other error is presumed to be worse because we weren't expecting it.
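 * For example, zio_worst_error(ECKSUM, ENXIO) returns ECKSUM, and an
 * errno that is missing from the rank table below (e.g. EINVAL)
 * outranks all of the listed ones.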
2869 * ========================================================================== 2870 */ 2871int 2872zio_worst_error(int e1, int e2) 2873{ 2874 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 2875 int r1, r2; 2876 2877 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 2878 if (e1 == zio_error_rank[r1]) 2879 break; 2880 2881 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 2882 if (e2 == zio_error_rank[r2]) 2883 break; 2884 2885 return (r1 > r2 ? e1 : e2); 2886} 2887 2888/* 2889 * ========================================================================== 2890 * I/O completion 2891 * ========================================================================== 2892 */ 2893static int 2894zio_ready(zio_t *zio) 2895{ 2896 blkptr_t *bp = zio->io_bp; 2897 zio_t *pio, *pio_next; 2898 2899 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 2900 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 2901 return (ZIO_PIPELINE_STOP); 2902 2903 if (zio->io_ready) { 2904 ASSERT(IO_IS_ALLOCATING(zio)); 2905 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 2906 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 2907 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 2908 2909 zio->io_ready(zio); 2910 } 2911 2912 if (bp != NULL && bp != &zio->io_bp_copy) 2913 zio->io_bp_copy = *bp; 2914 2915 if (zio->io_error) 2916 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2917 2918 mutex_enter(&zio->io_lock); 2919 zio->io_state[ZIO_WAIT_READY] = 1; 2920 pio = zio_walk_parents(zio); 2921 mutex_exit(&zio->io_lock); 2922 2923 /* 2924 * As we notify zio's parents, new parents could be added. 2925 * New parents go to the head of zio's io_parent_list, however, 2926 * so we will (correctly) not notify them. The remainder of zio's 2927 * io_parent_list, from 'pio_next' onward, cannot change because 2928 * all parents must wait for us to be done before they can be done. 2929 */ 2930 for (; pio != NULL; pio = pio_next) { 2931 pio_next = zio_walk_parents(zio); 2932 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 2933 } 2934 2935 if (zio->io_flags & ZIO_FLAG_NODATA) { 2936 if (BP_IS_GANG(bp)) { 2937 zio->io_flags &= ~ZIO_FLAG_NODATA; 2938 } else { 2939 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 2940 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2941 } 2942 } 2943 2944 if (zio_injection_enabled && 2945 zio->io_spa->spa_syncing_txg == zio->io_txg) 2946 zio_handle_ignored_writes(zio); 2947 2948 return (ZIO_PIPELINE_CONTINUE); 2949} 2950 2951static int 2952zio_done(zio_t *zio) 2953{ 2954 spa_t *spa = zio->io_spa; 2955 zio_t *lio = zio->io_logical; 2956 blkptr_t *bp = zio->io_bp; 2957 vdev_t *vd = zio->io_vd; 2958 uint64_t psize = zio->io_size; 2959 zio_t *pio, *pio_next; 2960 2961 /* 2962 * If our children haven't all completed, 2963 * wait for them and then repeat this pipeline stage. 
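	 * (zio_wait_for_children() marks this zio as stalled on the awaited
	 * child count; when the last child completes, zio_notify_parent()
	 * re-dispatches this stage.)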
2964 */ 2965 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 2966 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 2967 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 2968 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 2969 return (ZIO_PIPELINE_STOP); 2970 2971 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 2972 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 2973 ASSERT(zio->io_children[c][w] == 0); 2974 2975 if (bp != NULL) { 2976 ASSERT(bp->blk_pad[0] == 0); 2977 ASSERT(bp->blk_pad[1] == 0); 2978 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 2979 (bp == zio_unique_parent(zio)->io_bp)); 2980 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 2981 zio->io_bp_override == NULL && 2982 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2983 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 2984 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 2985 ASSERT(BP_COUNT_GANG(bp) == 0 || 2986 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 2987 } 2988 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 2989 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 2990 } 2991 2992 /* 2993 * If there were child vdev/gang/ddt errors, they apply to us now. 2994 */ 2995 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 2996 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 2997 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 2998 2999 /* 3000 * If the I/O on the transformed data was successful, generate any 3001 * checksum reports now while we still have the transformed data. 3002 */ 3003 if (zio->io_error == 0) { 3004 while (zio->io_cksum_report != NULL) { 3005 zio_cksum_report_t *zcr = zio->io_cksum_report; 3006 uint64_t align = zcr->zcr_align; 3007 uint64_t asize = P2ROUNDUP(psize, align); 3008 char *abuf = zio->io_data; 3009 3010 if (asize != psize) { 3011 abuf = zio_buf_alloc(asize); 3012 bcopy(zio->io_data, abuf, psize); 3013 bzero(abuf + psize, asize - psize); 3014 } 3015 3016 zio->io_cksum_report = zcr->zcr_next; 3017 zcr->zcr_next = NULL; 3018 zcr->zcr_finish(zcr, abuf); 3019 zfs_ereport_free_checksum(zcr); 3020 3021 if (asize != psize) 3022 zio_buf_free(abuf, asize); 3023 } 3024 } 3025 3026 zio_pop_transforms(zio); /* note: may set zio->io_error */ 3027 3028 vdev_stat_update(zio, psize); 3029 3030 if (zio->io_error) { 3031 /* 3032 * If this I/O is attached to a particular vdev, 3033 * generate an error message describing the I/O failure 3034 * at the block level. We ignore these errors if the 3035 * device is currently unavailable. 3036 */ 3037 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3038 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3039 3040 if ((zio->io_error == EIO || !(zio->io_flags & 3041 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3042 zio == lio) { 3043 /* 3044 * For logical I/O requests, tell the SPA to log the 3045 * error and generate a logical data ereport. 3046 */ 3047 spa_log_error(spa, zio); 3048 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3049 0, 0); 3050 } 3051 } 3052 3053 if (zio->io_error && zio == lio) { 3054 /* 3055 * Determine whether zio should be reexecuted. This will 3056 * propagate all the way to the root via zio_notify_parent(). 
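		 * (ZIO_REEXECUTE_NOW retries the i/o immediately via the
		 * taskq below; ZIO_REEXECUTE_SUSPEND parks it until the pool
		 * is resumed, e.g. by "zpool clear".)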
3057 */ 3058 ASSERT(vd == NULL && bp != NULL); 3059 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3060 3061 if (IO_IS_ALLOCATING(zio) && 3062 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3063 if (zio->io_error != ENOSPC) 3064 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3065 else 3066 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3067 } 3068 3069 if ((zio->io_type == ZIO_TYPE_READ || 3070 zio->io_type == ZIO_TYPE_FREE) && 3071 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3072 zio->io_error == ENXIO && 3073 spa_load_state(spa) == SPA_LOAD_NONE && 3074 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3075 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3076 3077 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3078 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3079 3080 /* 3081 * Here is a possibly good place to attempt to do 3082 * either combinatorial reconstruction or error correction 3083 * based on checksums. It also might be a good place 3084 * to send out preliminary ereports before we suspend 3085 * processing. 3086 */ 3087 } 3088 3089 /* 3090 * If there were logical child errors, they apply to us now. 3091 * We defer this until now to avoid conflating logical child 3092 * errors with errors that happened to the zio itself when 3093 * updating vdev stats and reporting FMA events above. 3094 */ 3095 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3096 3097 if ((zio->io_error || zio->io_reexecute) && 3098 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3099 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3100 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3101 3102 zio_gang_tree_free(&zio->io_gang_tree); 3103 3104 /* 3105 * Godfather I/Os should never suspend. 3106 */ 3107 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3108 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3109 zio->io_reexecute = 0; 3110 3111 if (zio->io_reexecute) { 3112 /* 3113 * This is a logical I/O that wants to reexecute. 3114 * 3115 * Reexecute is top-down. When an i/o fails, if it's not 3116 * the root, it simply notifies its parent and sticks around. 3117 * The parent, seeing that it still has children in zio_done(), 3118 * does the same. This percolates all the way up to the root. 3119 * The root i/o will reexecute or suspend the entire tree. 3120 * 3121 * This approach ensures that zio_reexecute() honors 3122 * all the original i/o dependency relationships, e.g. 3123 * parents not executing until children are ready. 3124 */ 3125 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3126 3127 zio->io_gang_leader = NULL; 3128 3129 mutex_enter(&zio->io_lock); 3130 zio->io_state[ZIO_WAIT_DONE] = 1; 3131 mutex_exit(&zio->io_lock); 3132 3133 /* 3134 * "The Godfather" I/O monitors its children but is 3135 * not a true parent to them. It will track them through 3136 * the pipeline but severs its ties whenever they get into 3137 * trouble (e.g. suspended). This allows "The Godfather" 3138 * I/O to return status without blocking. 3139 */ 3140 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3141 zio_link_t *zl = zio->io_walk_link; 3142 pio_next = zio_walk_parents(zio); 3143 3144 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3145 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3146 zio_remove_child(pio, zio, zl); 3147 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3148 } 3149 } 3150 3151 if ((pio = zio_unique_parent(zio)) != NULL) { 3152 /* 3153 * We're not a root i/o, so there's nothing to do 3154 * but notify our parent. 
Don't propagate errors 3155 * upward since we haven't permanently failed yet. 3156 */ 3157 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3158 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3159 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3160 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3161 /* 3162 * We'd fail again if we reexecuted now, so suspend 3163 * until conditions improve (e.g. device comes online). 3164 */ 3165 zio_suspend(spa, zio); 3166 } else { 3167 /* 3168 * Reexecution is potentially a huge amount of work. 3169 * Hand it off to the otherwise-unused claim taskq. 3170 */ 3171#ifdef _KERNEL 3172 (void) taskq_dispatch_safe( 3173 spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], 3174 (task_func_t *)zio_reexecute, zio, TQ_SLEEP, 3175 &zio->io_task); 3176#else 3177 (void) taskq_dispatch( 3178 spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], 3179 (task_func_t *)zio_reexecute, zio, TQ_SLEEP); 3180#endif 3181 } 3182 return (ZIO_PIPELINE_STOP); 3183 } 3184 3185 ASSERT(zio->io_child_count == 0); 3186 ASSERT(zio->io_reexecute == 0); 3187 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3188 3189 /* 3190 * Report any checksum errors, since the I/O is complete. 3191 */ 3192 while (zio->io_cksum_report != NULL) { 3193 zio_cksum_report_t *zcr = zio->io_cksum_report; 3194 zio->io_cksum_report = zcr->zcr_next; 3195 zcr->zcr_next = NULL; 3196 zcr->zcr_finish(zcr, NULL); 3197 zfs_ereport_free_checksum(zcr); 3198 } 3199 3200 /* 3201 * It is the responsibility of the done callback to ensure that this 3202 * particular zio is no longer discoverable for adoption, and as 3203 * such, cannot acquire any new parents. 3204 */ 3205 if (zio->io_done) 3206 zio->io_done(zio); 3207 3208 mutex_enter(&zio->io_lock); 3209 zio->io_state[ZIO_WAIT_DONE] = 1; 3210 mutex_exit(&zio->io_lock); 3211 3212 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3213 zio_link_t *zl = zio->io_walk_link; 3214 pio_next = zio_walk_parents(zio); 3215 zio_remove_child(pio, zio, zl); 3216 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3217 } 3218 3219 if (zio->io_waiter != NULL) { 3220 mutex_enter(&zio->io_lock); 3221 zio->io_executor = NULL; 3222 cv_broadcast(&zio->io_cv); 3223 mutex_exit(&zio->io_lock); 3224 } else { 3225 zio_destroy(zio); 3226 } 3227 3228 return (ZIO_PIPELINE_STOP); 3229} 3230 3231/* 3232 * ========================================================================== 3233 * I/O pipeline definition 3234 * ========================================================================== 3235 */ 3236static zio_pipe_stage_t *zio_pipeline[] = { 3237 NULL, 3238 zio_read_bp_init, 3239 zio_free_bp_init, 3240 zio_issue_async, 3241 zio_write_bp_init, 3242 zio_checksum_generate, 3243 zio_nop_write, 3244 zio_ddt_read_start, 3245 zio_ddt_read_done, 3246 zio_ddt_write, 3247 zio_ddt_free, 3248 zio_gang_assemble, 3249 zio_gang_issue, 3250 zio_dva_allocate, 3251 zio_dva_free, 3252 zio_dva_claim, 3253 zio_ready, 3254 zio_vdev_io_start, 3255 zio_vdev_io_done, 3256 zio_vdev_io_assess, 3257 zio_checksum_verify, 3258 zio_done 3259}; 3260 3261/* dnp is the dnode for zb1->zb_object */ 3262boolean_t 3263zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, 3264 const zbookmark_t *zb2) 3265{ 3266 uint64_t zb1nextL0, zb2thisobj; 3267 3268 ASSERT(zb1->zb_objset == zb2->zb_objset); 3269 ASSERT(zb2->zb_level == 0); 3270 3271 /* 3272 * A bookmark in the deadlist is considered to be after 3273 * everything else. 
3274 */ 3275 if (zb2->zb_object == DMU_DEADLIST_OBJECT) 3276 return (B_TRUE); 3277 3278 /* The objset_phys_t isn't before anything. */ 3279 if (dnp == NULL) 3280 return (B_FALSE); 3281 3282 zb1nextL0 = (zb1->zb_blkid + 1) << 3283 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 3284 3285 zb2thisobj = zb2->zb_object ? zb2->zb_object : 3286 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 3287 3288 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3289 uint64_t nextobj = zb1nextL0 * 3290 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 3291 return (nextobj <= zb2thisobj); 3292 } 3293 3294 if (zb1->zb_object < zb2thisobj) 3295 return (B_TRUE); 3296 if (zb1->zb_object > zb2thisobj) 3297 return (B_FALSE); 3298 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 3299 return (B_FALSE); 3300 return (zb1nextL0 <= zb2->zb_blkid); 3301} 3302
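/*
 * Worked example for zbookmark_is_before(), under the illustrative
 * assumption that dn_indblkshift == 14, i.e. each indirect block holds
 * 2^(14 - SPA_BLKPTRSHIFT) == 128 block pointers. For zb1 at
 * zb_level == 1, zb_blkid == 1, the first level-0 blkid past zb1's
 * subtree is zb1nextL0 = (1 + 1) << (1 * 7) == 256, so zb1 is "before"
 * any level-0 bookmark in the same (ordinary) object whose zb_blkid is
 * 256 or greater, and not before those covering blkids 0..255.
 */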