/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
static int zio_use_uma = 0;
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
    "Use uma(9) for ZIO allocations");
static int zio_exclude_metadata = 0;
TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
    "Exclude metadata buffers from dumps as well");

zio_trim_stats_t zio_trim_stats = {
	{ "bytes",		KSTAT_DATA_UINT64,
	    "Number of bytes successfully TRIMmed" },
	{ "success",		KSTAT_DATA_UINT64,
	    "Number of successful TRIM requests" },
	{ "unsupported",	KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed because TRIM is not supported" },
	{ "failed",		KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed for reasons other than not supported" },
};

static kstat_t *zio_trim_ksp;

/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW */
	0,	/* ZIO_PRIORITY_SYNC_READ */
	0,	/* ZIO_PRIORITY_SYNC_WRITE */
	0,	/* ZIO_PRIORITY_LOG_WRITE */
	1,	/* ZIO_PRIORITY_CACHE_FILL */
	1,	/* ZIO_PRIORITY_AGG */
	4,	/* ZIO_PRIORITY_FREE */
	4,	/* ZIO_PRIORITY_ASYNC_WRITE */
	6,	/* ZIO_PRIORITY_ASYNC_READ */
	10,	/* ZIO_PRIORITY_RESILVER */
	20,	/* ZIO_PRIORITY_SCRUB */
	2,	/* ZIO_PRIORITY_DDT_PREFETCH */
	30,	/* ZIO_PRIORITY_TRIM */
};

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
extern int zfs_mg_alloc_failures;

/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");
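
/*
 * Since the sysctls above are CTLFLAG_RDTUN, the passes can only be changed
 * via loader tunables, e.g. in /boot/loader.conf (values shown are the
 * defaults):
 *
 *	vfs.zfs.sync_pass_deferred_free="2"
 *	vfs.zfs.sync_pass_dont_compress="5"
 *	vfs.zfs.sync_pass_rewrite="2"
 */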

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

void
zio_init(void)
{
	size_t c;
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (!zio_use_uma)
		goto out;

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
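	/*
	 * Worked examples (assuming SPA_MINBLOCKSIZE == 512 and
	 * PAGESIZE == 4096, which are typical): a 1.5K buffer is "small"
	 * (<= 4 * SPA_MINBLOCKSIZE), so align == 512; a 12K buffer is a
	 * multiple of PAGESIZE, so align == 4096; a 10K buffer has
	 * p2 == 8192, and 10K is a multiple of p2 >> 2, so align == 2048.
	 */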
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

#ifdef illumos
#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
#endif /* illumos */
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
			align = PAGESIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL,
			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}
out:

	/*
	 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
	 * to fail 3 times per txg or 8 failures, whichever is greater.
	 */
	if (zfs_mg_alloc_failures == 0)
		zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
	else if (zfs_mg_alloc_failures < 8)
		zfs_mg_alloc_failures = 8;

	zio_inject_init();

	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof(zio_trim_stats) / sizeof(kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zio_trim_ksp != NULL) {
		zio_trim_ksp->ks_data = &zio_trim_stats;
		kstat_install(zio_trim_ksp);
	}
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();

	if (zio_trim_ksp != NULL) {
		kstat_delete(zio_trim_ksp);
		zio_trim_ksp = NULL;
	}
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP|flags));
}
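
/*
 * Example (illustrative): allocating and freeing a 12K metadata buffer.
 * The same size must be passed to zio_buf_free(), since it selects the
 * cache bucket:
 *
 *	void *buf = zio_buf_alloc(12 << 10);
 *	...
 *	zio_buf_free(buf, 12 << 10);
 */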

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we limit the amount
 * of ZFS data that shows up in a kernel crashdump, reducing the amount of
 * kernel heap dumped to disk when the kernel panics.
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_data_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}
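
/*
 * The transform stack is a LIFO.  For example, on a compressed read,
 * zio_read_bp_init() pushes a zio_decompress() transform so that the
 * device I/O lands in a psize-sized scratch buffer; zio_pop_transforms()
 * later invokes the transform to fill the caller's original buffer, then
 * frees the scratch buffer, unwinding transforms in reverse order.
 */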

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 * continue calling these functions until they return NULL.
 * Otherwise, the next caller will pick up the list walk in
 * some indeterminate state.  (Otherwise every caller would
 * have to pass in a cookie to keep the state represented by
 * io_walk_link, which gets annoying.)
 */
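/*
 * For example (illustrative), counting a zio's parents must run the walk
 * to completion so that io_walk_link is left NULL:
 *
 *	int nparents = 0;
 *	while (zio_walk_parents(cio) != NULL)
 *		nparents++;
 */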
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}
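
/*
 * If the zio still has outstanding children of the given type and wait
 * class, record the counter in io_stall and back io_stage up by one bit,
 * so that this stage is retried when the count drops to zero and
 * zio_notify_parent() re-dispatches the zio via zio_execute().
 */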
static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);
	if (--*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}
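
/*
 * Typical usage (illustrative): a root zio gathers a set of child I/Os so
 * the caller can wait for all of them at once:
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	zio_nowait(zio_read(rio, spa, bp, buf, size, NULL, NULL,
 *	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, zb));
 *	error = zio_wait(rio);
 */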

zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_prop = *zp;

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    enum zio_flag flags, zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync(), keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}

void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and will not need to do a read (i.e. not GANG or DEDUP)
	 * can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
		    BP_GET_PSIZE(bp), 0)));
	}
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    uint64_t size, enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	dprintf_bp(bp, "freeing in txg %llu, pass %u",
	    (longlong_t)txg, spa->spa_sync_pass);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	if (zfs_trim_enabled)
		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
		    ZIO_STAGE_VDEV_IO_ASSESS;
	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	zio = zio_create(pio, spa, txg, bp, NULL, size,
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
		    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    offset, size, done, private, priority, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_eck) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, int priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

zio_t *
zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
{

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	return (zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size,
	    NULL, NULL, ZIO_PRIORITY_TRIM,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

void
zio_shrink(zio_t *zio, uint64_t size)
{
	ASSERT(zio->io_executor == NULL);
	ASSERT(zio->io_orig_size == zio->io_size);
	ASSERT(size <= zio->io_size);

	/*
	 * We don't shrink for raidz because of problems with the
	 * reconstruction when reading back less than the block size.
	 * Note, BP_IS_RAIDZ() assumes no compression.
	 */
	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	if (!BP_IS_RAIDZ(zio->io_bp))
		zio->io_orig_size = zio->io_size = size;
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		metaslab_class_t *mc = spa_normal_class(spa);
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize,
		    (size_t)metaslab_class_get_minblocksize(mc));
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else {
			ASSERT(psize < lsize);
			zio_push_transform(zio, cbuf, psize, lsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
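	/*
	 * Worked example (with the default tunables defined above): a block
	 * first written in sync pass 1 is compressed and gets a freshly
	 * allocated bp.  If it is dirtied again in passes 2-4, it is still
	 * compressed, and is rewritten in place whenever the physical size
	 * is unchanged (zfs_sync_pass_rewrite == 2).  From pass 5 onward
	 * (zfs_sync_pass_dont_compress), it is written uncompressed, so the
	 * physical size stops changing and spa_sync() can converge.
	 */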
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);

	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1] != NULL)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
#ifdef _KERNEL
	(void) taskq_dispatch_safe(spa->spa_zio_taskq[t][q],
	    (task_func_t *)zio_execute, zio, flags, &zio->io_task);
#else
	(void) taskq_dispatch(spa->spa_zio_taskq[t][q],
	    (task_func_t *)zio_execute, zio, flags);
#endif
}

static boolean_t
zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++)
		if (taskq_member(spa->spa_zio_taskq[t][q], executor))
			return (B_TRUE);

	return (B_FALSE);
}

static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 *
 *	(1) the I/O completes
 *	(2) the pipeline stalls waiting for dependent child I/Os
 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
 *	(4) the I/O is delegated by vdev-level caching or aggregation
 *	(5) the I/O is deferred due to vdev-level queueing
 *	(6) the I/O is handed off to another thread.
 *
 * In all cases, the pipeline stops whenever there's no CPU work; it never
 * burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];
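
/*
 * The loop in zio_execute() advances io_stage to the next stage present
 * in the pipeline mask by doubling it until a set bit is found.  For
 * example (bit values illustrative): with io_stage == 0x004 and a
 * pipeline mask of 0x001|0x010|0x100, the do/while steps through 0x008
 * to 0x010, the next stage in the pipeline, and
 * zio_pipeline[highbit(stage) - 1] maps that bit to its handler.
 */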

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[highbit(stage) - 1](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_zio_root "Godfather" I/O, which
		 * will ensure it completes prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root, zio);
	}

	zio_execute(zio);
}

/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_flags |= ZIO_FLAG_REEXECUTED;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */
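
/*
 * Schematic example of a nested gang tree (layout illustrative only):
 *
 *	gang leader bp (BP_IS_GANG)
 *	  gang header
 *	    bp[0] -> data
 *	    bp[1] -> data
 *	    bp[2] -> gang header
 *	               bp[0] -> data
 *	               bp[1] -> data
 *	               bp[2] -> hole
 */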

static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	if (gn != NULL)
		return (pio);

	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
	    &pio->io_bookmark));
}

zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here.  The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
		 * (Presently, nothing actually uses interior data checksums;
		 * this is just good hygiene.)
		 */
		if (gn != pio->io_gang_leader->io_gang_tree) {
			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
			    data, BP_GET_PSIZE(bp));
		}
		/*
		 * If we are here to damage data for testing purposes,
		 * leave the GBH alone so that we can detect the damage.
		 */
		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
	} else {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
	}

	return (zio);
}

/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
	    ZIO_GANG_CHILD_FLAGS(pio)));
}

/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}

static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}

static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	if (gn == gio->io_gang_tree && gio->io_data != NULL)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}

static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	zio_t *gio = zio->io_gang_leader;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}

static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;
	uint64_t lsize;
	int copies = gio->io_prop.zp_copies;
	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int error;

	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
	if (error) {
		pio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (pio == gio) {
		gnpp = &gio->io_gang_tree;
	} else {
		gnpp = pio->io_private;
		ASSERT(pio->io_ready == zio_write_gang_member_ready);
	}

	gn = zio_gang_node_alloc(gnpp);
	gbh = gn->gn_gbh;
	bzero(gbh, SPA_GANGBLOCKSIZE);

	/*
	 * Create the gang header.
	 */
	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

	/*
	 * Create and nowait the gang children.
	 */
	for (int g = 0; resid != 0; resid -= lsize, g++) {
		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
		    SPA_MINBLOCKSIZE);
		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

		zp.zp_checksum = gio->io_prop.zp_checksum;
		zp.zp_compress = ZIO_COMPRESS_OFF;
		zp.zp_type = DMU_OT_NONE;
		zp.zp_level = 0;
		zp.zp_copies = gio->io_prop.zp_copies;
		zp.zp_dedup = B_FALSE;
		zp.zp_dedup_verify = B_FALSE;
		zp.zp_nopwrite = B_FALSE;

		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
		    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
		    &pio->io_bookmark));
	}

	/*
	 * Set pio's pipeline to just wait for zio to finish.
	 */
	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	zio_nowait(zio);

	return (ZIO_PIPELINE_CONTINUE);
}
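
/*
 * For illustration, assuming SPA_GBH_NBLKPTRS == 3 and SPA_MINBLOCKSIZE ==
 * 512: a failed 128K (131072-byte) allocation splits above into member
 * writes of 44032 bytes (131072/3 rounded up to a 512 multiple), then
 * 87040/2 = 43520, then 43520/1 = 43520, which sum to exactly 131072.
 */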

/*
 * The zio_nop_write stage in the pipeline determines if allocating
 * a new bp is necessary.  By leveraging a cryptographically secure checksum,
 * such as SHA256, we can compare the checksums of the new data and the old
 * to determine if allocating a new block is required.  The nopwrite
 * feature can handle writes in either syncing or open context (i.e. zil
 * writes) and as a result is mutually exclusive with dedup.
 */
static int
zio_nop_write(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	zio_prop_t *zp = &zio->io_prop;

	ASSERT(BP_GET_LEVEL(bp) == 0);
	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
	ASSERT(zp->zp_nopwrite);
	ASSERT(!zp->zp_dedup);
	ASSERT(zio->io_bp_override == NULL);
	ASSERT(IO_IS_ALLOCATING(zio));

	/*
	 * Check to see if the original bp and the new bp have matching
	 * characteristics (i.e. same checksum, compression algorithms, etc).
	 * If they don't then just continue with the pipeline which will
	 * allocate a new bp.
	 */
	if (BP_IS_HOLE(bp_orig) ||
	    !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
		return (ZIO_PIPELINE_CONTINUE);

	/*
	 * If the checksums match then reset the pipeline so that we
	 * avoid allocating a new bp and issuing any I/O.
	 */
	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
		ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
		    sizeof (uint64_t)) == 0);

		*bp = *bp_orig;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
		zio->io_flags |= ZIO_FLAG_NOPWRITE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Dedup
 * ==========================================================================
 */
static void
zio_ddt_child_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp;
	zio_t *pio = zio_unique_parent(zio);

	mutex_enter(&pio->io_lock);
	ddp = ddt_phys_select(dde, bp);
	if (zio->io_error == 0)
		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
		dde->dde_repair_data = zio->io_data;
	else
		zio_buf_free(zio->io_data, zio->io_size);
	mutex_exit(&pio->io_lock);
}

static int
zio_ddt_read_start(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
		ddt_phys_t *ddp = dde->dde_phys;
		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
		blkptr_t blk;

		ASSERT(zio->io_vsd == NULL);
		zio->io_vsd = dde;

		if (ddp_self == NULL)
			return (ZIO_PIPELINE_CONTINUE);

		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
				continue;
			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
			    &blk);
			zio_nowait(zio_read(zio, zio->io_spa, &blk,
			    zio_buf_alloc(zio->io_size), zio->io_size,
			    zio_ddt_child_read_done, dde, zio->io_priority,
			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
			    &zio->io_bookmark));
		}
		return (ZIO_PIPELINE_CONTINUE);
	}

	zio_nowait(zio_read(zio, zio->io_spa, bp,
	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_ddt_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = zio->io_vsd;
		if (ddt == NULL) {
			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
			return (ZIO_PIPELINE_CONTINUE);
		}
		if (dde == NULL) {
			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
			return (ZIO_PIPELINE_STOP);
		}
		if (dde->dde_repair_data != NULL) {
			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
			zio->io_child_error[ZIO_CHILD_DDT] = 0;
		}
		ddt_repair_done(ddt, dde);
		zio->io_vsd = NULL;
	}

	ASSERT(zio->io_vsd == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}
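
/*
 * Decide whether a dedup match is trustworthy when dedup_verify is set:
 * compare this write's original (untransformed) data against either an
 * in-flight lead zio for the same DDT entry or the existing on-disk copy
 * read back through the ARC.  Returns B_TRUE if the data differs or
 * cannot be verified, i.e. we must treat it as a checksum collision.
 */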
static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
	spa_t *spa = zio->io_spa;

	/*
	 * Note: we compare the original data, not the transformed data,
	 * because when zio->io_bp is an override bp, we will not have
	 * pushed the I/O transforms.  That's an important optimization
	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
	 */
	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		zio_t *lio = dde->dde_lead_zio[p];

		if (lio != NULL) {
			return (lio->io_orig_size != zio->io_orig_size ||
			    bcmp(zio->io_orig_data, lio->io_orig_data,
			    zio->io_orig_size) != 0);
		}
	}

	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		ddt_phys_t *ddp = &dde->dde_phys[p];

		if (ddp->ddp_phys_birth != 0) {
			arc_buf_t *abuf = NULL;
			uint32_t aflags = ARC_WAIT;
			blkptr_t blk = *zio->io_bp;
			int error;

			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);

			ddt_exit(ddt);

			error = arc_read(NULL, spa, &blk,
			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zio->io_bookmark);

			if (error == 0) {
				if (arc_buf_size(abuf) != zio->io_orig_size ||
				    bcmp(abuf->b_data, zio->io_orig_data,
				    zio->io_orig_size) != 0)
					error = SET_ERROR(EEXIST);
				VERIFY(arc_buf_remove_ref(abuf, &abuf));
			}

			ddt_enter(ddt);
			return (error != 0);
		}
	}

	return (B_FALSE);
}

static void
zio_ddt_child_write_ready(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	zio_t *pio;

	if (zio->io_error)
		return;

	ddt_enter(ddt);

	ASSERT(dde->dde_lead_zio[p] == zio);

	ddt_phys_fill(ddp, zio->io_bp);

	while ((pio = zio_walk_parents(zio)) != NULL)
		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);

	ddt_exit(ddt);
}

static void
zio_ddt_child_write_done(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];

	ddt_enter(ddt);

	ASSERT(ddp->ddp_refcnt == 0);
	ASSERT(dde->dde_lead_zio[p] == zio);
	dde->dde_lead_zio[p] = NULL;

	if (zio->io_error == 0) {
		while (zio_walk_parents(zio) != NULL)
			ddt_phys_addref(ddp);
	} else {
		ddt_phys_clear(ddp);
	}

	ddt_exit(ddt);
}

static void
zio_ddt_ditto_write_done(zio_t *zio)
{
	int p = DDT_PHYS_DITTO;
	zio_prop_t *zp = &zio->io_prop;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(zio->io_spa, bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	ddt_key_t *ddk = &dde->dde_key;

	ddt_enter(ddt);

	ASSERT(ddp->ddp_refcnt == 0);
	ASSERT(dde->dde_lead_zio[p] == zio);
	dde->dde_lead_zio[p] = NULL;

	if (zio->io_error == 0) {
		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
		if (ddp->ddp_phys_birth != 0)
			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
		ddt_phys_fill(ddp, bp);
	}

	ddt_exit(ddt);
}
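
/*
 * Write a dedup'd block.  Under the DDT lock there are three paths: a
 * copy of the data already exists on disk (fill in the bp and bump the
 * refcount), an identical write is already in flight (become a child of
 * that lead zio), or we are first and become the lead zio ourselves.
 * An override bp that was already written is simply recorded in the entry.
 */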
static int
zio_ddt_write(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	uint64_t txg = zio->io_txg;
	zio_prop_t *zp = &zio->io_prop;
	int p = zp->zp_copies;
	int ditto_copies;
	zio_t *cio = NULL;
	zio_t *dio = NULL;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);

	ddt_enter(ddt);
	dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = &dde->dde_phys[p];

	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
		/*
		 * If we're using a weak checksum, upgrade to a strong checksum
		 * and try again.  If we're already using a strong checksum,
		 * we can't resolve it, so just convert to an ordinary write.
		 * (And automatically e-mail a paper to Nature?)
		 */
		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
			zp->zp_checksum = spa_dedup_checksum(spa);
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			BP_ZERO(bp);
		} else {
			zp->zp_dedup = B_FALSE;
		}
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
		ddt_exit(ddt);
		return (ZIO_PIPELINE_CONTINUE);
	}

	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
	ASSERT(ditto_copies < SPA_DVAS_PER_BP);

	if (ditto_copies > ddt_ditto_copies_present(dde) &&
	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
		zio_prop_t czp = *zp;

		czp.zp_copies = ditto_copies;

		/*
		 * If we arrived here with an override bp, we won't have run
		 * the transform stack, so we won't have the data we need to
		 * generate a child i/o.  So, toss the override bp and restart.
		 * This is safe, because using the override bp is just an
		 * optimization; and it's rare, so the cost doesn't matter.
		 */
		if (zio->io_bp_override) {
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			zio->io_pipeline = ZIO_WRITE_PIPELINE;
			zio->io_bp_override = NULL;
			BP_ZERO(bp);
			ddt_exit(ddt);
			return (ZIO_PIPELINE_CONTINUE);
		}

		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, &czp, NULL,
		    zio_ddt_ditto_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
	}

	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
		if (ddp->ddp_phys_birth != 0)
			ddt_bp_fill(ddp, bp, txg);
		if (dde->dde_lead_zio[p] != NULL)
			zio_add_child(zio, dde->dde_lead_zio[p]);
		else
			ddt_phys_addref(ddp);
	} else if (zio->io_bp_override) {
		ASSERT(bp->blk_birth == txg);
		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
		ddt_phys_fill(ddp, bp);
		ddt_phys_addref(ddp);
	} else {
		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, zp, zio_ddt_child_write_ready,
		    zio_ddt_child_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[p] = cio;
	}

	ddt_exit(ddt);

	if (cio)
		zio_nowait(cio);
	if (dio)
		zio_nowait(dio);

	return (ZIO_PIPELINE_CONTINUE);
}

ddt_entry_t *freedde;	/* for debugging */

static int
zio_ddt_free(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	ddt_enter(ddt);
	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = ddt_phys_select(dde, bp);
	ddt_phys_decref(ddp);
	ddt_exit(ddt);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */
static int
zio_dva_allocate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	metaslab_class_t *mc = spa_normal_class(spa);
	blkptr_t *bp = zio->io_bp;
	int error;
	int flags = 0;

	if (zio->io_gang_leader == NULL) {
		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
		zio->io_gang_leader = zio;
	}

	ASSERT(BP_IS_HOLE(bp));
	ASSERT0(BP_GET_NDVAS(bp));
	ASSERT3U(zio->io_prop.zp_copies, >, 0);
	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	/*
	 * The dump device does not support gang blocks so allocation on
	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
	 * the "fast" gang feature.
	 */
	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
	    METASLAB_GANG_CHILD : 0;
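	/*
	 * Attempt the allocation.  On ENOSPC, anything bigger than
	 * SPA_MINBLOCKSIZE can still succeed as a gang block built from
	 * several smaller allocations (see zio_write_gang_block() above).
	 */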
	error = metaslab_alloc(spa, mc, zio->io_size, bp,
	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);

	if (error) {
		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
		    error);
		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
			return (zio_write_gang_block(zio));
		zio->io_error = error;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_dva_free(zio_t *zio)
{
	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_dva_claim(zio_t *zio)
{
	int error;

	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
	if (error)
		zio->io_error = error;

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Undo an allocation.  This is used by zio_done() when an I/O fails
 * and we want to give back the block we just allocated.
 * This handles both normal blocks and gang blocks.
 */
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
	ASSERT(zio->io_bp_override == NULL);

	if (!BP_IS_HOLE(bp))
		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);

	if (gn != NULL) {
		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			zio_dva_unallocate(zio, gn->gn_child[g],
			    &gn->gn_gbh->zg_blkptr[g]);
		}
	}
}

/*
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 */
int
zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
    uint64_t size, boolean_t use_slog)
{
	int error = 1;

	ASSERT(txg > spa_syncing_txg(spa));

	/*
	 * ZIL blocks are always contiguous (i.e. not gang blocks) so we
	 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
	 * when allocating them.
	 */
	if (use_slog) {
		error = metaslab_alloc(spa, spa_log_class(spa), size,
		    new_bp, 1, txg, old_bp,
		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
	}

	if (error) {
		error = metaslab_alloc(spa, spa_normal_class(spa), size,
		    new_bp, 1, txg, old_bp,
		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
	}

	if (error == 0) {
		BP_SET_LSIZE(new_bp, size);
		BP_SET_PSIZE(new_bp, size);
		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
		BP_SET_CHECKSUM(new_bp,
		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
		    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(new_bp, 0);
		BP_SET_DEDUP(new_bp, 0);
		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
	}

	return (error);
}
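
/*
 * For illustration, a ZIL writer might use zio_alloc_zil() roughly as
 * follows (hypothetical sketch, not the actual zil.c code):
 *
 *	blkptr_t new_bp;
 *	BP_ZERO(&new_bp);
 *	error = zio_alloc_zil(spa, txg, &new_bp, &prev_bp, size, use_slog);
 *	if (error == 0)
 *		... chain new_bp into the log and issue the write ...
 *
 * On failure the caller typically falls back to waiting for the txg to
 * sync instead of logging the record.
 */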

/*
 * Free an intent log block.
 */
void
zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
{
	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
	ASSERT(!BP_IS_GANG(bp));

	zio_free(spa, txg, bp);
}

/*
 * ==========================================================================
 * Read, write and delete to physical devices
 * ==========================================================================
 */
static int
zio_vdev_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	uint64_t align;
	spa_t *spa = zio->io_spa;

	ASSERT(zio->io_error == 0);
	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);

	if (vd == NULL) {
		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);

		/*
		 * The mirror_ops handle multiple DVAs in a single BP.
		 */
		return (vdev_mirror_ops.vdev_op_io_start(zio));
	}

	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) {
		trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * We keep track of time-sensitive I/Os so that the scan thread
	 * can quickly react to certain workloads.  In particular, we care
	 * about non-scrubbing, top-level reads and writes with the following
	 * characteristics:
	 *	- synchronous writes of user data to non-slog devices
	 *	- any reads of user data
	 * When these conditions are met, adjust the timestamp of spa_last_io
	 * which allows the scan thread to adjust its workload accordingly.
	 */
	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
	    vd == vd->vdev_top && !vd->vdev_islog &&
	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
	    zio->io_txg != spa_syncing_txg(spa)) {
		uint64_t old = spa->spa_last_io;
		uint64_t new = ddi_get_lbolt64();
		if (old != new)
			(void) atomic_cas_64(&spa->spa_last_io, old, new);
	}

	align = 1ULL << vd->vdev_top->vdev_ashift;

	if (P2PHASE(zio->io_size, align) != 0) {
		uint64_t asize = P2ROUNDUP(zio->io_size, align);
		char *abuf = NULL;
		if (zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_WRITE)
			abuf = zio_buf_alloc(asize);
		ASSERT(vd == vd->vdev_top);
		if (zio->io_type == ZIO_TYPE_WRITE) {
			bcopy(zio->io_data, abuf, zio->io_size);
			bzero(abuf + zio->io_size, asize - zio->io_size);
		}
		zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
		    zio_subblock);
	}

	ASSERT(P2PHASE(zio->io_offset, align) == 0);
	ASSERT(P2PHASE(zio->io_size, align) == 0);
	VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));

	/*
	 * If this is a repair I/O, and there's no self-healing involved --
	 * that is, we're just resilvering what we expect to resilver --
	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
	 * This prevents spurious resilvering with nested replication.
	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
	 * A is out of date, we'll read from C+D, then use the data to
	 * resilver A+B -- but we don't actually want to resilver B, just A.
	 * The top-level mirror has no way to know this, so instead we just
	 * discard unnecessary repairs as we work our way down the vdev tree.
	 * The same logic applies to any form of nested replication:
	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
	 */
	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
	    zio->io_txg != 0 &&	/* not a delegated i/o */
	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
		zio_vdev_io_bypass(zio);
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (vd->vdev_ops->vdev_op_leaf &&
	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {

		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
			return (ZIO_PIPELINE_CONTINUE);

		if ((zio = vdev_queue_io(zio)) == NULL)
			return (ZIO_PIPELINE_STOP);

		if (!vdev_accessible(vd, zio)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return (ZIO_PIPELINE_STOP);
		}
	}

	/*
	 * Note that we ignore repair writes for TRIM because they can conflict
	 * with normal writes.  This isn't an issue because, by definition, we
	 * only repair blocks that aren't freed.
	 */
	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE &&
	    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
		if (!trim_map_write_start(zio))
			return (ZIO_PIPELINE_STOP);
	}

	return (vd->vdev_ops->vdev_op_io_start(zio));
}

static int
zio_vdev_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
	boolean_t unexpected_error = B_FALSE;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {

		if (zio->io_type == ZIO_TYPE_WRITE &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
			trim_map_write_done(zio);

		vdev_queue_io_done(zio);

		if (zio->io_type == ZIO_TYPE_WRITE)
			vdev_cache_write(zio);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_device_injection(vd,
			    zio, EIO);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_label_injection(zio, EIO);

		if (zio->io_error) {
			if (!vdev_accessible(vd, zio)) {
				zio->io_error = SET_ERROR(ENXIO);
			} else {
				unexpected_error = B_TRUE;
			}
		}
	}

	ops->vdev_op_io_done(zio);

	if (unexpected_error)
		VERIFY(vdev_probe(vd, zio) == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * For non-raidz ZIOs, we can just copy aside the bad data read from the
 * disk, and use that to finish the checksum ereport later.
 */
static void
zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
    const void *good_buf)
{
	/* no processing needed */
	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
}

/*ARGSUSED*/
void
zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
{
	void *buf = zio_buf_alloc(zio->io_size);

	bcopy(zio->io_data, buf, zio->io_size);

	zcr->zcr_cbinfo = zio->io_size;
	zcr->zcr_cbdata = buf;
	zcr->zcr_finish = zio_vsd_default_cksum_finish;
	zcr->zcr_free = zio_buf_free;
}

static int
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
		spa_config_exit(zio->io_spa, SCL_ZIO, zio);

	if (zio->io_vsd != NULL) {
		zio->io_vsd_ops->vsd_free(zio);
		zio->io_vsd = NULL;
	}

	if (zio_injection_enabled && zio->io_error == 0)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM)
		switch (zio->io_error) {
		case 0:
			ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
			ZIO_TRIM_STAT_BUMP(success);
			break;
		case EOPNOTSUPP:
			ZIO_TRIM_STAT_BUMP(unsupported);
			break;
		default:
			ZIO_TRIM_STAT_BUMP(failed);
			break;
		}

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 *
	 * On retry, we cut in line in the issue queue, since we don't want
	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
	 */
	if (zio->io_error && vd == NULL &&
	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
		zio->io_error = 0;
		zio->io_flags |= ZIO_FLAG_IO_RETRY |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
		    zio_requeue_io_start_cut_in_line);
		return (ZIO_PIPELINE_STOP);
	}

	/*
	 * If we got an error on a leaf device, convert it to ENXIO
	 * if the device is not accessible at all.
	 */
	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    !vdev_accessible(vd, zio))
		zio->io_error = SET_ERROR(ENXIO);

	/*
	 * If we can't write to an interior vdev (mirror or RAID-Z),
	 * set vdev_cant_write so that we stop trying to allocate from it.
	 */
	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
		vd->vdev_cant_write = B_TRUE;
	}

	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage >>= 1;
}

void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage >>= 1;
}

void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
}

/*
 * ==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */
static int
zio_checksum_generate(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum;

	if (bp == NULL) {
		/*
		 * This is zio_write_phys().
		 * We're either generating a label checksum, or none at all.
		 */
		checksum = zio->io_prop.zp_checksum;

		if (checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
	} else {
		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
			ASSERT(!IO_IS_ALLOCATING(zio));
			checksum = ZIO_CHECKSUM_GANG_HEADER;
		} else {
			checksum = BP_GET_CHECKSUM(bp);
		}
	}

	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t info;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(zio->io_vd != NULL);

	if (bp == NULL) {
		/*
		 * This is zio_read_phys().
		 * We're either verifying a label checksum, or nothing at all.
		 */
		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
	}

	if ((error = zio_checksum_error(zio, &info)) != 0) {
		zio->io_error = error;
		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			zfs_ereport_start_checksum(zio->io_spa,
			    zio->io_vd, zio, zio->io_offset,
			    zio->io_size, NULL, &info);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}

/*
 * ==========================================================================
 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
 * indicate errors that are specific to one I/O, and most likely permanent.
 * Any other error is presumed to be worse because we weren't expecting it.
 * ==========================================================================
 */
int
zio_worst_error(int e1, int e2)
{
	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
	int r1, r2;

	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
		if (e1 == zio_error_rank[r1])
			break;

	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
		if (e2 == zio_error_rank[r2])
			break;

	return (r1 > r2 ? e1 : e2);
}
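
/*
 * For example, zio_worst_error(ECKSUM, ENXIO) returns ECKSUM, and an
 * errno missing from the table entirely, e.g. EINVAL, outranks every
 * listed error: zio_worst_error(EIO, EINVAL) returns EINVAL.
 */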

/*
 * ==========================================================================
 * I/O completion
 * ==========================================================================
 */
static int
zio_ready(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	zio_t *pio, *pio_next;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (zio->io_ready) {
		ASSERT(IO_IS_ALLOCATING(zio));
		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);

		zio->io_ready(zio);
	}

	if (bp != NULL && bp != &zio->io_bp_copy)
		zio->io_bp_copy = *bp;

	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_READY] = 1;
	pio = zio_walk_parents(zio);
	mutex_exit(&zio->io_lock);

	/*
	 * As we notify zio's parents, new parents could be added.
	 * New parents go to the head of zio's io_parent_list, however,
	 * so we will (correctly) not notify them.  The remainder of zio's
	 * io_parent_list, from 'pio_next' onward, cannot change because
	 * all parents must wait for us to be done before they can be done.
	 */
	for (; pio != NULL; pio = pio_next) {
		pio_next = zio_walk_parents(zio);
		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
	}

	if (zio->io_flags & ZIO_FLAG_NODATA) {
		if (BP_IS_GANG(bp)) {
			zio->io_flags &= ~ZIO_FLAG_NODATA;
		} else {
			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		}
	}

	if (zio_injection_enabled &&
	    zio->io_spa->spa_syncing_txg == zio->io_txg)
		zio_handle_ignored_writes(zio);

	return (ZIO_PIPELINE_CONTINUE);
}
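
/*
 * zio_done() is the final pipeline stage for every zio.  Once all
 * children have completed it verifies the bp, inherits child errors,
 * finishes any checksum reports, updates vdev stats, and then either
 * reexecutes or suspends a failed logical i/o, or notifies its parents
 * and destroys itself.
 */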
static int
zio_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_t *lio = zio->io_logical;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;
	uint64_t psize = zio->io_size;
	zio_t *pio, *pio_next;

	/*
	 * If our children haven't all completed,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			ASSERT(zio->io_children[c][w] == 0);

	if (bp != NULL) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
		    (bp == zio_unique_parent(zio)->io_bp));
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    zio->io_bp_override == NULL &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
			ASSERT(BP_COUNT_GANG(bp) == 0 ||
			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
		}
		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
			VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
	}

	/*
	 * If there were child vdev/gang/ddt errors, they apply to us now.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);

	/*
	 * If the I/O on the transformed data was successful, generate any
	 * checksum reports now while we still have the transformed data.
	 */
	if (zio->io_error == 0) {
		while (zio->io_cksum_report != NULL) {
			zio_cksum_report_t *zcr = zio->io_cksum_report;
			uint64_t align = zcr->zcr_align;
			uint64_t asize = P2ROUNDUP(psize, align);
			char *abuf = zio->io_data;

			if (asize != psize) {
				abuf = zio_buf_alloc(asize);
				bcopy(zio->io_data, abuf, psize);
				bzero(abuf + psize, asize - psize);
			}

			zio->io_cksum_report = zcr->zcr_next;
			zcr->zcr_next = NULL;
			zcr->zcr_finish(zcr, abuf);
			zfs_ereport_free_checksum(zcr);

			if (asize != psize)
				zio_buf_free(abuf, asize);
		}
	}

	zio_pop_transforms(zio);	/* note: may set zio->io_error */

	vdev_stat_update(zio, psize);

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

		if ((zio->io_error == EIO || !(zio->io_flags &
		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
		    zio == lio) {
			/*
			 * For logical I/O requests, tell the SPA to log the
			 * error and generate a logical data ereport.
			 */
			spa_log_error(spa, zio);
			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
			    0, 0);
		}
	}

	if (zio->io_error && zio == lio) {
		/*
		 * Determine whether zio should be reexecuted.  This will
		 * propagate all the way to the root via zio_notify_parent().
		 */
		ASSERT(vd == NULL && bp != NULL);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		if (IO_IS_ALLOCATING(zio) &&
		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
			if (zio->io_error != ENOSPC)
				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
			else
				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
		}

		if ((zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_FREE) &&
		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
		    zio->io_error == ENXIO &&
		    spa_load_state(spa) == SPA_LOAD_NONE &&
		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		/*
		 * Here is a possibly good place to attempt to do
		 * either combinatorial reconstruction or error correction
		 * based on checksums.  It also might be a good place
		 * to send out preliminary ereports before we suspend
		 * processing.
		 */
	}

	/*
	 * If there were logical child errors, they apply to us now.
	 * We defer this until now to avoid conflating logical child
	 * errors with errors that happened to the zio itself when
	 * updating vdev stats and reporting FMA events above.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

	if ((zio->io_error || zio->io_reexecute) &&
	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
		zio_dva_unallocate(zio, zio->io_gang_tree, bp);

	zio_gang_tree_free(&zio->io_gang_tree);

	/*
	 * Godfather I/Os should never suspend.
	 */
	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
		zio->io_reexecute = 0;

	if (zio->io_reexecute) {
		/*
		 * This is a logical I/O that wants to reexecute.
		 *
		 * Reexecute is top-down.  When an i/o fails, if it's not
		 * the root, it simply notifies its parent and sticks around.
		 * The parent, seeing that it still has children in zio_done(),
		 * does the same.  This percolates all the way up to the root.
		 * The root i/o will reexecute or suspend the entire tree.
		 *
		 * This approach ensures that zio_reexecute() honors
		 * all the original i/o dependency relationships, e.g.
		 * parents not executing until children are ready.
		 */
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		zio->io_gang_leader = NULL;

		mutex_enter(&zio->io_lock);
		zio->io_state[ZIO_WAIT_DONE] = 1;
		mutex_exit(&zio->io_lock);

		/*
		 * "The Godfather" I/O monitors its children but is
		 * not a true parent to them.  It will track them through
		 * the pipeline but severs its ties whenever they get into
		 * trouble (e.g. suspended).  This allows "The Godfather"
		 * I/O to return status without blocking.
		 */
		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
			zio_link_t *zl = zio->io_walk_link;
			pio_next = zio_walk_parents(zio);

			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
				zio_remove_child(pio, zio, zl);
				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
			}
		}

		if ((pio = zio_unique_parent(zio)) != NULL) {
			/*
			 * We're not a root i/o, so there's nothing to do
			 * but notify our parent.  Don't propagate errors
			 * upward since we haven't permanently failed yet.
			 */
			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
			/*
			 * We'd fail again if we reexecuted now, so suspend
			 * until conditions improve (e.g. device comes online).
			 */
			zio_suspend(spa, zio);
		} else {
			/*
			 * Reexecution is potentially a huge amount of work.
			 * Hand it off to the otherwise-unused claim taskq.
			 */
#ifdef _KERNEL
			(void) taskq_dispatch_safe(
			    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
			    (task_func_t *)zio_reexecute, zio, TQ_SLEEP,
			    &zio->io_task);
#else
			(void) taskq_dispatch(
			    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
			    (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
#endif
		}
		return (ZIO_PIPELINE_STOP);
	}

	ASSERT(zio->io_child_count == 0);
	ASSERT(zio->io_reexecute == 0);
	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));

	/*
	 * Report any checksum errors, since the I/O is complete.
	 */
	while (zio->io_cksum_report != NULL) {
		zio_cksum_report_t *zcr = zio->io_cksum_report;
		zio->io_cksum_report = zcr->zcr_next;
		zcr->zcr_next = NULL;
		zcr->zcr_finish(zcr, NULL);
		zfs_ereport_free_checksum(zcr);
	}

	/*
	 * It is the responsibility of the done callback to ensure that this
	 * particular zio is no longer discoverable for adoption, and as
	 * such, cannot acquire any new parents.
	 */
	if (zio->io_done)
		zio->io_done(zio);

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_DONE] = 1;
	mutex_exit(&zio->io_lock);

	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
		zio_link_t *zl = zio->io_walk_link;
		pio_next = zio_walk_parents(zio);
		zio_remove_child(pio, zio, zl);
		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
	}

	if (zio->io_waiter != NULL) {
		mutex_enter(&zio->io_lock);
		zio->io_executor = NULL;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		zio_destroy(zio);
	}

	return (ZIO_PIPELINE_STOP);
}

/*
 * ==========================================================================
 * I/O pipeline definition
 * ==========================================================================
 */
static zio_pipe_stage_t *zio_pipeline[] = {
	NULL,
	zio_read_bp_init,
	zio_free_bp_init,
	zio_issue_async,
	zio_write_bp_init,
	zio_checksum_generate,
	zio_nop_write,
	zio_ddt_read_start,
	zio_ddt_read_done,
	zio_ddt_write,
	zio_ddt_free,
	zio_gang_assemble,
	zio_gang_issue,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_checksum_verify,
	zio_done
};
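
/*
 * Decide whether bookmark zb1 (at any level) sorts before the level-0
 * bookmark zb2 in traversal order, handling the meta-dnode and deadlist
 * special cases noted below.
 */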
/* dnp is the dnode for zb1->zb_object */
boolean_t
zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
    const zbookmark_t *zb2)
{
	uint64_t zb1nextL0, zb2thisobj;

	ASSERT(zb1->zb_objset == zb2->zb_objset);
	ASSERT(zb2->zb_level == 0);

	/*
	 * A bookmark in the deadlist is considered to be after
	 * everything else.
	 */
	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
		return (B_TRUE);

	/* The objset_phys_t isn't before anything. */
	if (dnp == NULL)
		return (B_FALSE);

	zb1nextL0 = (zb1->zb_blkid + 1) <<
	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));

	zb2thisobj = zb2->zb_object ? zb2->zb_object :
	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);

	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t nextobj = zb1nextL0 *
		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
		return (nextobj <= zb2thisobj);
	}

	if (zb1->zb_object < zb2thisobj)
		return (B_TRUE);
	if (zb1->zb_object > zb2thisobj)
		return (B_FALSE);
	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
		return (B_FALSE);
	return (zb1nextL0 <= zb2->zb_blkid);
}