zio.c revision 252840
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
static int zio_use_uma = 0;
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
    "Use uma(9) for ZIO allocations");
static int zio_exclude_metadata = 0;
TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN,
    &zio_exclude_metadata, 0, "Exclude metadata buffers from dumps as well");

zio_trim_stats_t zio_trim_stats = {
	{ "bytes",		KSTAT_DATA_UINT64,
	    "Number of bytes successfully TRIMmed" },
	{ "success",		KSTAT_DATA_UINT64,
	    "Number of successful TRIM requests" },
	{ "unsupported",	KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed because TRIM is not supported" },
	{ "failed",		KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed for reasons other than not supported" },
};

static kstat_t *zio_trim_ksp;

/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW */
	0,	/* ZIO_PRIORITY_SYNC_READ */
	0,	/* ZIO_PRIORITY_SYNC_WRITE */
	0,	/* ZIO_PRIORITY_LOG_WRITE */
	1,	/* ZIO_PRIORITY_CACHE_FILL */
	1,	/* ZIO_PRIORITY_AGG */
	4,	/* ZIO_PRIORITY_FREE */
	4,	/* ZIO_PRIORITY_ASYNC_WRITE */
	6,	/* ZIO_PRIORITY_ASYNC_READ */
	10,	/* ZIO_PRIORITY_RESILVER */
	20,	/* ZIO_PRIORITY_SCRUB */
	2,	/* ZIO_PRIORITY_DDT_PREFETCH */
	30,	/* ZIO_PRIORITY_TRIM */
};

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
extern int zfs_mg_alloc_failures;

/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

void
zio_init(void)
{
	size_t c;
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (!zio_use_uma)
		goto out;

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
	 * for each quarter-power of 2. For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

#ifdef illumos
#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own
		 * page, to eliminate the performance overhead of trapping
		 * to the kernel when modifying a non-watched buffer that
		 * shares the page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
#endif	/* illumos */
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
			align = PAGESIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL,
			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}
out:

	/*
	 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
	 * to fail 3 times per txg or 8 failures, whichever is greater.
	 */
	if (zfs_mg_alloc_failures == 0)
		zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
	else if (zfs_mg_alloc_failures < 8)
		zfs_mg_alloc_failures = 8;

	zio_inject_init();

	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (zio_trim_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zio_trim_ksp != NULL) {
		zio_trim_ksp->ks_data = &zio_trim_stats;
		kstat_install(zio_trim_ksp);
	}
}
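
/*
 * Worked example of the sizing rules above (illustrative note, not from the
 * original source; assumes 4K pages): a 2K buffer is <= 4 * SPA_MINBLOCKSIZE
 * and is aligned to SPA_MINBLOCKSIZE (512); a 12K buffer is a multiple of
 * PAGESIZE and gets PAGESIZE alignment; a 6K buffer is neither, so p2
 * collapses to 4K and the cache is aligned to p2 >> 2 = 1K -- the
 * quarter-power-of-2 rule.  Sizes matching none of the rules get no
 * dedicated cache at all.
 */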

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();

	if (zio_trim_ksp != NULL) {
		kstat_delete(zio_trim_ksp);
		zio_trim_ksp = NULL;
	}
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP|flags));
}
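
/*
 * Index arithmetic example (illustrative, not from the original source):
 * with SPA_MINBLOCKSHIFT == 9, a 4K request maps to c = (4096 - 1) >> 9 = 7,
 * the cache serving (7 + 1) << 9 = 4K buffers, and a 4.5K request maps to
 * c = 8.  Sizes that received no dedicated cache in zio_init() still work
 * because the back-fill loop there pointed their slots at the next larger
 * cache (this path is only taken when zio_use_uma is set).
 */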

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump.  (Thus reducing
 * the amount of kernel heap dumped to disk when the kernel panics.)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_data_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}
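
/*
 * Sketch of how a transform pair is used (illustrative): the read path in
 * zio_read_bp_init() below pushes a decompression transform roughly as
 *
 *	void *cbuf = zio_buf_alloc(psize);
 *	zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
 *
 * so the device read lands in cbuf, and zio_pop_transforms() later invokes
 * zio_decompress() to inflate cbuf back into the caller's original buffer
 * before freeing cbuf (zt_bufsize != 0).
 */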

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 *        continue calling these functions until they return NULL.
 *        Otherwise, the next caller will pick up the list walk in
 *        some indeterminate state.  (Otherwise every caller would
 *        have to pass in a cookie to keep the state represented by
 *        io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);
	if (--*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}
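
/*
 * Example of the interlock in action (illustrative): a logical write with
 * three vdev children in flight has
 * io_children[ZIO_CHILD_VDEV][ZIO_WAIT_DONE] == 3.  When the parent reaches
 * a stage that must wait, zio_wait_for_children() backs io_stage up one bit
 * and parks the zio by pointing io_stall at that counter; the last child's
 * zio_notify_parent() drops the count to zero, clears io_stall, and calls
 * zio_execute() so the parent resumes exactly where it stalled.
 */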

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_prop = *zp;

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    enum zio_flag flags, zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync(), keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}
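
/*
 * Typical usage sketch (illustrative, not taken from this file) -- callers
 * build a tree under a root zio and either block on it or fire and forget;
 * done_cb, cb_arg, buf and zb stand in for caller state:
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	zio_nowait(zio_read(rio, spa, bp, buf, BP_GET_PSIZE(bp), done_cb,
 *	    cb_arg, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &zb));
 *	error = zio_wait(rio);
 */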

void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
		    BP_GET_PSIZE(bp), 0)));
	}
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    uint64_t size, enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	dprintf_bp(bp, "freeing in txg %llu, pass %u",
	    (longlong_t)txg, spa->spa_sync_pass);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	zio = zio_create(pio, spa, txg, bp, NULL, size,
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
		    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    offset, size, done, private, priority, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_eck) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, int priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

zio_t *
zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
{

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	return (zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size,
	    NULL, NULL, ZIO_PRIORITY_TRIM,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}
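
/*
 * Usage note (illustrative): because zio_ioctl() fans out across interior
 * vdevs, a single zio_flush(pio, vd) queues DKIOCFLUSHWRITECACHE to every
 * leaf below vd; callers (e.g. the ZIL) later zio_wait() on pio.  The flag
 * combination makes the flush advisory -- ZIO_FLAG_CANFAIL |
 * ZIO_FLAG_DONT_PROPAGATE means a device that cannot flush does not fail
 * the parent.
 */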

void
zio_shrink(zio_t *zio, uint64_t size)
{
	ASSERT(zio->io_executor == NULL);
	ASSERT(zio->io_orig_size == zio->io_size);
	ASSERT(size <= zio->io_size);

	/*
	 * We don't shrink for raidz because of problems with the
	 * reconstruction when reading back less than the block size.
	 * Note, BP_IS_RAIDZ() assumes no compression.
	 */
	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	if (!BP_IS_RAIDZ(zio->io_bp))
		zio->io_orig_size = zio->io_size = size;
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else {
			ASSERT(psize < lsize);
			zio_push_transform(zio, cbuf, psize, lsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}
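
/*
 * Worked example with the default tunables (illustrative): in sync pass 1 a
 * rewritten block is recompressed and reallocated; from pass
 * zfs_sync_pass_rewrite (2) onward a same-size rewrite keeps the existing
 * DVAs and sets ZIO_FLAG_IO_REWRITE; and from pass
 * zfs_sync_pass_dont_compress (5) onward compression is skipped, so block
 * sizes -- and hence allocations -- stop changing and spa_sync() can
 * converge.
 */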

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);

	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1] != NULL)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
#ifdef _KERNEL
	(void) taskq_dispatch_safe(spa->spa_zio_taskq[t][q],
	    (task_func_t *)zio_execute, zio, flags, &zio->io_task);
#else
	(void) taskq_dispatch(spa->spa_zio_taskq[t][q],
	    (task_func_t *)zio_execute, zio, flags);
#endif
}

static boolean_t
zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++)
		if (taskq_member(spa->spa_zio_taskq[t][q], executor))
			return (B_TRUE);

	return (B_FALSE);
}

static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 *
 *	(1) the I/O completes
 *	(2) the pipeline stalls waiting for dependent child I/Os
 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
 *	(4) the I/O is delegated by vdev-level caching or aggregation
 *	(5) the I/O is deferred due to vdev-level queueing
 *	(6) the I/O is handed off to another thread.
 *
 * In all cases, the pipeline stops whenever there's no CPU work; it never
 * burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[highbit(stage) - 1](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}
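
/*
 * Stage-advance example (illustrative): io_stage is a one-hot bit and
 * io_pipeline is a mask of the stages this zio will visit, so the do/while
 * above shifts the bit left until it lands on the next stage present in the
 * pipeline -- a pipeline of OPEN|READY|DONE steps straight from OPEN to
 * READY, skipping everything in between.  The handler for a stage lives at
 * zio_pipeline[highbit(stage) - 1].
 */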

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_root_zio "Godfather" I/O which
		 * will ensure it completes prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root, zio);
	}

	zio_execute(zio);
}

/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_flags |= ZIO_FLAG_REEXECUTED;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */

static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	if (gn != NULL)
		return (pio);

	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
	    &pio->io_bookmark));
}

zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here.  The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
		 * (Presently, nothing actually uses interior data checksums;
		 * this is just good hygiene.)
		 */
		if (gn != pio->io_gang_leader->io_gang_tree) {
			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
			    data, BP_GET_PSIZE(bp));
		}
		/*
		 * If we are here to damage data for testing purposes,
		 * leave the GBH alone so that we can detect the damage.
		 */
		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
	} else {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
	}

	return (zio);
}

/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
	    ZIO_GANG_CHILD_FLAGS(pio)));
}

/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}

static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}

static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	if (gn == gio->io_gang_tree && gio->io_data != NULL)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}

static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	zio_t *gio = zio->io_gang_leader;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}

static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;
	uint64_t lsize;
	int copies = gio->io_prop.zp_copies;
	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int error;

	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
	if (error) {
		pio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (pio == gio) {
		gnpp = &gio->io_gang_tree;
	} else {
		gnpp = pio->io_private;
		ASSERT(pio->io_ready == zio_write_gang_member_ready);
	}

	gn = zio_gang_node_alloc(gnpp);
	gbh = gn->gn_gbh;
	bzero(gbh, SPA_GANGBLOCKSIZE);

	/*
	 * Create the gang header.
	 */
	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

	/*
	 * Create and nowait the gang children.
	 */
	for (int g = 0; resid != 0; resid -= lsize, g++) {
		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
		    SPA_MINBLOCKSIZE);
		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

		zp.zp_checksum = gio->io_prop.zp_checksum;
		zp.zp_compress = ZIO_COMPRESS_OFF;
		zp.zp_type = DMU_OT_NONE;
		zp.zp_level = 0;
		zp.zp_copies = gio->io_prop.zp_copies;
		zp.zp_dedup = B_FALSE;
		zp.zp_dedup_verify = B_FALSE;
		zp.zp_nopwrite = B_FALSE;

		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
		    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
		    &pio->io_bookmark));
	}

	/*
	 * Set pio's pipeline to just wait for zio to finish.
	 */
	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	zio_nowait(zio);

	return (ZIO_PIPELINE_CONTINUE);
}
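
/*
 * Worked example of the member sizing above (illustrative): for a 128K
 * write split across SPA_GBH_NBLKPTRS == 3 members, g = 0 gets
 * P2ROUNDUP(131072 / 3, 512) = 44032 bytes, leaving 87040; g = 1 gets
 * 87040 / 2 = 43520; g = 2 takes the remaining 43520.  The members sum
 * exactly to the original 128K, each a multiple of SPA_MINBLOCKSIZE.
 */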
static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;
	uint64_t lsize;
	int copies = gio->io_prop.zp_copies;
	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int error;

	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
	if (error) {
		pio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (pio == gio) {
		gnpp = &gio->io_gang_tree;
	} else {
		gnpp = pio->io_private;
		ASSERT(pio->io_ready == zio_write_gang_member_ready);
	}

	gn = zio_gang_node_alloc(gnpp);
	gbh = gn->gn_gbh;
	bzero(gbh, SPA_GANGBLOCKSIZE);

	/*
	 * Create the gang header.
	 */
	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

	/*
	 * Create and nowait the gang children.
	 */
	for (int g = 0; resid != 0; resid -= lsize, g++) {
		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
		    SPA_MINBLOCKSIZE);
		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

		zp.zp_checksum = gio->io_prop.zp_checksum;
		zp.zp_compress = ZIO_COMPRESS_OFF;
		zp.zp_type = DMU_OT_NONE;
		zp.zp_level = 0;
		zp.zp_copies = gio->io_prop.zp_copies;
		zp.zp_dedup = B_FALSE;
		zp.zp_dedup_verify = B_FALSE;
		zp.zp_nopwrite = B_FALSE;

		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
		    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
		    &pio->io_bookmark));
	}

	/*
	 * Set pio's pipeline to just wait for zio to finish.
	 */
	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	zio_nowait(zio);

	return (ZIO_PIPELINE_CONTINUE);
}
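/*
 * Illustration of the splitting loop above, assuming
 * SPA_GBH_NBLKPTRS == 3 and a 128K (131072-byte) parent write:
 *
 *	g=0: lsize = P2ROUNDUP(131072 / 3, 512) = 44032, resid -> 87040
 *	g=1: lsize = P2ROUNDUP(87040 / 2, 512)  = 43520, resid -> 43520
 *	g=2: lsize = P2ROUNDUP(43520 / 1, 512)  = 43520, resid -> 0
 *
 * The children sum to the original size and each one is a multiple
 * of SPA_MINBLOCKSIZE.  (Illustrative figures only.)
 */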
/*
 * The zio_nop_write stage in the pipeline determines if allocating
 * a new bp is necessary.  By leveraging a cryptographically secure checksum,
 * such as SHA256, we can compare the checksums of the new data and the old
 * to determine if allocating a new block is required.  The nopwrite
 * feature can handle writes in either syncing or open context (i.e. zil
 * writes) and as a result is mutually exclusive with dedup.
 */
static int
zio_nop_write(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	zio_prop_t *zp = &zio->io_prop;

	ASSERT(BP_GET_LEVEL(bp) == 0);
	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
	ASSERT(zp->zp_nopwrite);
	ASSERT(!zp->zp_dedup);
	ASSERT(zio->io_bp_override == NULL);
	ASSERT(IO_IS_ALLOCATING(zio));

	/*
	 * Check to see if the original bp and the new bp have matching
	 * characteristics (i.e. same checksum, compression algorithms, etc).
	 * If they don't then just continue with the pipeline which will
	 * allocate a new bp.
	 */
	if (BP_IS_HOLE(bp_orig) ||
	    !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
		return (ZIO_PIPELINE_CONTINUE);

	/*
	 * If the checksums match then reset the pipeline so that we
	 * avoid allocating a new bp and issuing any I/O.
	 */
	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
		ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
		    sizeof (uint64_t)) == 0);

		*bp = *bp_orig;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
		zio->io_flags |= ZIO_FLAG_NOPWRITE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}
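/*
 * Example: rewriting a block with identical contents on a dataset
 * using compression and a dedup-class checksum (e.g. sha256) yields
 * a bp whose checksum matches bp_orig, so the stanza above copies
 * bp_orig back and the write completes without allocating a block or
 * touching any device.
 */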
/*
 * ==========================================================================
 * Dedup
 * ==========================================================================
 */
static void
zio_ddt_child_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp;
	zio_t *pio = zio_unique_parent(zio);

	mutex_enter(&pio->io_lock);
	ddp = ddt_phys_select(dde, bp);
	if (zio->io_error == 0)
		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
		dde->dde_repair_data = zio->io_data;
	else
		zio_buf_free(zio->io_data, zio->io_size);
	mutex_exit(&pio->io_lock);
}

static int
zio_ddt_read_start(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
		ddt_phys_t *ddp = dde->dde_phys;
		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
		blkptr_t blk;

		ASSERT(zio->io_vsd == NULL);
		zio->io_vsd = dde;

		if (ddp_self == NULL)
			return (ZIO_PIPELINE_CONTINUE);

		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
				continue;
			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
			    &blk);
			zio_nowait(zio_read(zio, zio->io_spa, &blk,
			    zio_buf_alloc(zio->io_size), zio->io_size,
			    zio_ddt_child_read_done, dde, zio->io_priority,
			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
			    &zio->io_bookmark));
		}
		return (ZIO_PIPELINE_CONTINUE);
	}

	zio_nowait(zio_read(zio, zio->io_spa, bp,
	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_ddt_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = zio->io_vsd;
		if (ddt == NULL) {
			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
			return (ZIO_PIPELINE_CONTINUE);
		}
		if (dde == NULL) {
			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
			return (ZIO_PIPELINE_STOP);
		}
		if (dde->dde_repair_data != NULL) {
			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
			zio->io_child_error[ZIO_CHILD_DDT] = 0;
		}
		ddt_repair_done(ddt, dde);
		zio->io_vsd = NULL;
	}

	ASSERT(zio->io_vsd == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}
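/*
 * Taken together, zio_ddt_read_start() and zio_ddt_read_done()
 * implement self-healing for dedup'd blocks: if the primary copy
 * fails its checksum, every other phys variant of the same DDT entry
 * is read, the first good copy is stashed in dde_repair_data and
 * copied into the caller's buffer, and ddt_repair_done() arranges
 * for the damaged variants to be rewritten.
 */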
static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
	spa_t *spa = zio->io_spa;

	/*
	 * Note: we compare the original data, not the transformed data,
	 * because when zio->io_bp is an override bp, we will not have
	 * pushed the I/O transforms.  That's an important optimization
	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
	 */
	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		zio_t *lio = dde->dde_lead_zio[p];

		if (lio != NULL) {
			return (lio->io_orig_size != zio->io_orig_size ||
			    bcmp(zio->io_orig_data, lio->io_orig_data,
			    zio->io_orig_size) != 0);
		}
	}

	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		ddt_phys_t *ddp = &dde->dde_phys[p];

		if (ddp->ddp_phys_birth != 0) {
			arc_buf_t *abuf = NULL;
			uint32_t aflags = ARC_WAIT;
			blkptr_t blk = *zio->io_bp;
			int error;

			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);

			ddt_exit(ddt);

			error = arc_read(NULL, spa, &blk,
			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zio->io_bookmark);

			if (error == 0) {
				if (arc_buf_size(abuf) != zio->io_orig_size ||
				    bcmp(abuf->b_data, zio->io_orig_data,
				    zio->io_orig_size) != 0)
					error = SET_ERROR(EEXIST);
				VERIFY(arc_buf_remove_ref(abuf, &abuf));
			}

			ddt_enter(ddt);
			return (error != 0);
		}
	}

	return (B_FALSE);
}
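/*
 * The collision test above is the "verify" half of dedup=verify:
 * a checksum match alone is not trusted.  The new data is compared
 * byte-for-byte, either against an in-flight lead zio for the same
 * entry or against a copy read back through the ARC, and any
 * mismatch is reported as a collision.
 */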
static void
zio_ddt_child_write_ready(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	zio_t *pio;

	if (zio->io_error)
		return;

	ddt_enter(ddt);

	ASSERT(dde->dde_lead_zio[p] == zio);

	ddt_phys_fill(ddp, zio->io_bp);

	while ((pio = zio_walk_parents(zio)) != NULL)
		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);

	ddt_exit(ddt);
}

static void
zio_ddt_child_write_done(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];

	ddt_enter(ddt);

	ASSERT(ddp->ddp_refcnt == 0);
	ASSERT(dde->dde_lead_zio[p] == zio);
	dde->dde_lead_zio[p] = NULL;

	if (zio->io_error == 0) {
		while (zio_walk_parents(zio) != NULL)
			ddt_phys_addref(ddp);
	} else {
		ddt_phys_clear(ddp);
	}

	ddt_exit(ddt);
}

static void
zio_ddt_ditto_write_done(zio_t *zio)
{
	int p = DDT_PHYS_DITTO;
	zio_prop_t *zp = &zio->io_prop;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(zio->io_spa, bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	ddt_key_t *ddk = &dde->dde_key;

	ddt_enter(ddt);

	ASSERT(ddp->ddp_refcnt == 0);
	ASSERT(dde->dde_lead_zio[p] == zio);
	dde->dde_lead_zio[p] = NULL;

	if (zio->io_error == 0) {
		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
		if (ddp->ddp_phys_birth != 0)
			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
		ddt_phys_fill(ddp, bp);
	}

	ddt_exit(ddt);
}

static int
zio_ddt_write(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	uint64_t txg = zio->io_txg;
	zio_prop_t *zp = &zio->io_prop;
	int p = zp->zp_copies;
	int ditto_copies;
	zio_t *cio = NULL;
	zio_t *dio = NULL;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);

	ddt_enter(ddt);
	dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = &dde->dde_phys[p];

	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
		/*
		 * If we're using a weak checksum, upgrade to a strong checksum
		 * and try again.  If we're already using a strong checksum,
		 * we can't resolve it, so just convert to an ordinary write.
		 * (And automatically e-mail a paper to Nature?)
		 */
		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
			zp->zp_checksum = spa_dedup_checksum(spa);
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			BP_ZERO(bp);
		} else {
			zp->zp_dedup = B_FALSE;
		}
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
		ddt_exit(ddt);
		return (ZIO_PIPELINE_CONTINUE);
	}

	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
	ASSERT(ditto_copies < SPA_DVAS_PER_BP);

	if (ditto_copies > ddt_ditto_copies_present(dde) &&
	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
		zio_prop_t czp = *zp;

		czp.zp_copies = ditto_copies;

		/*
		 * If we arrived here with an override bp, we won't have run
		 * the transform stack, so we won't have the data we need to
		 * generate a child i/o.  So, toss the override bp and restart.
		 * This is safe, because using the override bp is just an
		 * optimization; and it's rare, so the cost doesn't matter.
		 */
		if (zio->io_bp_override) {
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			zio->io_pipeline = ZIO_WRITE_PIPELINE;
			zio->io_bp_override = NULL;
			BP_ZERO(bp);
			ddt_exit(ddt);
			return (ZIO_PIPELINE_CONTINUE);
		}

		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, &czp, NULL,
		    zio_ddt_ditto_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
	}

	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
		if (ddp->ddp_phys_birth != 0)
			ddt_bp_fill(ddp, bp, txg);
		if (dde->dde_lead_zio[p] != NULL)
			zio_add_child(zio, dde->dde_lead_zio[p]);
		else
			ddt_phys_addref(ddp);
	} else if (zio->io_bp_override) {
		ASSERT(bp->blk_birth == txg);
		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
		ddt_phys_fill(ddp, bp);
		ddt_phys_addref(ddp);
	} else {
		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, zp, zio_ddt_child_write_ready,
		    zio_ddt_child_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[p] = cio;
	}

	ddt_exit(ddt);

	if (cio)
		zio_nowait(cio);
	if (dio)
		zio_nowait(dio);

	return (ZIO_PIPELINE_CONTINUE);
}
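/*
 * In brief, zio_ddt_write() handles three cases: on a verify
 * collision it falls back to an ordinary write (first upgrading a
 * weak checksum and restarting, when possible); if the entry is
 * already on disk or being written, it takes a reference or attaches
 * to the in-flight lead zio; otherwise it becomes the lead zio and
 * issues the physical write itself.  A ditto child may be issued in
 * parallel when the entry's refcount calls for extra redundancy.
 */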
ddt_entry_t *freedde;			/* for debugging */

static int
zio_ddt_free(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	ddt_enter(ddt);
	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = ddt_phys_select(dde, bp);
	ddt_phys_decref(ddp);
	ddt_exit(ddt);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */
static int
zio_dva_allocate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	metaslab_class_t *mc = spa_normal_class(spa);
	blkptr_t *bp = zio->io_bp;
	int error;
	int flags = 0;

	if (zio->io_gang_leader == NULL) {
		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
		zio->io_gang_leader = zio;
	}

	ASSERT(BP_IS_HOLE(bp));
	ASSERT0(BP_GET_NDVAS(bp));
	ASSERT3U(zio->io_prop.zp_copies, >, 0);
	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	/*
	 * The dump device does not support gang blocks so allocation on
	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
	 * the "fast" gang feature.
	 */
	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
	    METASLAB_GANG_CHILD : 0;
	error = metaslab_alloc(spa, mc, zio->io_size, bp,
	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);

	if (error) {
		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
		    error);
		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
			return (zio_write_gang_block(zio));
		zio->io_error = error;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_dva_free(zio_t *zio)
{
	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_dva_claim(zio_t *zio)
{
	int error;

	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
	if (error)
		zio->io_error = error;

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Undo an allocation.  This is used by zio_done() when an I/O fails
 * and we want to give back the block we just allocated.
 * This handles both normal blocks and gang blocks.
 */
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
	ASSERT(zio->io_bp_override == NULL);

	if (!BP_IS_HOLE(bp))
		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);

	if (gn != NULL) {
		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			zio_dva_unallocate(zio, gn->gn_child[g],
			    &gn->gn_gbh->zg_blkptr[g]);
		}
	}
}

/*
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 */
int
zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
    uint64_t size, boolean_t use_slog)
{
	int error = 1;

	ASSERT(txg > spa_syncing_txg(spa));

	/*
	 * ZIL blocks are always contiguous (i.e. not gang blocks) so we
	 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
	 * when allocating them.
	 */
	if (use_slog) {
		error = metaslab_alloc(spa, spa_log_class(spa), size,
		    new_bp, 1, txg, old_bp,
		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
	}

	if (error) {
		error = metaslab_alloc(spa, spa_normal_class(spa), size,
		    new_bp, 1, txg, old_bp,
		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
	}

	if (error == 0) {
		BP_SET_LSIZE(new_bp, size);
		BP_SET_PSIZE(new_bp, size);
		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
		BP_SET_CHECKSUM(new_bp,
		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
		    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(new_bp, 0);
		BP_SET_DEDUP(new_bp, 0);
		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
	}

	return (error);
}
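/*
 * Note that ZIL allocation prefers the log class (dedicated slog
 * devices) when use_slog is set and quietly falls back to the normal
 * class; the caller sees an error only when both attempts fail.
 */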
/*
 * Free an intent log block.
 */
void
zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
{
	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
	ASSERT(!BP_IS_GANG(bp));

	zio_free(spa, txg, bp);
}

/*
 * ==========================================================================
 * Read, write and delete to physical devices
 * ==========================================================================
 */
static int
zio_vdev_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	uint64_t align;
	spa_t *spa = zio->io_spa;

	ASSERT(zio->io_error == 0);
	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);

	if (vd == NULL) {
		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);

		/*
		 * The mirror_ops handle multiple DVAs in a single BP.
		 */
		return (vdev_mirror_ops.vdev_op_io_start(zio));
	}

	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) {
		trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * We keep track of time-sensitive I/Os so that the scan thread
	 * can quickly react to certain workloads.  In particular, we care
	 * about non-scrubbing, top-level reads and writes with the following
	 * characteristics:
	 *	- synchronous writes of user data to non-slog devices
	 *	- any reads of user data
	 * When these conditions are met, adjust the timestamp of spa_last_io
	 * which allows the scan thread to adjust its workload accordingly.
	 */
	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
	    vd == vd->vdev_top && !vd->vdev_islog &&
	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
	    zio->io_txg != spa_syncing_txg(spa)) {
		uint64_t old = spa->spa_last_io;
		uint64_t new = ddi_get_lbolt64();
		if (old != new)
			(void) atomic_cas_64(&spa->spa_last_io, old, new);
	}

	align = 1ULL << vd->vdev_top->vdev_ashift;

	if (P2PHASE(zio->io_size, align) != 0) {
		uint64_t asize = P2ROUNDUP(zio->io_size, align);
		char *abuf = NULL;
		if (zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_WRITE)
			abuf = zio_buf_alloc(asize);
		ASSERT(vd == vd->vdev_top);
		if (zio->io_type == ZIO_TYPE_WRITE) {
			bcopy(zio->io_data, abuf, zio->io_size);
			bzero(abuf + zio->io_size, asize - zio->io_size);
		}
		zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
		    zio_subblock);
	}

	ASSERT(P2PHASE(zio->io_offset, align) == 0);
	ASSERT(P2PHASE(zio->io_size, align) == 0);
	VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));

	/*
	 * If this is a repair I/O, and there's no self-healing involved --
	 * that is, we're just resilvering what we expect to resilver --
	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
	 * This prevents spurious resilvering with nested replication.
	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
	 * A is out of date, we'll read from C+D, then use the data to
	 * resilver A+B -- but we don't actually want to resilver B, just A.
	 * The top-level mirror has no way to know this, so instead we just
	 * discard unnecessary repairs as we work our way down the vdev tree.
	 * The same logic applies to any form of nested replication:
	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
	 */
	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
	    zio->io_txg != 0 &&	/* not a delegated i/o */
	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
		zio_vdev_io_bypass(zio);
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (vd->vdev_ops->vdev_op_leaf &&
	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {

		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
			return (ZIO_PIPELINE_CONTINUE);

		if ((zio = vdev_queue_io(zio)) == NULL)
			return (ZIO_PIPELINE_STOP);

		if (!vdev_accessible(vd, zio)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return (ZIO_PIPELINE_STOP);
		}
	}

	/*
	 * Note that we ignore repair writes for TRIM because they can conflict
	 * with normal writes.  This isn't an issue because, by definition, we
	 * only repair blocks that aren't freed.
	 */
	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE &&
	    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
		if (!trim_map_write_start(zio))
			return (ZIO_PIPELINE_STOP);
	}

	return (vd->vdev_ops->vdev_op_io_start(zio));
}
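/*
 * Example of the alignment shim above: on a top-level vdev with
 * ashift 12 (4K sectors), a 2K write is bounced into a 4K scratch
 * buffer and zero-padded past io_size; reads use the same oversized
 * buffer and zio_subblock() copies the wanted range back out.
 * (Illustrative sizes.)
 */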
static int
zio_vdev_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
	boolean_t unexpected_error = B_FALSE;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {

		if (zio->io_type == ZIO_TYPE_WRITE &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
			trim_map_write_done(zio);

		vdev_queue_io_done(zio);

		if (zio->io_type == ZIO_TYPE_WRITE)
			vdev_cache_write(zio);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_device_injection(vd,
			    zio, EIO);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_label_injection(zio, EIO);

		if (zio->io_error) {
			if (!vdev_accessible(vd, zio)) {
				zio->io_error = SET_ERROR(ENXIO);
			} else {
				unexpected_error = B_TRUE;
			}
		}
	}

	ops->vdev_op_io_done(zio);

	if (unexpected_error)
		VERIFY(vdev_probe(vd, zio) == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}
/*
 * For non-raidz ZIOs, we can just copy aside the bad data read from the
 * disk, and use that to finish the checksum ereport later.
 */
static void
zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
    const void *good_buf)
{
	/* no processing needed */
	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
}

/*ARGSUSED*/
void
zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
{
	void *buf = zio_buf_alloc(zio->io_size);

	bcopy(zio->io_data, buf, zio->io_size);

	zcr->zcr_cbinfo = zio->io_size;
	zcr->zcr_cbdata = buf;
	zcr->zcr_finish = zio_vsd_default_cksum_finish;
	zcr->zcr_free = zio_buf_free;
}

static int
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
		spa_config_exit(zio->io_spa, SCL_ZIO, zio);

	if (zio->io_vsd != NULL) {
		zio->io_vsd_ops->vsd_free(zio);
		zio->io_vsd = NULL;
	}

	if (zio_injection_enabled && zio->io_error == 0)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM)
		switch (zio->io_error) {
		case 0:
			ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
			ZIO_TRIM_STAT_BUMP(success);
			break;
		case EOPNOTSUPP:
			ZIO_TRIM_STAT_BUMP(unsupported);
			break;
		default:
			ZIO_TRIM_STAT_BUMP(failed);
			break;
		}

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 *
	 * On retry, we cut in line in the issue queue, since we don't want
	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
	 */
	if (zio->io_error && vd == NULL &&
	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
		zio->io_error = 0;
		zio->io_flags |= ZIO_FLAG_IO_RETRY |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
		    zio_requeue_io_start_cut_in_line);
		return (ZIO_PIPELINE_STOP);
	}

	/*
	 * If we got an error on a leaf device, convert it to ENXIO
	 * if the device is not accessible at all.
	 */
	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    !vdev_accessible(vd, zio))
		zio->io_error = SET_ERROR(ENXIO);

	/*
	 * If we can't write to an interior vdev (mirror or RAID-Z),
	 * set vdev_cant_write so that we stop trying to allocate from it.
	 */
	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
		vd->vdev_cant_write = B_TRUE;
	}

	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}
void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage >>= 1;
}

void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage >>= 1;
}

void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
}

/*
 * ==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */
static int
zio_checksum_generate(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum;

	if (bp == NULL) {
		/*
		 * This is zio_write_phys().
		 * We're either generating a label checksum, or none at all.
		 */
		checksum = zio->io_prop.zp_checksum;

		if (checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
	} else {
		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
			ASSERT(!IO_IS_ALLOCATING(zio));
			checksum = ZIO_CHECKSUM_GANG_HEADER;
		} else {
			checksum = BP_GET_CHECKSUM(bp);
		}
	}

	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t info;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(zio->io_vd != NULL);

	if (bp == NULL) {
		/*
		 * This is zio_read_phys().
		 * We're either verifying a label checksum, or nothing at all.
		 */
		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
	}

	if ((error = zio_checksum_error(zio, &info)) != 0) {
		zio->io_error = error;
		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			zfs_ereport_start_checksum(zio->io_spa,
			    zio->io_vd, zio, zio->io_offset,
			    zio->io_size, NULL, &info);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}
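/*
 * Checksum ereports are two-phase: zio_checksum_verify() starts a
 * report while the bad (still-transformed) data is at hand, and
 * zio_done() finishes it later, once a good copy of the data -- if
 * one was found -- is available for comparison.
 */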
/*
 * ==========================================================================
 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
 * indicate errors that are specific to one I/O, and most likely permanent.
 * Any other error is presumed to be worse because we weren't expecting it.
 * ==========================================================================
 */
int
zio_worst_error(int e1, int e2)
{
	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
	int r1, r2;

	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
		if (e1 == zio_error_rank[r1])
			break;

	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
		if (e2 == zio_error_rank[r2])
			break;

	return (r1 > r2 ? e1 : e2);
}
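/*
 * Example: zio_worst_error(ENXIO, ECKSUM) returns ECKSUM, and an
 * errno outside the table (say EINVAL) outranks all four, since an
 * unexpected error is presumed to be the most serious.
 */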
/*
 * ==========================================================================
 * I/O completion
 * ==========================================================================
 */
static int
zio_ready(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	zio_t *pio, *pio_next;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (zio->io_ready) {
		ASSERT(IO_IS_ALLOCATING(zio));
		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);

		zio->io_ready(zio);
	}

	if (bp != NULL && bp != &zio->io_bp_copy)
		zio->io_bp_copy = *bp;

	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_READY] = 1;
	pio = zio_walk_parents(zio);
	mutex_exit(&zio->io_lock);

	/*
	 * As we notify zio's parents, new parents could be added.
	 * New parents go to the head of zio's io_parent_list, however,
	 * so we will (correctly) not notify them.  The remainder of zio's
	 * io_parent_list, from 'pio_next' onward, cannot change because
	 * all parents must wait for us to be done before they can be done.
	 */
	for (; pio != NULL; pio = pio_next) {
		pio_next = zio_walk_parents(zio);
		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
	}

	if (zio->io_flags & ZIO_FLAG_NODATA) {
		if (BP_IS_GANG(bp)) {
			zio->io_flags &= ~ZIO_FLAG_NODATA;
		} else {
			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		}
	}

	if (zio_injection_enabled &&
	    zio->io_spa->spa_syncing_txg == zio->io_txg)
		zio_handle_ignored_writes(zio);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_t *lio = zio->io_logical;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;
	uint64_t psize = zio->io_size;
	zio_t *pio, *pio_next;

	/*
	 * If our children haven't all completed,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			ASSERT(zio->io_children[c][w] == 0);

	if (bp != NULL) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
		    (bp == zio_unique_parent(zio)->io_bp));
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    zio->io_bp_override == NULL &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
			ASSERT(BP_COUNT_GANG(bp) == 0 ||
			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
		}
		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
			VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
	}

	/*
	 * If there were child vdev/gang/ddt errors, they apply to us now.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);

	/*
	 * If the I/O on the transformed data was successful, generate any
	 * checksum reports now while we still have the transformed data.
	 */
	if (zio->io_error == 0) {
		while (zio->io_cksum_report != NULL) {
			zio_cksum_report_t *zcr = zio->io_cksum_report;
			uint64_t align = zcr->zcr_align;
			uint64_t asize = P2ROUNDUP(psize, align);
			char *abuf = zio->io_data;

			if (asize != psize) {
				abuf = zio_buf_alloc(asize);
				bcopy(zio->io_data, abuf, psize);
				bzero(abuf + psize, asize - psize);
			}

			zio->io_cksum_report = zcr->zcr_next;
			zcr->zcr_next = NULL;
			zcr->zcr_finish(zcr, abuf);
			zfs_ereport_free_checksum(zcr);

			if (asize != psize)
				zio_buf_free(abuf, asize);
		}
	}

	zio_pop_transforms(zio);	/* note: may set zio->io_error */

	vdev_stat_update(zio, psize);

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

		if ((zio->io_error == EIO || !(zio->io_flags &
		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
		    zio == lio) {
			/*
			 * For logical I/O requests, tell the SPA to log the
			 * error and generate a logical data ereport.
			 */
			spa_log_error(spa, zio);
			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
			    0, 0);
		}
	}
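	/*
	 * Reexecution policy, in brief: a mandatory (!ZIO_FLAG_CANFAIL)
	 * allocating write retries immediately unless it failed with
	 * ENOSPC, which suspends the pool instead, since retrying right
	 * away would fail the same way; ENXIO on a read or free of a
	 * loaded pool also suspends unless failmode=continue; any other
	 * mandatory failure falls back to suspending as well.
	 */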
	if (zio->io_error && zio == lio) {
		/*
		 * Determine whether zio should be reexecuted.  This will
		 * propagate all the way to the root via zio_notify_parent().
		 */
		ASSERT(vd == NULL && bp != NULL);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		if (IO_IS_ALLOCATING(zio) &&
		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
			if (zio->io_error != ENOSPC)
				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
			else
				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
		}

		if ((zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_FREE) &&
		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
		    zio->io_error == ENXIO &&
		    spa_load_state(spa) == SPA_LOAD_NONE &&
		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		/*
		 * Here is a possibly good place to attempt to do
		 * either combinatorial reconstruction or error correction
		 * based on checksums.  It also might be a good place
		 * to send out preliminary ereports before we suspend
		 * processing.
		 */
	}

	/*
	 * If there were logical child errors, they apply to us now.
	 * We defer this until now to avoid conflating logical child
	 * errors with errors that happened to the zio itself when
	 * updating vdev stats and reporting FMA events above.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

	if ((zio->io_error || zio->io_reexecute) &&
	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
		zio_dva_unallocate(zio, zio->io_gang_tree, bp);

	zio_gang_tree_free(&zio->io_gang_tree);

	/*
	 * Godfather I/Os should never suspend.
	 */
	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
		zio->io_reexecute = 0;

	if (zio->io_reexecute) {
		/*
		 * This is a logical I/O that wants to reexecute.
		 *
		 * Reexecute is top-down.  When an i/o fails, if it's not
		 * the root, it simply notifies its parent and sticks around.
		 * The parent, seeing that it still has children in zio_done(),
		 * does the same.  This percolates all the way up to the root.
		 * The root i/o will reexecute or suspend the entire tree.
		 *
		 * This approach ensures that zio_reexecute() honors
		 * all the original i/o dependency relationships, e.g.
		 * parents not executing until children are ready.
		 */
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		zio->io_gang_leader = NULL;

		mutex_enter(&zio->io_lock);
		zio->io_state[ZIO_WAIT_DONE] = 1;
		mutex_exit(&zio->io_lock);

		/*
		 * "The Godfather" I/O monitors its children but is
		 * not a true parent to them.  It will track them through
		 * the pipeline but severs its ties whenever they get into
		 * trouble (e.g. suspended).  This allows "The Godfather"
		 * I/O to return status without blocking.
		 */
		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
			zio_link_t *zl = zio->io_walk_link;
			pio_next = zio_walk_parents(zio);

			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
				zio_remove_child(pio, zio, zl);
				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
			}
		}
		if ((pio = zio_unique_parent(zio)) != NULL) {
			/*
			 * We're not a root i/o, so there's nothing to do
			 * but notify our parent.  Don't propagate errors
			 * upward since we haven't permanently failed yet.
			 */
			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
			/*
			 * We'd fail again if we reexecuted now, so suspend
			 * until conditions improve (e.g. device comes online).
			 */
			zio_suspend(spa, zio);
		} else {
			/*
			 * Reexecution is potentially a huge amount of work.
			 * Hand it off to the otherwise-unused claim taskq.
			 */
#ifdef _KERNEL
			(void) taskq_dispatch_safe(
			    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
			    (task_func_t *)zio_reexecute, zio, TQ_SLEEP,
			    &zio->io_task);
#else
			(void) taskq_dispatch(
			    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
			    (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
#endif
		}
		return (ZIO_PIPELINE_STOP);
	}

	ASSERT(zio->io_child_count == 0);
	ASSERT(zio->io_reexecute == 0);
	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));

	/*
	 * Report any checksum errors, since the I/O is complete.
	 */
	while (zio->io_cksum_report != NULL) {
		zio_cksum_report_t *zcr = zio->io_cksum_report;
		zio->io_cksum_report = zcr->zcr_next;
		zcr->zcr_next = NULL;
		zcr->zcr_finish(zcr, NULL);
		zfs_ereport_free_checksum(zcr);
	}

	/*
	 * It is the responsibility of the done callback to ensure that this
	 * particular zio is no longer discoverable for adoption, and as
	 * such, cannot acquire any new parents.
	 */
	if (zio->io_done)
		zio->io_done(zio);

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_DONE] = 1;
	mutex_exit(&zio->io_lock);

	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
		zio_link_t *zl = zio->io_walk_link;
		pio_next = zio_walk_parents(zio);
		zio_remove_child(pio, zio, zl);
		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
	}

	if (zio->io_waiter != NULL) {
		mutex_enter(&zio->io_lock);
		zio->io_executor = NULL;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		zio_destroy(zio);
	}

	return (ZIO_PIPELINE_STOP);
}

/*
 * ==========================================================================
 * I/O pipeline definition
 * ==========================================================================
 */
static zio_pipe_stage_t *zio_pipeline[] = {
	NULL,
	zio_read_bp_init,
	zio_free_bp_init,
	zio_issue_async,
	zio_write_bp_init,
	zio_checksum_generate,
	zio_nop_write,
	zio_ddt_read_start,
	zio_ddt_read_done,
	zio_ddt_write,
	zio_ddt_free,
	zio_gang_assemble,
	zio_gang_issue,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_checksum_verify,
	zio_done
};
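/*
 * The order of this table must match the order of the ZIO_STAGE_*
 * bits: zio_execute() indexes it by the position of the next stage
 * bit set in io_pipeline, so adding or reordering stages requires
 * updating both in lockstep.
 */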
/* dnp is the dnode for zb1->zb_object */
boolean_t
zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
    const zbookmark_t *zb2)
{
	uint64_t zb1nextL0, zb2thisobj;

	ASSERT(zb1->zb_objset == zb2->zb_objset);
	ASSERT(zb2->zb_level == 0);

	/*
	 * A bookmark in the deadlist is considered to be after
	 * everything else.
	 */
	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
		return (B_TRUE);

	/* The objset_phys_t isn't before anything. */
	if (dnp == NULL)
		return (B_FALSE);

	zb1nextL0 = (zb1->zb_blkid + 1) <<
	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));

	zb2thisobj = zb2->zb_object ? zb2->zb_object :
	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);

	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t nextobj = zb1nextL0 *
		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
		return (nextobj <= zb2thisobj);
	}

	if (zb1->zb_object < zb2thisobj)
		return (B_TRUE);
	if (zb1->zb_object > zb2thisobj)
		return (B_FALSE);
	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
		return (B_FALSE);
	return (zb1nextL0 <= zb2->zb_blkid);
}
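/*
 * Worked example of the arithmetic above: with 16K indirect blocks
 * (dn_indblkshift == 14, i.e. 128 blkptrs per indirect block), a
 * level-1 bookmark at blkid B covers level-0 blkids [B*128, (B+1)*128),
 * so zb1nextL0 = (B + 1) << (1 * (14 - SPA_BLKPTRSHIFT)) = (B + 1) * 128.
 * (Illustrative parameters.)
 */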