/* zio.c -- ZFS I/O pipeline (FreeBSD, revision 260742) */
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013 by Delphix. All rights reserved. 24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 
25 */ 26 27#include <sys/zfs_context.h> 28#include <sys/fm/fs/zfs.h> 29#include <sys/spa.h> 30#include <sys/txg.h> 31#include <sys/spa_impl.h> 32#include <sys/vdev_impl.h> 33#include <sys/zio_impl.h> 34#include <sys/zio_compress.h> 35#include <sys/zio_checksum.h> 36#include <sys/dmu_objset.h> 37#include <sys/arc.h> 38#include <sys/ddt.h> 39#include <sys/trim_map.h> 40 41SYSCTL_DECL(_vfs_zfs); 42SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); 43#if defined(__amd64__) 44static int zio_use_uma = 1; 45#else 46static int zio_use_uma = 0; 47#endif 48TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma); 49SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, 50 "Use uma(9) for ZIO allocations"); 51static int zio_exclude_metadata = 0; 52TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata); 53SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, 54 "Exclude metadata buffers from dumps as well"); 55 56zio_trim_stats_t zio_trim_stats = { 57 { "bytes", KSTAT_DATA_UINT64, 58 "Number of bytes successfully TRIMmed" }, 59 { "success", KSTAT_DATA_UINT64, 60 "Number of successful TRIM requests" }, 61 { "unsupported", KSTAT_DATA_UINT64, 62 "Number of TRIM requests that failed because TRIM is not supported" }, 63 { "failed", KSTAT_DATA_UINT64, 64 "Number of TRIM requests that failed for reasons other than not supported" }, 65}; 66 67static kstat_t *zio_trim_ksp; 68 69/* 70 * ========================================================================== 71 * I/O priority table 72 * ========================================================================== 73 */ 74uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 75 0, /* ZIO_PRIORITY_NOW */ 76 0, /* ZIO_PRIORITY_SYNC_READ */ 77 0, /* ZIO_PRIORITY_SYNC_WRITE */ 78 0, /* ZIO_PRIORITY_LOG_WRITE */ 79 1, /* ZIO_PRIORITY_CACHE_FILL */ 80 1, /* ZIO_PRIORITY_AGG */ 81 4, /* ZIO_PRIORITY_FREE */ 82 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 83 6, /* 
ZIO_PRIORITY_ASYNC_READ */ 84 10, /* ZIO_PRIORITY_RESILVER */ 85 20, /* ZIO_PRIORITY_SCRUB */ 86 2, /* ZIO_PRIORITY_DDT_PREFETCH */ 87 30, /* ZIO_PRIORITY_TRIM */ 88}; 89 90/* 91 * ========================================================================== 92 * I/O type descriptions 93 * ========================================================================== 94 */ 95char *zio_type_name[ZIO_TYPES] = { 96 "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", 97 "zio_ioctl" 98}; 99 100/* 101 * ========================================================================== 102 * I/O kmem caches 103 * ========================================================================== 104 */ 105kmem_cache_t *zio_cache; 106kmem_cache_t *zio_link_cache; 107kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 108kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 109 110#ifdef _KERNEL 111extern vmem_t *zio_alloc_arena; 112#endif 113extern int zfs_mg_alloc_failures; 114 115/* 116 * The following actions directly effect the spa's sync-to-convergence logic. 117 * The values below define the sync pass when we start performing the action. 118 * Care should be taken when changing these values as they directly impact 119 * spa_sync() performance. Tuning these values may introduce subtle performance 120 * pathologies and should only be done in the context of performance analysis. 121 * These tunables will eventually be removed and replaced with #defines once 122 * enough analysis has been done to determine optimal values. 123 * 124 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that 125 * regular blocks are not deferred. 
126 */ 127int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ 128TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free); 129SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, 130 &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); 131int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ 132TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress); 133SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, 134 &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); 135int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ 136TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite); 137SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, 138 &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); 139 140/* 141 * An allocating zio is one that either currently has the DVA allocate 142 * stage set or will have it later in its lifetime. 143 */ 144#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 145 146boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; 147 148#ifdef ZFS_DEBUG 149int zio_buf_debug_limit = 16384; 150#else 151int zio_buf_debug_limit = 0; 152#endif 153 154void 155zio_init(void) 156{ 157 size_t c; 158 zio_cache = kmem_cache_create("zio_cache", 159 sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 160 zio_link_cache = kmem_cache_create("zio_link_cache", 161 sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 162 if (!zio_use_uma) 163 goto out; 164 165 /* 166 * For small buffers, we want a cache for each multiple of 167 * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 168 * for each quarter-power of 2. For large buffers, we want 169 * a cache for each multiple of PAGESIZE. 
170 */ 171 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 172 size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 173 size_t p2 = size; 174 size_t align = 0; 175 size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; 176 177 while (p2 & (p2 - 1)) 178 p2 &= p2 - 1; 179 180#ifdef illumos 181#ifndef _KERNEL 182 /* 183 * If we are using watchpoints, put each buffer on its own page, 184 * to eliminate the performance overhead of trapping to the 185 * kernel when modifying a non-watched buffer that shares the 186 * page with a watched buffer. 187 */ 188 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) 189 continue; 190#endif 191#endif /* illumos */ 192 if (size <= 4 * SPA_MINBLOCKSIZE) { 193 align = SPA_MINBLOCKSIZE; 194 } else if (IS_P2ALIGNED(size, PAGESIZE)) { 195 align = PAGESIZE; 196 } else if (IS_P2ALIGNED(size, p2 >> 2)) { 197 align = p2 >> 2; 198 } 199 200 if (align != 0) { 201 char name[36]; 202 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 203 zio_buf_cache[c] = kmem_cache_create(name, size, 204 align, NULL, NULL, NULL, NULL, NULL, cflags); 205 206 /* 207 * Since zio_data bufs do not appear in crash dumps, we 208 * pass KMC_NOTOUCH so that no allocator metadata is 209 * stored with the buffers. 210 */ 211 (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 212 zio_data_buf_cache[c] = kmem_cache_create(name, size, 213 align, NULL, NULL, NULL, NULL, NULL, 214 cflags | KMC_NOTOUCH | KMC_NODEBUG); 215 } 216 } 217 218 while (--c != 0) { 219 ASSERT(zio_buf_cache[c] != NULL); 220 if (zio_buf_cache[c - 1] == NULL) 221 zio_buf_cache[c - 1] = zio_buf_cache[c]; 222 223 ASSERT(zio_data_buf_cache[c] != NULL); 224 if (zio_data_buf_cache[c - 1] == NULL) 225 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 226 } 227out: 228 229 /* 230 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs 231 * to fail 3 times per txg or 8 failures, whichever is greater. 
232 */ 233 if (zfs_mg_alloc_failures == 0) 234 zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8); 235 else if (zfs_mg_alloc_failures < 8) 236 zfs_mg_alloc_failures = 8; 237 238 zio_inject_init(); 239 240 zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", 241 KSTAT_TYPE_NAMED, 242 sizeof(zio_trim_stats) / sizeof(kstat_named_t), 243 KSTAT_FLAG_VIRTUAL); 244 245 if (zio_trim_ksp != NULL) { 246 zio_trim_ksp->ks_data = &zio_trim_stats; 247 kstat_install(zio_trim_ksp); 248 } 249} 250 251void 252zio_fini(void) 253{ 254 size_t c; 255 kmem_cache_t *last_cache = NULL; 256 kmem_cache_t *last_data_cache = NULL; 257 258 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 259 if (zio_buf_cache[c] != last_cache) { 260 last_cache = zio_buf_cache[c]; 261 kmem_cache_destroy(zio_buf_cache[c]); 262 } 263 zio_buf_cache[c] = NULL; 264 265 if (zio_data_buf_cache[c] != last_data_cache) { 266 last_data_cache = zio_data_buf_cache[c]; 267 kmem_cache_destroy(zio_data_buf_cache[c]); 268 } 269 zio_data_buf_cache[c] = NULL; 270 } 271 272 kmem_cache_destroy(zio_link_cache); 273 kmem_cache_destroy(zio_cache); 274 275 zio_inject_fini(); 276 277 if (zio_trim_ksp != NULL) { 278 kstat_delete(zio_trim_ksp); 279 zio_trim_ksp = NULL; 280 } 281} 282 283/* 284 * ========================================================================== 285 * Allocate and free I/O buffers 286 * ========================================================================== 287 */ 288 289/* 290 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 291 * crashdump if the kernel panics, so use it judiciously. Obviously, it's 292 * useful to inspect ZFS metadata, but if possible, we should avoid keeping 293 * excess / transient data in-core during a crashdump. 294 */ 295void * 296zio_buf_alloc(size_t size) 297{ 298 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 299 int flags = zio_exclude_metadata ? 
KM_NODEBUG : 0; 300 301 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 302 303 if (zio_use_uma) 304 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 305 else 306 return (kmem_alloc(size, KM_SLEEP|flags)); 307} 308 309/* 310 * Use zio_data_buf_alloc to allocate data. The data will not appear in a 311 * crashdump if the kernel panics. This exists so that we will limit the amount 312 * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 313 * of kernel heap dumped to disk when the kernel panics) 314 */ 315void * 316zio_data_buf_alloc(size_t size) 317{ 318 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 319 320 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 321 322 if (zio_use_uma) 323 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 324 else 325 return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); 326} 327 328void 329zio_buf_free(void *buf, size_t size) 330{ 331 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 332 333 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 334 335 if (zio_use_uma) 336 kmem_cache_free(zio_buf_cache[c], buf); 337 else 338 kmem_free(buf, size); 339} 340 341void 342zio_data_buf_free(void *buf, size_t size) 343{ 344 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 345 346 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 347 348 if (zio_use_uma) 349 kmem_cache_free(zio_data_buf_cache[c], buf); 350 else 351 kmem_free(buf, size); 352} 353 354/* 355 * ========================================================================== 356 * Push and pop I/O transform buffers 357 * ========================================================================== 358 */ 359static void 360zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 361 zio_transform_func_t *transform) 362{ 363 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 364 365 zt->zt_orig_data = zio->io_data; 366 zt->zt_orig_size = zio->io_size; 367 zt->zt_bufsize = bufsize; 368 zt->zt_transform = transform; 369 370 
zt->zt_next = zio->io_transform_stack; 371 zio->io_transform_stack = zt; 372 373 zio->io_data = data; 374 zio->io_size = size; 375} 376 377static void 378zio_pop_transforms(zio_t *zio) 379{ 380 zio_transform_t *zt; 381 382 while ((zt = zio->io_transform_stack) != NULL) { 383 if (zt->zt_transform != NULL) 384 zt->zt_transform(zio, 385 zt->zt_orig_data, zt->zt_orig_size); 386 387 if (zt->zt_bufsize != 0) 388 zio_buf_free(zio->io_data, zt->zt_bufsize); 389 390 zio->io_data = zt->zt_orig_data; 391 zio->io_size = zt->zt_orig_size; 392 zio->io_transform_stack = zt->zt_next; 393 394 kmem_free(zt, sizeof (zio_transform_t)); 395 } 396} 397 398/* 399 * ========================================================================== 400 * I/O transform callbacks for subblocks and decompression 401 * ========================================================================== 402 */ 403static void 404zio_subblock(zio_t *zio, void *data, uint64_t size) 405{ 406 ASSERT(zio->io_size > size); 407 408 if (zio->io_type == ZIO_TYPE_READ) 409 bcopy(zio->io_data, data, size); 410} 411 412static void 413zio_decompress(zio_t *zio, void *data, uint64_t size) 414{ 415 if (zio->io_error == 0 && 416 zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 417 zio->io_data, data, zio->io_size, size) != 0) 418 zio->io_error = SET_ERROR(EIO); 419} 420 421/* 422 * ========================================================================== 423 * I/O parent/child relationships and pipeline interlocks 424 * ========================================================================== 425 */ 426/* 427 * NOTE - Callers to zio_walk_parents() and zio_walk_children must 428 * continue calling these functions until they return NULL. 429 * Otherwise, the next caller will pick up the list walk in 430 * some indeterminate state. (Otherwise every caller would 431 * have to pass in a cookie to keep the state represented by 432 * io_walk_link, which gets annoying.) 
433 */ 434zio_t * 435zio_walk_parents(zio_t *cio) 436{ 437 zio_link_t *zl = cio->io_walk_link; 438 list_t *pl = &cio->io_parent_list; 439 440 zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); 441 cio->io_walk_link = zl; 442 443 if (zl == NULL) 444 return (NULL); 445 446 ASSERT(zl->zl_child == cio); 447 return (zl->zl_parent); 448} 449 450zio_t * 451zio_walk_children(zio_t *pio) 452{ 453 zio_link_t *zl = pio->io_walk_link; 454 list_t *cl = &pio->io_child_list; 455 456 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); 457 pio->io_walk_link = zl; 458 459 if (zl == NULL) 460 return (NULL); 461 462 ASSERT(zl->zl_parent == pio); 463 return (zl->zl_child); 464} 465 466zio_t * 467zio_unique_parent(zio_t *cio) 468{ 469 zio_t *pio = zio_walk_parents(cio); 470 471 VERIFY(zio_walk_parents(cio) == NULL); 472 return (pio); 473} 474 475void 476zio_add_child(zio_t *pio, zio_t *cio) 477{ 478 zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 479 480 /* 481 * Logical I/Os can have logical, gang, or vdev children. 482 * Gang I/Os can have gang or vdev children. 483 * Vdev I/Os can only have vdev children. 484 * The following ASSERT captures all of these constraints. 
485 */ 486 ASSERT(cio->io_child_type <= pio->io_child_type); 487 488 zl->zl_parent = pio; 489 zl->zl_child = cio; 490 491 mutex_enter(&cio->io_lock); 492 mutex_enter(&pio->io_lock); 493 494 ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 495 496 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 497 pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 498 499 list_insert_head(&pio->io_child_list, zl); 500 list_insert_head(&cio->io_parent_list, zl); 501 502 pio->io_child_count++; 503 cio->io_parent_count++; 504 505 mutex_exit(&pio->io_lock); 506 mutex_exit(&cio->io_lock); 507} 508 509static void 510zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) 511{ 512 ASSERT(zl->zl_parent == pio); 513 ASSERT(zl->zl_child == cio); 514 515 mutex_enter(&cio->io_lock); 516 mutex_enter(&pio->io_lock); 517 518 list_remove(&pio->io_child_list, zl); 519 list_remove(&cio->io_parent_list, zl); 520 521 pio->io_child_count--; 522 cio->io_parent_count--; 523 524 mutex_exit(&pio->io_lock); 525 mutex_exit(&cio->io_lock); 526 527 kmem_cache_free(zio_link_cache, zl); 528} 529 530static boolean_t 531zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) 532{ 533 uint64_t *countp = &zio->io_children[child][wait]; 534 boolean_t waiting = B_FALSE; 535 536 mutex_enter(&zio->io_lock); 537 ASSERT(zio->io_stall == NULL); 538 if (*countp != 0) { 539 zio->io_stage >>= 1; 540 zio->io_stall = countp; 541 waiting = B_TRUE; 542 } 543 mutex_exit(&zio->io_lock); 544 545 return (waiting); 546} 547 548static void 549zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 550{ 551 uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 552 int *errorp = &pio->io_child_error[zio->io_child_type]; 553 554 mutex_enter(&pio->io_lock); 555 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 556 *errorp = zio_worst_error(*errorp, zio->io_error); 557 pio->io_reexecute |= zio->io_reexecute; 558 ASSERT3U(*countp, >, 0); 559 if (--*countp == 0 && pio->io_stall == 
countp) { 560 pio->io_stall = NULL; 561 mutex_exit(&pio->io_lock); 562 zio_execute(pio); 563 } else { 564 mutex_exit(&pio->io_lock); 565 } 566} 567 568static void 569zio_inherit_child_errors(zio_t *zio, enum zio_child c) 570{ 571 if (zio->io_child_error[c] != 0 && zio->io_error == 0) 572 zio->io_error = zio->io_child_error[c]; 573} 574 575/* 576 * ========================================================================== 577 * Create the various types of I/O (read, write, free, etc) 578 * ========================================================================== 579 */ 580static zio_t * 581zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 582 void *data, uint64_t size, zio_done_func_t *done, void *private, 583 zio_type_t type, int priority, enum zio_flag flags, 584 vdev_t *vd, uint64_t offset, const zbookmark_t *zb, 585 enum zio_stage stage, enum zio_stage pipeline) 586{ 587 zio_t *zio; 588 589 ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE); 590 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 591 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 592 593 ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 594 ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 595 ASSERT(vd || stage == ZIO_STAGE_OPEN); 596 597 zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 598 bzero(zio, sizeof (zio_t)); 599 600 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 601 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 602 603 list_create(&zio->io_parent_list, sizeof (zio_link_t), 604 offsetof(zio_link_t, zl_parent_node)); 605 list_create(&zio->io_child_list, sizeof (zio_link_t), 606 offsetof(zio_link_t, zl_child_node)); 607 608 if (vd != NULL) 609 zio->io_child_type = ZIO_CHILD_VDEV; 610 else if (flags & ZIO_FLAG_GANG_CHILD) 611 zio->io_child_type = ZIO_CHILD_GANG; 612 else if (flags & ZIO_FLAG_DDT_CHILD) 613 zio->io_child_type = ZIO_CHILD_DDT; 614 else 615 zio->io_child_type = ZIO_CHILD_LOGICAL; 616 617 if (bp != NULL) { 618 zio->io_bp = (blkptr_t 
*)bp; 619 zio->io_bp_copy = *bp; 620 zio->io_bp_orig = *bp; 621 if (type != ZIO_TYPE_WRITE || 622 zio->io_child_type == ZIO_CHILD_DDT) 623 zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 624 if (zio->io_child_type == ZIO_CHILD_LOGICAL) 625 zio->io_logical = zio; 626 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 627 pipeline |= ZIO_GANG_STAGES; 628 } 629 630 zio->io_spa = spa; 631 zio->io_txg = txg; 632 zio->io_done = done; 633 zio->io_private = private; 634 zio->io_type = type; 635 zio->io_priority = priority; 636 zio->io_vd = vd; 637 zio->io_offset = offset; 638 zio->io_orig_data = zio->io_data = data; 639 zio->io_orig_size = zio->io_size = size; 640 zio->io_orig_flags = zio->io_flags = flags; 641 zio->io_orig_stage = zio->io_stage = stage; 642 zio->io_orig_pipeline = zio->io_pipeline = pipeline; 643 644 zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 645 zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 646 647 if (zb != NULL) 648 zio->io_bookmark = *zb; 649 650 if (pio != NULL) { 651 if (zio->io_logical == NULL) 652 zio->io_logical = pio->io_logical; 653 if (zio->io_child_type == ZIO_CHILD_GANG) 654 zio->io_gang_leader = pio->io_gang_leader; 655 zio_add_child(pio, zio); 656 } 657 658 return (zio); 659} 660 661static void 662zio_destroy(zio_t *zio) 663{ 664 list_destroy(&zio->io_parent_list); 665 list_destroy(&zio->io_child_list); 666 mutex_destroy(&zio->io_lock); 667 cv_destroy(&zio->io_cv); 668 kmem_cache_free(zio_cache, zio); 669} 670 671zio_t * 672zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, 673 void *private, enum zio_flag flags) 674{ 675 zio_t *zio; 676 677 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 678 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 679 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 680 681 return (zio); 682} 683 684zio_t * 685zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) 686{ 687 return (zio_null(NULL, spa, NULL, done, 
private, flags)); 688} 689 690zio_t * 691zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 692 void *data, uint64_t size, zio_done_func_t *done, void *private, 693 int priority, enum zio_flag flags, const zbookmark_t *zb) 694{ 695 zio_t *zio; 696 697 zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 698 data, size, done, private, 699 ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 700 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 701 ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 702 703 return (zio); 704} 705 706zio_t * 707zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 708 void *data, uint64_t size, const zio_prop_t *zp, 709 zio_done_func_t *ready, zio_done_func_t *done, void *private, 710 int priority, enum zio_flag flags, const zbookmark_t *zb) 711{ 712 zio_t *zio; 713 714 ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 715 zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 716 zp->zp_compress >= ZIO_COMPRESS_OFF && 717 zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 718 DMU_OT_IS_VALID(zp->zp_type) && 719 zp->zp_level < 32 && 720 zp->zp_copies > 0 && 721 zp->zp_copies <= spa_max_replication(spa)); 722 723 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 724 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 725 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
726 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 727 728 zio->io_ready = ready; 729 zio->io_prop = *zp; 730 731 return (zio); 732} 733 734zio_t * 735zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, 736 uint64_t size, zio_done_func_t *done, void *private, int priority, 737 enum zio_flag flags, zbookmark_t *zb) 738{ 739 zio_t *zio; 740 741 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 742 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 743 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 744 745 return (zio); 746} 747 748void 749zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) 750{ 751 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 752 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 753 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 754 ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 755 756 /* 757 * We must reset the io_prop to match the values that existed 758 * when the bp was first written by dmu_sync() keeping in mind 759 * that nopwrite and dedup are mutually exclusive. 760 */ 761 zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; 762 zio->io_prop.zp_nopwrite = nopwrite; 763 zio->io_prop.zp_copies = copies; 764 zio->io_bp_override = bp; 765} 766 767void 768zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) 769{ 770 metaslab_check_free(spa, bp); 771 772 /* 773 * Frees that are for the currently-syncing txg, are not going to be 774 * deferred, and which will not need to do a read (i.e. not GANG or 775 * DEDUP), can be processed immediately. Otherwise, put them on the 776 * in-memory list for later processing. 
777 */ 778 if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || 779 txg != spa->spa_syncing_txg || 780 spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { 781 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); 782 } else { 783 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 784 BP_GET_PSIZE(bp), 0))); 785 } 786} 787 788zio_t * 789zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 790 uint64_t size, enum zio_flag flags) 791{ 792 zio_t *zio; 793 enum zio_stage stage = ZIO_FREE_PIPELINE; 794 795 dprintf_bp(bp, "freeing in txg %llu, pass %u", 796 (longlong_t)txg, spa->spa_sync_pass); 797 798 ASSERT(!BP_IS_HOLE(bp)); 799 ASSERT(spa_syncing_txg(spa) == txg); 800 ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); 801 802 metaslab_check_free(spa, bp); 803 arc_freed(spa, bp); 804 805 if (zfs_trim_enabled) 806 stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | 807 ZIO_STAGE_VDEV_IO_ASSESS; 808 /* 809 * GANG and DEDUP blocks can induce a read (for the gang block header, 810 * or the DDT), so issue them asynchronously so that this thread is 811 * not tied up. 812 */ 813 else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) 814 stage |= ZIO_STAGE_ISSUE_ASYNC; 815 816 zio = zio_create(pio, spa, txg, bp, NULL, size, 817 NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, 818 NULL, 0, NULL, ZIO_STAGE_OPEN, stage); 819 820 return (zio); 821} 822 823zio_t * 824zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 825 zio_done_func_t *done, void *private, enum zio_flag flags) 826{ 827 zio_t *zio; 828 829 /* 830 * A claim is an allocation of a specific block. Claims are needed 831 * to support immediate writes in the intent log. The issue is that 832 * immediate writes contain committed data, but in a txg that was 833 * *not* committed. Upon opening the pool after an unclean shutdown, 834 * the intent log claims all blocks that contain immediate write data 835 * so that the SPA knows they're in use. 
836 * 837 * All claims *must* be resolved in the first txg -- before the SPA 838 * starts allocating blocks -- so that nothing is allocated twice. 839 * If txg == 0 we just verify that the block is claimable. 840 */ 841 ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 842 ASSERT(txg == spa_first_txg(spa) || txg == 0); 843 ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ 844 845 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 846 done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 847 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 848 849 return (zio); 850} 851 852zio_t * 853zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, 854 uint64_t size, zio_done_func_t *done, void *private, int priority, 855 enum zio_flag flags) 856{ 857 zio_t *zio; 858 int c; 859 860 if (vd->vdev_children == 0) { 861 zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private, 862 ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL, 863 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 864 865 zio->io_cmd = cmd; 866 } else { 867 zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 868 869 for (c = 0; c < vd->vdev_children; c++) 870 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 871 offset, size, done, private, priority, flags)); 872 } 873 874 return (zio); 875} 876 877zio_t * 878zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 879 void *data, int checksum, zio_done_func_t *done, void *private, 880 int priority, enum zio_flag flags, boolean_t labels) 881{ 882 zio_t *zio; 883 884 ASSERT(vd->vdev_children == 0); 885 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 886 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 887 ASSERT3U(offset + size, <=, vd->vdev_psize); 888 889 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 890 ZIO_TYPE_READ, priority, flags, vd, offset, NULL, 891 ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 892 893 zio->io_prop.zp_checksum = checksum; 894 
895 return (zio); 896} 897 898zio_t * 899zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 900 void *data, int checksum, zio_done_func_t *done, void *private, 901 int priority, enum zio_flag flags, boolean_t labels) 902{ 903 zio_t *zio; 904 905 ASSERT(vd->vdev_children == 0); 906 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 907 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 908 ASSERT3U(offset + size, <=, vd->vdev_psize); 909 910 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 911 ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, 912 ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 913 914 zio->io_prop.zp_checksum = checksum; 915 916 if (zio_checksum_table[checksum].ci_eck) { 917 /* 918 * zec checksums are necessarily destructive -- they modify 919 * the end of the write buffer to hold the verifier/checksum. 920 * Therefore, we must make a local copy in case the data is 921 * being written to multiple places in parallel. 922 */ 923 void *wbuf = zio_buf_alloc(size); 924 bcopy(data, wbuf, size); 925 zio_push_transform(zio, wbuf, size, size, NULL); 926 } 927 928 return (zio); 929} 930 931/* 932 * Create a child I/O to do some work for us. 933 */ 934zio_t * 935zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 936 void *data, uint64_t size, int type, int priority, enum zio_flag flags, 937 zio_done_func_t *done, void *private) 938{ 939 enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 940 zio_t *zio; 941 942 ASSERT(vd->vdev_parent == 943 (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); 944 945 if (type == ZIO_TYPE_READ && bp != NULL) { 946 /* 947 * If we have the bp, then the child should perform the 948 * checksum and the parent need not. This pushes error 949 * detection as close to the leaves as possible and 950 * eliminates redundant checksums in the interior nodes. 
951 */ 952 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 953 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 954 } 955 956 if (vd->vdev_children == 0) 957 offset += VDEV_LABEL_START_SIZE; 958 959 flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; 960 961 /* 962 * If we've decided to do a repair, the write is not speculative -- 963 * even if the original read was. 964 */ 965 if (flags & ZIO_FLAG_IO_REPAIR) 966 flags &= ~ZIO_FLAG_SPECULATIVE; 967 968 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, 969 done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 970 ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 971 972 return (zio); 973} 974 975zio_t * 976zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, 977 int type, int priority, enum zio_flag flags, 978 zio_done_func_t *done, void *private) 979{ 980 zio_t *zio; 981 982 ASSERT(vd->vdev_ops->vdev_op_leaf); 983 984 zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 985 data, size, done, private, type, priority, 986 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, 987 vd, offset, NULL, 988 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 989 990 return (zio); 991} 992 993void 994zio_flush(zio_t *zio, vdev_t *vd) 995{ 996 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, 997 NULL, NULL, ZIO_PRIORITY_NOW, 998 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 999} 1000 1001zio_t * 1002zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) 1003{ 1004 1005 ASSERT(vd->vdev_ops->vdev_op_leaf); 1006 1007 return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size, 1008 NULL, NULL, ZIO_PRIORITY_TRIM, 1009 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); 1010} 1011 1012void 1013zio_shrink(zio_t *zio, uint64_t size) 1014{ 1015 ASSERT(zio->io_executor == NULL); 1016 ASSERT(zio->io_orig_size == zio->io_size); 1017 ASSERT(size <= zio->io_size); 1018 1019 /* 1020 * We don't shrink for raidz because of problems 
with the 1021 * reconstruction when reading back less than the block size. 1022 * Note, BP_IS_RAIDZ() assumes no compression. 1023 */ 1024 ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 1025 if (!BP_IS_RAIDZ(zio->io_bp)) 1026 zio->io_orig_size = zio->io_size = size; 1027} 1028 1029/* 1030 * ========================================================================== 1031 * Prepare to read and write logical blocks 1032 * ========================================================================== 1033 */ 1034 1035static int 1036zio_read_bp_init(zio_t *zio) 1037{ 1038 blkptr_t *bp = zio->io_bp; 1039 1040 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 1041 zio->io_child_type == ZIO_CHILD_LOGICAL && 1042 !(zio->io_flags & ZIO_FLAG_RAW)) { 1043 uint64_t psize = BP_GET_PSIZE(bp); 1044 void *cbuf = zio_buf_alloc(psize); 1045 1046 zio_push_transform(zio, cbuf, psize, psize, zio_decompress); 1047 } 1048 1049 if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) 1050 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1051 1052 if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 1053 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1054 1055 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 1056 zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 1057 1058 return (ZIO_PIPELINE_CONTINUE); 1059} 1060 1061static int 1062zio_write_bp_init(zio_t *zio) 1063{ 1064 spa_t *spa = zio->io_spa; 1065 zio_prop_t *zp = &zio->io_prop; 1066 enum zio_compress compress = zp->zp_compress; 1067 blkptr_t *bp = zio->io_bp; 1068 uint64_t lsize = zio->io_size; 1069 uint64_t psize = lsize; 1070 int pass = 1; 1071 1072 /* 1073 * If our children haven't all reached the ready stage, 1074 * wait for them and then repeat this pipeline stage. 
1075 */ 1076 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 1077 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) 1078 return (ZIO_PIPELINE_STOP); 1079 1080 if (!IO_IS_ALLOCATING(zio)) 1081 return (ZIO_PIPELINE_CONTINUE); 1082 1083 ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 1084 1085 if (zio->io_bp_override) { 1086 ASSERT(bp->blk_birth != zio->io_txg); 1087 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 1088 1089 *bp = *zio->io_bp_override; 1090 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1091 1092 /* 1093 * If we've been overridden and nopwrite is set then 1094 * set the flag accordingly to indicate that a nopwrite 1095 * has already occurred. 1096 */ 1097 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { 1098 ASSERT(!zp->zp_dedup); 1099 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1100 return (ZIO_PIPELINE_CONTINUE); 1101 } 1102 1103 ASSERT(!zp->zp_nopwrite); 1104 1105 if (BP_IS_HOLE(bp) || !zp->zp_dedup) 1106 return (ZIO_PIPELINE_CONTINUE); 1107 1108 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || 1109 zp->zp_dedup_verify); 1110 1111 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 1112 BP_SET_DEDUP(bp, 1); 1113 zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; 1114 return (ZIO_PIPELINE_CONTINUE); 1115 } 1116 zio->io_bp_override = NULL; 1117 BP_ZERO(bp); 1118 } 1119 1120 if (bp->blk_birth == zio->io_txg) { 1121 /* 1122 * We're rewriting an existing block, which means we're 1123 * working on behalf of spa_sync(). For spa_sync() to 1124 * converge, it must eventually be the case that we don't 1125 * have to allocate new blocks. But compression changes 1126 * the blocksize, which forces a reallocate, and makes 1127 * convergence take longer. Therefore, after the first 1128 * few passes, stop compressing to ensure convergence. 
1129 */ 1130 pass = spa_sync_pass(spa); 1131 1132 ASSERT(zio->io_txg == spa_syncing_txg(spa)); 1133 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1134 ASSERT(!BP_GET_DEDUP(bp)); 1135 1136 if (pass >= zfs_sync_pass_dont_compress) 1137 compress = ZIO_COMPRESS_OFF; 1138 1139 /* Make sure someone doesn't change their mind on overwrites */ 1140 ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp), 1141 spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 1142 } 1143 1144 if (compress != ZIO_COMPRESS_OFF) { 1145 metaslab_class_t *mc = spa_normal_class(spa); 1146 void *cbuf = zio_buf_alloc(lsize); 1147 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize, 1148 (size_t)metaslab_class_get_minblocksize(mc)); 1149 if (psize == 0 || psize == lsize) { 1150 compress = ZIO_COMPRESS_OFF; 1151 zio_buf_free(cbuf, lsize); 1152 } else { 1153 ASSERT(psize < lsize); 1154 zio_push_transform(zio, cbuf, psize, lsize, NULL); 1155 } 1156 } 1157 1158 /* 1159 * The final pass of spa_sync() must be all rewrites, but the first 1160 * few passes offer a trade-off: allocating blocks defers convergence, 1161 * but newly allocated blocks are sequential, so they can be written 1162 * to disk faster. Therefore, we allow the first few passes of 1163 * spa_sync() to allocate new blocks, but force rewrites after that. 1164 * There should only be a handful of blocks after pass 1 in any case. 
1165 */ 1166 if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && 1167 pass >= zfs_sync_pass_rewrite) { 1168 ASSERT(psize != 0); 1169 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 1170 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; 1171 zio->io_flags |= ZIO_FLAG_IO_REWRITE; 1172 } else { 1173 BP_ZERO(bp); 1174 zio->io_pipeline = ZIO_WRITE_PIPELINE; 1175 } 1176 1177 if (psize == 0) { 1178 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1179 } else { 1180 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 1181 BP_SET_LSIZE(bp, lsize); 1182 BP_SET_PSIZE(bp, psize); 1183 BP_SET_COMPRESS(bp, compress); 1184 BP_SET_CHECKSUM(bp, zp->zp_checksum); 1185 BP_SET_TYPE(bp, zp->zp_type); 1186 BP_SET_LEVEL(bp, zp->zp_level); 1187 BP_SET_DEDUP(bp, zp->zp_dedup); 1188 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1189 if (zp->zp_dedup) { 1190 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1191 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1192 zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 1193 } 1194 if (zp->zp_nopwrite) { 1195 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1196 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1197 zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; 1198 } 1199 } 1200 1201 return (ZIO_PIPELINE_CONTINUE); 1202} 1203 1204static int 1205zio_free_bp_init(zio_t *zio) 1206{ 1207 blkptr_t *bp = zio->io_bp; 1208 1209 if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 1210 if (BP_GET_DEDUP(bp)) 1211 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 1212 } 1213 1214 return (ZIO_PIPELINE_CONTINUE); 1215} 1216 1217/* 1218 * ========================================================================== 1219 * Execute the I/O pipeline 1220 * ========================================================================== 1221 */ 1222 1223static void 1224zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) 1225{ 1226 spa_t *spa = zio->io_spa; 1227 zio_type_t t = zio->io_type; 1228 int flags = (cutinline ? 
TQ_FRONT : 0); 1229 1230 ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT); 1231 1232 /* 1233 * If we're a config writer or a probe, the normal issue and 1234 * interrupt threads may all be blocked waiting for the config lock. 1235 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 1236 */ 1237 if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 1238 t = ZIO_TYPE_NULL; 1239 1240 /* 1241 * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 1242 */ 1243 if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 1244 t = ZIO_TYPE_NULL; 1245 1246 /* 1247 * If this is a high priority I/O, then use the high priority taskq. 1248 */ 1249 if (zio->io_priority == ZIO_PRIORITY_NOW && 1250 spa->spa_zio_taskq[t][q + 1] != NULL) 1251 q++; 1252 1253 ASSERT3U(q, <, ZIO_TASKQ_TYPES); 1254 1255 /* 1256 * NB: We are assuming that the zio can only be dispatched 1257 * to a single taskq at a time. It would be a grievous error 1258 * to dispatch the zio to another taskq at the same time. 
1259 */ 1260#if defined(illumos) || !defined(_KERNEL) 1261 ASSERT(zio->io_tqent.tqent_next == NULL); 1262#else 1263 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 1264#endif 1265 taskq_dispatch_ent(spa->spa_zio_taskq[t][q], 1266 (task_func_t *)zio_execute, zio, flags, &zio->io_tqent); 1267} 1268 1269static boolean_t 1270zio_taskq_member(zio_t *zio, enum zio_taskq_type q) 1271{ 1272 kthread_t *executor = zio->io_executor; 1273 spa_t *spa = zio->io_spa; 1274 1275 for (zio_type_t t = 0; t < ZIO_TYPES; t++) 1276 if (taskq_member(spa->spa_zio_taskq[t][q], executor)) 1277 return (B_TRUE); 1278 1279 return (B_FALSE); 1280} 1281 1282static int 1283zio_issue_async(zio_t *zio) 1284{ 1285 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1286 1287 return (ZIO_PIPELINE_STOP); 1288} 1289 1290void 1291zio_interrupt(zio_t *zio) 1292{ 1293 zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); 1294} 1295 1296/* 1297 * Execute the I/O pipeline until one of the following occurs: 1298 * 1299 * (1) the I/O completes 1300 * (2) the pipeline stalls waiting for dependent child I/Os 1301 * (3) the I/O issues, so we're waiting for an I/O completion interrupt 1302 * (4) the I/O is delegated by vdev-level caching or aggregation 1303 * (5) the I/O is deferred due to vdev-level queueing 1304 * (6) the I/O is handed off to another thread. 1305 * 1306 * In all cases, the pipeline stops whenever there's no CPU work; it never 1307 * burns a thread in cv_wait(). 1308 * 1309 * There's no locking on io_stage because there's no legitimate way 1310 * for multiple threads to be attempting to process the same I/O. 
1311 */ 1312static zio_pipe_stage_t *zio_pipeline[]; 1313 1314void 1315zio_execute(zio_t *zio) 1316{ 1317 zio->io_executor = curthread; 1318 1319 while (zio->io_stage < ZIO_STAGE_DONE) { 1320 enum zio_stage pipeline = zio->io_pipeline; 1321 enum zio_stage stage = zio->io_stage; 1322 int rv; 1323 1324 ASSERT(!MUTEX_HELD(&zio->io_lock)); 1325 ASSERT(ISP2(stage)); 1326 ASSERT(zio->io_stall == NULL); 1327 1328 do { 1329 stage <<= 1; 1330 } while ((stage & pipeline) == 0); 1331 1332 ASSERT(stage <= ZIO_STAGE_DONE); 1333 1334 /* 1335 * If we are in interrupt context and this pipeline stage 1336 * will grab a config lock that is held across I/O, 1337 * or may wait for an I/O that needs an interrupt thread 1338 * to complete, issue async to avoid deadlock. 1339 * 1340 * For VDEV_IO_START, we cut in line so that the io will 1341 * be sent to disk promptly. 1342 */ 1343 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 1344 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 1345 boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 
1346 zio_requeue_io_start_cut_in_line : B_FALSE; 1347 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 1348 return; 1349 } 1350 1351 zio->io_stage = stage; 1352 rv = zio_pipeline[highbit(stage) - 1](zio); 1353 1354 if (rv == ZIO_PIPELINE_STOP) 1355 return; 1356 1357 ASSERT(rv == ZIO_PIPELINE_CONTINUE); 1358 } 1359} 1360 1361/* 1362 * ========================================================================== 1363 * Initiate I/O, either sync or async 1364 * ========================================================================== 1365 */ 1366int 1367zio_wait(zio_t *zio) 1368{ 1369 int error; 1370 1371 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 1372 ASSERT(zio->io_executor == NULL); 1373 1374 zio->io_waiter = curthread; 1375 1376 zio_execute(zio); 1377 1378 mutex_enter(&zio->io_lock); 1379 while (zio->io_executor != NULL) 1380 cv_wait(&zio->io_cv, &zio->io_lock); 1381 mutex_exit(&zio->io_lock); 1382 1383 error = zio->io_error; 1384 zio_destroy(zio); 1385 1386 return (error); 1387} 1388 1389void 1390zio_nowait(zio_t *zio) 1391{ 1392 ASSERT(zio->io_executor == NULL); 1393 1394 if (zio->io_child_type == ZIO_CHILD_LOGICAL && 1395 zio_unique_parent(zio) == NULL) { 1396 /* 1397 * This is a logical async I/O with no parent to wait for it. 1398 * We add it to the spa_async_root_zio "Godfather" I/O which 1399 * will ensure they complete prior to unloading the pool. 
1400 */ 1401 spa_t *spa = zio->io_spa; 1402 1403 zio_add_child(spa->spa_async_zio_root, zio); 1404 } 1405 1406 zio_execute(zio); 1407} 1408 1409/* 1410 * ========================================================================== 1411 * Reexecute or suspend/resume failed I/O 1412 * ========================================================================== 1413 */ 1414 1415static void 1416zio_reexecute(zio_t *pio) 1417{ 1418 zio_t *cio, *cio_next; 1419 1420 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 1421 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 1422 ASSERT(pio->io_gang_leader == NULL); 1423 ASSERT(pio->io_gang_tree == NULL); 1424 1425 pio->io_flags = pio->io_orig_flags; 1426 pio->io_stage = pio->io_orig_stage; 1427 pio->io_pipeline = pio->io_orig_pipeline; 1428 pio->io_reexecute = 0; 1429 pio->io_flags |= ZIO_FLAG_REEXECUTED; 1430 pio->io_error = 0; 1431 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1432 pio->io_state[w] = 0; 1433 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 1434 pio->io_child_error[c] = 0; 1435 1436 if (IO_IS_ALLOCATING(pio)) 1437 BP_ZERO(pio->io_bp); 1438 1439 /* 1440 * As we reexecute pio's children, new children could be created. 1441 * New children go to the head of pio's io_child_list, however, 1442 * so we will (correctly) not reexecute them. The key is that 1443 * the remainder of pio's io_child_list, from 'cio_next' onward, 1444 * cannot be affected by any side effects of reexecuting 'cio'. 1445 */ 1446 for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { 1447 cio_next = zio_walk_children(pio); 1448 mutex_enter(&pio->io_lock); 1449 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1450 pio->io_children[cio->io_child_type][w]++; 1451 mutex_exit(&pio->io_lock); 1452 zio_reexecute(cio); 1453 } 1454 1455 /* 1456 * Now that all children have been reexecuted, execute the parent. 1457 * We don't reexecute "The Godfather" I/O here as it's the 1458 * responsibility of the caller to wait on him. 
1459 */ 1460 if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) 1461 zio_execute(pio); 1462} 1463 1464void 1465zio_suspend(spa_t *spa, zio_t *zio) 1466{ 1467 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 1468 fm_panic("Pool '%s' has encountered an uncorrectable I/O " 1469 "failure and the failure mode property for this pool " 1470 "is set to panic.", spa_name(spa)); 1471 1472 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 1473 1474 mutex_enter(&spa->spa_suspend_lock); 1475 1476 if (spa->spa_suspend_zio_root == NULL) 1477 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 1478 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 1479 ZIO_FLAG_GODFATHER); 1480 1481 spa->spa_suspended = B_TRUE; 1482 1483 if (zio != NULL) { 1484 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 1485 ASSERT(zio != spa->spa_suspend_zio_root); 1486 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1487 ASSERT(zio_unique_parent(zio) == NULL); 1488 ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1489 zio_add_child(spa->spa_suspend_zio_root, zio); 1490 } 1491 1492 mutex_exit(&spa->spa_suspend_lock); 1493} 1494 1495int 1496zio_resume(spa_t *spa) 1497{ 1498 zio_t *pio; 1499 1500 /* 1501 * Reexecute all previously suspended i/o. 1502 */ 1503 mutex_enter(&spa->spa_suspend_lock); 1504 spa->spa_suspended = B_FALSE; 1505 cv_broadcast(&spa->spa_suspend_cv); 1506 pio = spa->spa_suspend_zio_root; 1507 spa->spa_suspend_zio_root = NULL; 1508 mutex_exit(&spa->spa_suspend_lock); 1509 1510 if (pio == NULL) 1511 return (0); 1512 1513 zio_reexecute(pio); 1514 return (zio_wait(pio)); 1515} 1516 1517void 1518zio_resume_wait(spa_t *spa) 1519{ 1520 mutex_enter(&spa->spa_suspend_lock); 1521 while (spa_suspended(spa)) 1522 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 1523 mutex_exit(&spa->spa_suspend_lock); 1524} 1525 1526/* 1527 * ========================================================================== 1528 * Gang blocks. 
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
1589 * ========================================================================== 1590 */ 1591 1592static zio_t * 1593zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1594{ 1595 if (gn != NULL) 1596 return (pio); 1597 1598 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 1599 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1600 &pio->io_bookmark)); 1601} 1602 1603zio_t * 1604zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1605{ 1606 zio_t *zio; 1607 1608 if (gn != NULL) { 1609 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1610 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 1611 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1612 /* 1613 * As we rewrite each gang header, the pipeline will compute 1614 * a new gang block header checksum for it; but no one will 1615 * compute a new data checksum, so we do that here. The one 1616 * exception is the gang leader: the pipeline already computed 1617 * its data checksum because that stage precedes gang assembly. 1618 * (Presently, nothing actually uses interior data checksums; 1619 * this is just good hygiene.) 1620 */ 1621 if (gn != pio->io_gang_leader->io_gang_tree) { 1622 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 1623 data, BP_GET_PSIZE(bp)); 1624 } 1625 /* 1626 * If we are here to damage data for testing purposes, 1627 * leave the GBH alone so that we can detect the damage. 1628 */ 1629 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 1630 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 1631 } else { 1632 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1633 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 1634 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1635 } 1636 1637 return (zio); 1638} 1639 1640/* ARGSUSED */ 1641zio_t * 1642zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1643{ 1644 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 1645 BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), 1646 ZIO_GANG_CHILD_FLAGS(pio))); 1647} 1648 1649/* ARGSUSED */ 1650zio_t * 1651zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1652{ 1653 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 1654 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 1655} 1656 1657static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 1658 NULL, 1659 zio_read_gang, 1660 zio_rewrite_gang, 1661 zio_free_gang, 1662 zio_claim_gang, 1663 NULL 1664}; 1665 1666static void zio_gang_tree_assemble_done(zio_t *zio); 1667 1668static zio_gang_node_t * 1669zio_gang_node_alloc(zio_gang_node_t **gnpp) 1670{ 1671 zio_gang_node_t *gn; 1672 1673 ASSERT(*gnpp == NULL); 1674 1675 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 1676 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 1677 *gnpp = gn; 1678 1679 return (gn); 1680} 1681 1682static void 1683zio_gang_node_free(zio_gang_node_t **gnpp) 1684{ 1685 zio_gang_node_t *gn = *gnpp; 1686 1687 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1688 ASSERT(gn->gn_child[g] == NULL); 1689 1690 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 1691 kmem_free(gn, sizeof (*gn)); 1692 *gnpp = NULL; 1693} 1694 1695static void 1696zio_gang_tree_free(zio_gang_node_t **gnpp) 1697{ 1698 zio_gang_node_t *gn = *gnpp; 1699 1700 if (gn == NULL) 1701 return; 1702 1703 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1704 zio_gang_tree_free(&gn->gn_child[g]); 1705 1706 zio_gang_node_free(gnpp); 1707} 1708 1709static void 1710zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 1711{ 1712 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 1713 1714 ASSERT(gio->io_gang_leader == gio); 1715 ASSERT(BP_IS_GANG(bp)); 1716 1717 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 1718 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 1719 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 1720} 1721 1722static void 1723zio_gang_tree_assemble_done(zio_t *zio) 1724{ 1725 zio_t *gio = zio->io_gang_leader; 1726 
zio_gang_node_t *gn = zio->io_private; 1727 blkptr_t *bp = zio->io_bp; 1728 1729 ASSERT(gio == zio_unique_parent(zio)); 1730 ASSERT(zio->io_child_count == 0); 1731 1732 if (zio->io_error) 1733 return; 1734 1735 if (BP_SHOULD_BYTESWAP(bp)) 1736 byteswap_uint64_array(zio->io_data, zio->io_size); 1737 1738 ASSERT(zio->io_data == gn->gn_gbh); 1739 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1740 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1741 1742 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1743 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1744 if (!BP_IS_GANG(gbp)) 1745 continue; 1746 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 1747 } 1748} 1749 1750static void 1751zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1752{ 1753 zio_t *gio = pio->io_gang_leader; 1754 zio_t *zio; 1755 1756 ASSERT(BP_IS_GANG(bp) == !!gn); 1757 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 1758 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 1759 1760 /* 1761 * If you're a gang header, your data is in gn->gn_gbh. 1762 * If you're a gang member, your data is in 'data' and gn == NULL. 
1763 */ 1764 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1765 1766 if (gn != NULL) { 1767 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1768 1769 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1770 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1771 if (BP_IS_HOLE(gbp)) 1772 continue; 1773 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 1774 data = (char *)data + BP_GET_PSIZE(gbp); 1775 } 1776 } 1777 1778 if (gn == gio->io_gang_tree && gio->io_data != NULL) 1779 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 1780 1781 if (zio != pio) 1782 zio_nowait(zio); 1783} 1784 1785static int 1786zio_gang_assemble(zio_t *zio) 1787{ 1788 blkptr_t *bp = zio->io_bp; 1789 1790 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 1791 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1792 1793 zio->io_gang_leader = zio; 1794 1795 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1796 1797 return (ZIO_PIPELINE_CONTINUE); 1798} 1799 1800static int 1801zio_gang_issue(zio_t *zio) 1802{ 1803 blkptr_t *bp = zio->io_bp; 1804 1805 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 1806 return (ZIO_PIPELINE_STOP); 1807 1808 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 1809 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1810 1811 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 1812 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 1813 else 1814 zio_gang_tree_free(&zio->io_gang_tree); 1815 1816 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1817 1818 return (ZIO_PIPELINE_CONTINUE); 1819} 1820 1821static void 1822zio_write_gang_member_ready(zio_t *zio) 1823{ 1824 zio_t *pio = zio_unique_parent(zio); 1825 zio_t *gio = zio->io_gang_leader; 1826 dva_t *cdva = zio->io_bp->blk_dva; 1827 dva_t *pdva = pio->io_bp->blk_dva; 1828 uint64_t asize; 1829 1830 if (BP_IS_HOLE(zio->io_bp)) 1831 return; 1832 1833 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1834 1835 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 1836 ASSERT3U(zio->io_prop.zp_copies, ==, 
gio->io_prop.zp_copies); 1837 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 1838 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 1839 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 1840 1841 mutex_enter(&pio->io_lock); 1842 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 1843 ASSERT(DVA_GET_GANG(&pdva[d])); 1844 asize = DVA_GET_ASIZE(&pdva[d]); 1845 asize += DVA_GET_ASIZE(&cdva[d]); 1846 DVA_SET_ASIZE(&pdva[d], asize); 1847 } 1848 mutex_exit(&pio->io_lock); 1849} 1850 1851static int 1852zio_write_gang_block(zio_t *pio) 1853{ 1854 spa_t *spa = pio->io_spa; 1855 blkptr_t *bp = pio->io_bp; 1856 zio_t *gio = pio->io_gang_leader; 1857 zio_t *zio; 1858 zio_gang_node_t *gn, **gnpp; 1859 zio_gbh_phys_t *gbh; 1860 uint64_t txg = pio->io_txg; 1861 uint64_t resid = pio->io_size; 1862 uint64_t lsize; 1863 int copies = gio->io_prop.zp_copies; 1864 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 1865 zio_prop_t zp; 1866 int error; 1867 1868 error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 1869 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 1870 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 1871 if (error) { 1872 pio->io_error = error; 1873 return (ZIO_PIPELINE_CONTINUE); 1874 } 1875 1876 if (pio == gio) { 1877 gnpp = &gio->io_gang_tree; 1878 } else { 1879 gnpp = pio->io_private; 1880 ASSERT(pio->io_ready == zio_write_gang_member_ready); 1881 } 1882 1883 gn = zio_gang_node_alloc(gnpp); 1884 gbh = gn->gn_gbh; 1885 bzero(gbh, SPA_GANGBLOCKSIZE); 1886 1887 /* 1888 * Create the gang header. 1889 */ 1890 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 1891 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1892 1893 /* 1894 * Create and nowait the gang children. 
1895 */ 1896 for (int g = 0; resid != 0; resid -= lsize, g++) { 1897 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 1898 SPA_MINBLOCKSIZE); 1899 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 1900 1901 zp.zp_checksum = gio->io_prop.zp_checksum; 1902 zp.zp_compress = ZIO_COMPRESS_OFF; 1903 zp.zp_type = DMU_OT_NONE; 1904 zp.zp_level = 0; 1905 zp.zp_copies = gio->io_prop.zp_copies; 1906 zp.zp_dedup = B_FALSE; 1907 zp.zp_dedup_verify = B_FALSE; 1908 zp.zp_nopwrite = B_FALSE; 1909 1910 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 1911 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 1912 zio_write_gang_member_ready, NULL, &gn->gn_child[g], 1913 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1914 &pio->io_bookmark)); 1915 } 1916 1917 /* 1918 * Set pio's pipeline to just wait for zio to finish. 1919 */ 1920 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1921 1922 zio_nowait(zio); 1923 1924 return (ZIO_PIPELINE_CONTINUE); 1925} 1926 1927/* 1928 * The zio_nop_write stage in the pipeline determines if allocating 1929 * a new bp is necessary. By leveraging a cryptographically secure checksum, 1930 * such as SHA256, we can compare the checksums of the new data and the old 1931 * to determine if allocating a new block is required. The nopwrite 1932 * feature can handle writes in either syncing or open context (i.e. zil 1933 * writes) and as a result is mutually exclusive with dedup. 1934 */ 1935static int 1936zio_nop_write(zio_t *zio) 1937{ 1938 blkptr_t *bp = zio->io_bp; 1939 blkptr_t *bp_orig = &zio->io_bp_orig; 1940 zio_prop_t *zp = &zio->io_prop; 1941 1942 ASSERT(BP_GET_LEVEL(bp) == 0); 1943 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1944 ASSERT(zp->zp_nopwrite); 1945 ASSERT(!zp->zp_dedup); 1946 ASSERT(zio->io_bp_override == NULL); 1947 ASSERT(IO_IS_ALLOCATING(zio)); 1948 1949 /* 1950 * Check to see if the original bp and the new bp have matching 1951 * characteristics (i.e. same checksum, compression algorithms, etc). 
1952 * If they don't then just continue with the pipeline which will 1953 * allocate a new bp. 1954 */ 1955 if (BP_IS_HOLE(bp_orig) || 1956 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || 1957 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 1958 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 1959 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 1960 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 1961 return (ZIO_PIPELINE_CONTINUE); 1962 1963 /* 1964 * If the checksums match then reset the pipeline so that we 1965 * avoid allocating a new bp and issuing any I/O. 1966 */ 1967 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 1968 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); 1969 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 1970 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 1971 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 1972 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 1973 sizeof (uint64_t)) == 0); 1974 1975 *bp = *bp_orig; 1976 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1977 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1978 } 1979 1980 return (ZIO_PIPELINE_CONTINUE); 1981} 1982 1983/* 1984 * ========================================================================== 1985 * Dedup 1986 * ========================================================================== 1987 */ 1988static void 1989zio_ddt_child_read_done(zio_t *zio) 1990{ 1991 blkptr_t *bp = zio->io_bp; 1992 ddt_entry_t *dde = zio->io_private; 1993 ddt_phys_t *ddp; 1994 zio_t *pio = zio_unique_parent(zio); 1995 1996 mutex_enter(&pio->io_lock); 1997 ddp = ddt_phys_select(dde, bp); 1998 if (zio->io_error == 0) 1999 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 2000 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 2001 dde->dde_repair_data = zio->io_data; 2002 else 2003 zio_buf_free(zio->io_data, zio->io_size); 2004 mutex_exit(&pio->io_lock); 2005} 2006 2007static int 2008zio_ddt_read_start(zio_t *zio) 2009{ 2010 blkptr_t *bp = zio->io_bp; 2011 2012 
	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	/*
	 * If a previous pass through the DDT stages recorded a child error,
	 * this is a retry: issue reads of every *other* physical copy of
	 * the block recorded in the dedup table, hoping one of them is good
	 * (collected by zio_ddt_child_read_done into dde_repair_data).
	 */
	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
		ddt_phys_t *ddp = dde->dde_phys;
		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
		blkptr_t blk;

		ASSERT(zio->io_vsd == NULL);
		zio->io_vsd = dde;

		/* No ddt_phys matches bp: nothing alternate to read. */
		if (ddp_self == NULL)
			return (ZIO_PIPELINE_CONTINUE);

		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
				continue;
			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
			    &blk);
			zio_nowait(zio_read(zio, zio->io_spa, &blk,
			    zio_buf_alloc(zio->io_size), zio->io_size,
			    zio_ddt_child_read_done, dde, zio->io_priority,
			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
			    &zio->io_bookmark));
		}
		return (ZIO_PIPELINE_CONTINUE);
	}

	/* First attempt: plain read of the copy named by bp itself. */
	zio_nowait(zio_read(zio, zio->io_spa, bp,
	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_ddt_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = zio->io_vsd;
		if (ddt == NULL) {
			/* DDT not available during certain pool loads. */
			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
			return (ZIO_PIPELINE_CONTINUE);
		}
		if (dde == NULL) {
			/*
			 * No repair entry yet: rewind one stage so
			 * zio_ddt_read_start() runs again in repair mode.
			 */
			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
			return (ZIO_PIPELINE_STOP);
		}
		if (dde->dde_repair_data != NULL) {
			/* A good alternate copy was found; use it. */
			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
			zio->io_child_error[ZIO_CHILD_DDT] = 0;
		}
		ddt_repair_done(ddt, dde);
		zio->io_vsd = NULL;
	}

	ASSERT(zio->io_vsd == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Decide whether this write's data actually differs from the on-disk data
 * sharing its checksum (a true hash collision, or an in-flight lead zio
 * with different contents).  Returns B_TRUE on a collision.  May drop and
 * re-take the ddt lock around the arc_read().
 */
static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
	spa_t *spa = zio->io_spa;

	/*
	 * Note: we compare the original data, not the transformed data,
	 * because when zio->io_bp is an override bp, we will not have
	 * pushed the I/O transforms.  That's an important optimization
	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
	 */
	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		zio_t *lio = dde->dde_lead_zio[p];

		if (lio != NULL) {
			return (lio->io_orig_size != zio->io_orig_size ||
			    bcmp(zio->io_orig_data, lio->io_orig_data,
			    zio->io_orig_size) != 0);
		}
	}

	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		ddt_phys_t *ddp = &dde->dde_phys[p];

		if (ddp->ddp_phys_birth != 0) {
			arc_buf_t *abuf = NULL;
			uint32_t aflags = ARC_WAIT;
			blkptr_t blk = *zio->io_bp;
			int error;

			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);

			ddt_exit(ddt);

			error = arc_read(NULL, spa, &blk,
			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zio->io_bookmark);

			if (error == 0) {
				if (arc_buf_size(abuf) != zio->io_orig_size ||
				    bcmp(abuf->b_data, zio->io_orig_data,
				    zio->io_orig_size) != 0)
					error = SET_ERROR(EEXIST);
				VERIFY(arc_buf_remove_ref(abuf, &abuf));
			}

			ddt_enter(ddt);
			return (error != 0);
		}
	}

	return (B_FALSE);
}

/*
 * Ready callback for a dedup child write: record the newly-allocated DVAs
 * in the ddt entry and propagate them into every waiting parent's bp.
 */
static void
zio_ddt_child_write_ready(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;
	ddt_t *ddt =
	    ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	zio_t *pio;

	if (zio->io_error)
		return;

	ddt_enter(ddt);

	ASSERT(dde->dde_lead_zio[p] == zio);

	ddt_phys_fill(ddp, zio->io_bp);

	while ((pio = zio_walk_parents(zio)) != NULL)
		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);

	ddt_exit(ddt);
}

/*
 * Done callback for a dedup child write: drop lead-zio status and, on
 * success, take one ddt reference per parent; on failure, clear the
 * phys entry so the failed allocation is not referenced.
 */
static void
zio_ddt_child_write_done(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];

	ddt_enter(ddt);

	ASSERT(ddp->ddp_refcnt == 0);
	ASSERT(dde->dde_lead_zio[p] == zio);
	dde->dde_lead_zio[p] = NULL;

	if (zio->io_error == 0) {
		while (zio_walk_parents(zio) != NULL)
			ddt_phys_addref(ddp);
	} else {
		ddt_phys_clear(ddp);
	}

	ddt_exit(ddt);
}

/*
 * Done callback for the extra "ditto" copy write issued by zio_ddt_write()
 * when a heavily-referenced entry needs more redundancy: replace the old
 * DDT_PHYS_DITTO phys (freeing it if it had a previous birth) with the
 * newly written block.
 */
static void
zio_ddt_ditto_write_done(zio_t *zio)
{
	int p = DDT_PHYS_DITTO;
	zio_prop_t *zp = &zio->io_prop;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(zio->io_spa, bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	ddt_key_t *ddk = &dde->dde_key;

	ddt_enter(ddt);

	ASSERT(ddp->ddp_refcnt == 0);
	ASSERT(dde->dde_lead_zio[p] == zio);
	dde->dde_lead_zio[p] = NULL;

	if (zio->io_error == 0) {
		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
		if (ddp->ddp_phys_birth != 0)
			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
		ddt_phys_fill(ddp, bp);
	}

	ddt_exit(ddt);
}

/*
 * Dedup write stage: look the block up in the DDT and either reference an
 * existing on-disk copy, join an in-flight lead write, or become the lead
 * write ourselves.  On a verify-detected collision the write is demoted to
 * an ordinary (or stronger-checksum) write and the pipeline restarted.
 */
static int
zio_ddt_write(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	uint64_t txg = zio->io_txg;
	zio_prop_t *zp = &zio->io_prop;
	int p = zp->zp_copies;
	int ditto_copies;
	zio_t *cio = NULL;
	zio_t *dio = NULL;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);

	ddt_enter(ddt);
	dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = &dde->dde_phys[p];

	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
		/*
		 * If we're using a weak checksum, upgrade to a strong checksum
		 * and try again.  If we're already using a strong checksum,
		 * we can't resolve it, so just convert to an ordinary write.
		 * (And automatically e-mail a paper to Nature?)
		 */
		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
			zp->zp_checksum = spa_dedup_checksum(spa);
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			BP_ZERO(bp);
		} else {
			zp->zp_dedup = B_FALSE;
		}
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
		ddt_exit(ddt);
		return (ZIO_PIPELINE_CONTINUE);
	}

	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
	ASSERT(ditto_copies < SPA_DVAS_PER_BP);

	if (ditto_copies > ddt_ditto_copies_present(dde) &&
	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
		zio_prop_t czp = *zp;

		czp.zp_copies = ditto_copies;

		/*
		 * If we arrived here with an override bp, we won't have run
		 * the transform stack, so we won't have the data we need to
		 * generate a child i/o.  So, toss the override bp and restart.
		 * This is safe, because using the override bp is just an
		 * optimization; and it's rare, so the cost doesn't matter.
		 */
		if (zio->io_bp_override) {
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			zio->io_pipeline = ZIO_WRITE_PIPELINE;
			zio->io_bp_override = NULL;
			BP_ZERO(bp);
			ddt_exit(ddt);
			return (ZIO_PIPELINE_CONTINUE);
		}

		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, &czp, NULL,
		    zio_ddt_ditto_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
	}

	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
		/* Existing copy on disk and/or a lead write in flight. */
		if (ddp->ddp_phys_birth != 0)
			ddt_bp_fill(ddp, bp, txg);
		if (dde->dde_lead_zio[p] != NULL)
			zio_add_child(zio, dde->dde_lead_zio[p]);
		else
			ddt_phys_addref(ddp);
	} else if (zio->io_bp_override) {
		/* Override bp already names this data; just reference it. */
		ASSERT(bp->blk_birth == txg);
		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
		ddt_phys_fill(ddp, bp);
		ddt_phys_addref(ddp);
	} else {
		/* First writer of this data: become the lead zio. */
		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, zp, zio_ddt_child_write_ready,
		    zio_ddt_child_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[p] = cio;
	}

	ddt_exit(ddt);

	if (cio)
		zio_nowait(cio);
	if (dio)
		zio_nowait(dio);

	return (ZIO_PIPELINE_CONTINUE);
}

ddt_entry_t *freedde;			/* for debugging */

/*
 * Free a dedup block: drop one reference in the DDT rather than freeing
 * the underlying DVAs (the DDT sync code frees them when refcnt drops).
 */
static int
zio_ddt_free(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	ddt_enter(ddt);
	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = ddt_phys_select(dde, bp);
ddt_phys_decref(ddp); 2347 ddt_exit(ddt); 2348 2349 return (ZIO_PIPELINE_CONTINUE); 2350} 2351 2352/* 2353 * ========================================================================== 2354 * Allocate and free blocks 2355 * ========================================================================== 2356 */ 2357static int 2358zio_dva_allocate(zio_t *zio) 2359{ 2360 spa_t *spa = zio->io_spa; 2361 metaslab_class_t *mc = spa_normal_class(spa); 2362 blkptr_t *bp = zio->io_bp; 2363 int error; 2364 int flags = 0; 2365 2366 if (zio->io_gang_leader == NULL) { 2367 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2368 zio->io_gang_leader = zio; 2369 } 2370 2371 ASSERT(BP_IS_HOLE(bp)); 2372 ASSERT0(BP_GET_NDVAS(bp)); 2373 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2374 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2375 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2376 2377 /* 2378 * The dump device does not support gang blocks so allocation on 2379 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2380 * the "fast" gang feature. 2381 */ 2382 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2383 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
2384 METASLAB_GANG_CHILD : 0; 2385 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2386 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2387 2388 if (error) { 2389 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2390 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2391 error); 2392 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2393 return (zio_write_gang_block(zio)); 2394 zio->io_error = error; 2395 } 2396 2397 return (ZIO_PIPELINE_CONTINUE); 2398} 2399 2400static int 2401zio_dva_free(zio_t *zio) 2402{ 2403 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2404 2405 return (ZIO_PIPELINE_CONTINUE); 2406} 2407 2408static int 2409zio_dva_claim(zio_t *zio) 2410{ 2411 int error; 2412 2413 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2414 if (error) 2415 zio->io_error = error; 2416 2417 return (ZIO_PIPELINE_CONTINUE); 2418} 2419 2420/* 2421 * Undo an allocation. This is used by zio_done() when an I/O fails 2422 * and we want to give back the block we just allocated. 2423 * This handles both normal blocks and gang blocks. 2424 */ 2425static void 2426zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2427{ 2428 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2429 ASSERT(zio->io_bp_override == NULL); 2430 2431 if (!BP_IS_HOLE(bp)) 2432 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2433 2434 if (gn != NULL) { 2435 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2436 zio_dva_unallocate(zio, gn->gn_child[g], 2437 &gn->gn_gbh->zg_blkptr[g]); 2438 } 2439 } 2440} 2441 2442/* 2443 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2444 */ 2445int 2446zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2447 uint64_t size, boolean_t use_slog) 2448{ 2449 int error = 1; 2450 2451 ASSERT(txg > spa_syncing_txg(spa)); 2452 2453 /* 2454 * ZIL blocks are always contiguous (i.e. 
	 * not gang blocks) so we
	 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
	 * when allocating them.
	 */
	if (use_slog) {
		error = metaslab_alloc(spa, spa_log_class(spa), size,
		    new_bp, 1, txg, old_bp,
		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
	}

	/* Fall back to the normal class if the slog allocation failed. */
	if (error) {
		error = metaslab_alloc(spa, spa_normal_class(spa), size,
		    new_bp, 1, txg, old_bp,
		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
	}

	if (error == 0) {
		BP_SET_LSIZE(new_bp, size);
		BP_SET_PSIZE(new_bp, size);
		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
		BP_SET_CHECKSUM(new_bp,
		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
		    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(new_bp, 0);
		BP_SET_DEDUP(new_bp, 0);
		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
	}

	return (error);
}

/*
 * Free an intent log block.
 */
void
zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
{
	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
	ASSERT(!BP_IS_GANG(bp));

	zio_free(spa, txg, bp);
}

/*
 * ==========================================================================
 * Read, write and delete to physical devices
 * ==========================================================================
 */
static int
zio_vdev_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	uint64_t align;
	spa_t *spa = zio->io_spa;

	ASSERT(zio->io_error == 0);
	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);

	if (vd == NULL) {
		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);

		/*
		 * The mirror_ops handle multiple DVAs in a single BP.
		 */
		return (vdev_mirror_ops.vdev_op_io_start(zio));
	}

	/* Leaf-level FREEs are recorded in the TRIM map, not issued here. */
	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) {
		trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * We keep track of time-sensitive I/Os so that the scan thread
	 * can quickly react to certain workloads.  In particular, we care
	 * about non-scrubbing, top-level reads and writes with the following
	 * characteristics:
	 * - synchronous writes of user data to non-slog devices
	 * - any reads of user data
	 * When these conditions are met, adjust the timestamp of spa_last_io
	 * which allows the scan thread to adjust its workload accordingly.
	 */
	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
	    vd == vd->vdev_top && !vd->vdev_islog &&
	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
	    zio->io_txg != spa_syncing_txg(spa)) {
		uint64_t old = spa->spa_last_io;
		uint64_t new = ddi_get_lbolt64();
		if (old != new)
			(void) atomic_cas_64(&spa->spa_last_io, old, new);
	}

	align = 1ULL << vd->vdev_top->vdev_ashift;

	/* Pad sub-ashift I/Os out to the device's minimum block size. */
	if (P2PHASE(zio->io_size, align) != 0) {
		uint64_t asize = P2ROUNDUP(zio->io_size, align);
		char *abuf = NULL;
		if (zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_WRITE)
			abuf = zio_buf_alloc(asize);
		ASSERT(vd == vd->vdev_top);
		if (zio->io_type == ZIO_TYPE_WRITE) {
			bcopy(zio->io_data, abuf, zio->io_size);
			bzero(abuf + zio->io_size, asize - zio->io_size);
		}
		zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
		    zio_subblock);
	}

	ASSERT(P2PHASE(zio->io_offset, align) == 0);
	ASSERT(P2PHASE(zio->io_size, align) == 0);
	VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));

	/*
	 * If this is a repair I/O, and there's no self-healing involved --
	 * that is, we're just resilvering what we expect to resilver --
	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
	 * This prevents spurious resilvering with nested replication.
	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
	 * A is out of date, we'll read from C+D, then use the data to
	 * resilver A+B -- but we don't actually want to resilver B, just A.
	 * The top-level mirror has no way to know this, so instead we just
	 * discard unnecessary repairs as we work our way down the vdev tree.
	 * The same logic applies to any form of nested replication:
	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
	 */
	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
	    zio->io_txg != 0 &&	/* not a delegated i/o */
	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
		zio_vdev_io_bypass(zio);
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (vd->vdev_ops->vdev_op_leaf &&
	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {

		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
			return (ZIO_PIPELINE_CONTINUE);

		/* The queue may aggregate/defer us; NULL means "not yet". */
		if ((zio = vdev_queue_io(zio)) == NULL)
			return (ZIO_PIPELINE_STOP);

		if (!vdev_accessible(vd, zio)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return (ZIO_PIPELINE_STOP);
		}
	}

	/*
	 * Note that we ignore repair writes for TRIM because they can conflict
	 * with normal writes.  This isn't an issue because, by definition, we
	 * only repair blocks that aren't freed.
	 */
	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE &&
	    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
		if (!trim_map_write_start(zio))
			return (ZIO_PIPELINE_STOP);
	}

	return (vd->vdev_ops->vdev_op_io_start(zio));
}

static int
zio_vdev_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
	boolean_t unexpected_error = B_FALSE;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {

		if (zio->io_type == ZIO_TYPE_WRITE &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
			trim_map_write_done(zio);

		vdev_queue_io_done(zio);

		if (zio->io_type == ZIO_TYPE_WRITE)
			vdev_cache_write(zio);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_device_injection(vd,
			    zio, EIO);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_label_injection(zio, EIO);

		if (zio->io_error) {
			if (!vdev_accessible(vd, zio)) {
				zio->io_error = SET_ERROR(ENXIO);
			} else {
				unexpected_error = B_TRUE;
			}
		}
	}

	ops->vdev_op_io_done(zio);

	/* An error on an accessible leaf: probe the device's health. */
	if (unexpected_error)
		VERIFY(vdev_probe(vd, zio) == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * For non-raidz ZIOs, we can just copy aside the bad data read from the
 * disk, and use that to finish the checksum ereport later.
2673 */ 2674static void 2675zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2676 const void *good_buf) 2677{ 2678 /* no processing needed */ 2679 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2680} 2681 2682/*ARGSUSED*/ 2683void 2684zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2685{ 2686 void *buf = zio_buf_alloc(zio->io_size); 2687 2688 bcopy(zio->io_data, buf, zio->io_size); 2689 2690 zcr->zcr_cbinfo = zio->io_size; 2691 zcr->zcr_cbdata = buf; 2692 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2693 zcr->zcr_free = zio_buf_free; 2694} 2695 2696static int 2697zio_vdev_io_assess(zio_t *zio) 2698{ 2699 vdev_t *vd = zio->io_vd; 2700 2701 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2702 return (ZIO_PIPELINE_STOP); 2703 2704 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2705 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2706 2707 if (zio->io_vsd != NULL) { 2708 zio->io_vsd_ops->vsd_free(zio); 2709 zio->io_vsd = NULL; 2710 } 2711 2712 if (zio_injection_enabled && zio->io_error == 0) 2713 zio->io_error = zio_handle_fault_injection(zio, EIO); 2714 2715 if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM) 2716 switch (zio->io_error) { 2717 case 0: 2718 ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 2719 ZIO_TRIM_STAT_BUMP(success); 2720 break; 2721 case EOPNOTSUPP: 2722 ZIO_TRIM_STAT_BUMP(unsupported); 2723 break; 2724 default: 2725 ZIO_TRIM_STAT_BUMP(failed); 2726 break; 2727 } 2728 2729 /* 2730 * If the I/O failed, determine whether we should attempt to retry it. 2731 * 2732 * On retry, we cut in line in the issue queue, since we don't want 2733 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 
2734 */ 2735 if (zio->io_error && vd == NULL && 2736 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2737 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2738 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2739 zio->io_error = 0; 2740 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2741 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2742 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2743 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2744 zio_requeue_io_start_cut_in_line); 2745 return (ZIO_PIPELINE_STOP); 2746 } 2747 2748 /* 2749 * If we got an error on a leaf device, convert it to ENXIO 2750 * if the device is not accessible at all. 2751 */ 2752 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2753 !vdev_accessible(vd, zio)) 2754 zio->io_error = SET_ERROR(ENXIO); 2755 2756 /* 2757 * If we can't write to an interior vdev (mirror or RAID-Z), 2758 * set vdev_cant_write so that we stop trying to allocate from it. 2759 */ 2760 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2761 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 2762 vd->vdev_cant_write = B_TRUE; 2763 } 2764 2765 if (zio->io_error) 2766 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2767 2768 return (ZIO_PIPELINE_CONTINUE); 2769} 2770 2771void 2772zio_vdev_io_reissue(zio_t *zio) 2773{ 2774 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2775 ASSERT(zio->io_error == 0); 2776 2777 zio->io_stage >>= 1; 2778} 2779 2780void 2781zio_vdev_io_redone(zio_t *zio) 2782{ 2783 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2784 2785 zio->io_stage >>= 1; 2786} 2787 2788void 2789zio_vdev_io_bypass(zio_t *zio) 2790{ 2791 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2792 ASSERT(zio->io_error == 0); 2793 2794 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2795 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2796} 2797 2798/* 2799 * ========================================================================== 2800 * Generate and verify checksums 2801 * 
 ==========================================================================
 */
static int
zio_checksum_generate(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum;

	if (bp == NULL) {
		/*
		 * This is zio_write_phys().
		 * We're either generating a label checksum, or none at all.
		 */
		checksum = zio->io_prop.zp_checksum;

		if (checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
	} else {
		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
			ASSERT(!IO_IS_ALLOCATING(zio));
			checksum = ZIO_CHECKSUM_GANG_HEADER;
		} else {
			checksum = BP_GET_CHECKSUM(bp);
		}
	}

	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t info;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(zio->io_vd != NULL);

	if (bp == NULL) {
		/*
		 * This is zio_read_phys().
		 * We're either verifying a label checksum, or nothing at all.
		 */
		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
	}

	if ((error = zio_checksum_error(zio, &info)) != 0) {
		zio->io_error = error;
		/* Speculative I/Os (e.g. prefetch) don't raise ereports. */
		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			zfs_ereport_start_checksum(zio->io_spa,
			    zio->io_vd, zio, zio->io_offset,
			    zio->io_size, NULL, &info);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}

/*
 * ==========================================================================
 * Error rank.
 * Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
 * indicate errors that are specific to one I/O, and most likely permanent.
 * Any other error is presumed to be worse because we weren't expecting it.
 * ==========================================================================
 */
int
zio_worst_error(int e1, int e2)
{
	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
	int r1, r2;

	/* An error not in the table ranks past the end, i.e. worst. */
	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
		if (e1 == zio_error_rank[r1])
			break;

	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
		if (e2 == zio_error_rank[r2])
			break;

	return (r1 > r2 ? e1 : e2);
}

/*
 * ==========================================================================
 * I/O completion
 * ==========================================================================
 */
static int
zio_ready(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	zio_t *pio, *pio_next;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (zio->io_ready) {
		ASSERT(IO_IS_ALLOCATING(zio));
		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);

		zio->io_ready(zio);
	}

	if (bp != NULL && bp != &zio->io_bp_copy)
		zio->io_bp_copy = *bp;

	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_READY] = 1;
	pio = zio_walk_parents(zio);
	mutex_exit(&zio->io_lock);

	/*
	 * As we notify zio's parents, new parents could be added.
	 * New parents go to the head of zio's io_parent_list, however,
	 * so we will (correctly) not notify them.  The remainder of zio's
	 * io_parent_list, from 'pio_next' onward, cannot change because
	 * all parents must wait for us to be done before they can be done.
	 */
	for (; pio != NULL; pio = pio_next) {
		pio_next = zio_walk_parents(zio);
		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
	}

	if (zio->io_flags & ZIO_FLAG_NODATA) {
		if (BP_IS_GANG(bp)) {
			zio->io_flags &= ~ZIO_FLAG_NODATA;
		} else {
			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		}
	}

	if (zio_injection_enabled &&
	    zio->io_spa->spa_syncing_txg == zio->io_txg)
		zio_handle_ignored_writes(zio);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_t *lio = zio->io_logical;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;
	uint64_t psize = zio->io_size;
	zio_t *pio, *pio_next;

	/*
	 * If our children haven't all completed,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	/*
	 * From here on, no child of any type may still be outstanding.
	 */
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			ASSERT(zio->io_children[c][w] == 0);

	/*
	 * Sanity-check the block pointer this zio operated on, if any.
	 */
	if (bp != NULL) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
		    (bp == zio_unique_parent(zio)->io_bp));
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    zio->io_bp_override == NULL &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
			ASSERT(BP_COUNT_GANG(bp) == 0 ||
			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
		}
		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
			VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
	}

	/*
	 * If there were child vdev/gang/ddt errors, they apply to us now.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);

	/*
	 * If the I/O on the transformed data was successful, generate any
	 * checksum reports now while we still have the transformed data.
	 */
	if (zio->io_error == 0) {
		while (zio->io_cksum_report != NULL) {
			zio_cksum_report_t *zcr = zio->io_cksum_report;
			uint64_t align = zcr->zcr_align;
			uint64_t asize = P2ROUNDUP(psize, align);
			char *abuf = zio->io_data;

			/*
			 * Hand the callback a copy padded out to the
			 * report's alignment, zero-filling the tail.
			 */
			if (asize != psize) {
				abuf = zio_buf_alloc(asize);
				bcopy(zio->io_data, abuf, psize);
				bzero(abuf + psize, asize - psize);
			}

			zio->io_cksum_report = zcr->zcr_next;
			zcr->zcr_next = NULL;
			zcr->zcr_finish(zcr, abuf);
			zfs_ereport_free_checksum(zcr);

			if (asize != psize)
				zio_buf_free(abuf, asize);
		}
	}

	zio_pop_transforms(zio);	/* note: may set zio->io_error */

	/* Fold this I/O's outcome into the vdev statistics. */
	vdev_stat_update(zio, psize);

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

		if ((zio->io_error == EIO || !(zio->io_flags &
		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
		    zio == lio) {
			/*
			 * For logical I/O requests, tell the SPA to log the
			 * error and generate a logical data ereport.
			 */
			spa_log_error(spa, zio);
			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
			    0, 0);
		}
	}

	if (zio->io_error && zio == lio) {
		/*
		 * Determine whether zio should be reexecuted.  This will
		 * propagate all the way to the root via zio_notify_parent().
		 */
		ASSERT(vd == NULL && bp != NULL);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		if (IO_IS_ALLOCATING(zio) &&
		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
			if (zio->io_error != ENOSPC)
				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
			else
				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
		}

		if ((zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_FREE) &&
		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
		    zio->io_error == ENXIO &&
		    spa_load_state(spa) == SPA_LOAD_NONE &&
		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		/*
		 * An I/O that must not fail and has no retry plan yet
		 * defaults to suspending until conditions improve.
		 */
		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		/*
		 * Here is a possibly good place to attempt to do
		 * either combinatorial reconstruction or error correction
		 * based on checksums.  It also might be a good place
		 * to send out preliminary ereports before we suspend
		 * processing.
		 */
	}

	/*
	 * If there were logical child errors, they apply to us now.
	 * We defer this until now to avoid conflating logical child
	 * errors with errors that happened to the zio itself when
	 * updating vdev stats and reporting FMA events above.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

	if ((zio->io_error || zio->io_reexecute) &&
	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
		zio_dva_unallocate(zio, zio->io_gang_tree, bp);

	zio_gang_tree_free(&zio->io_gang_tree);

	/*
	 * Godfather I/Os should never suspend.
	 */
	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
		zio->io_reexecute = 0;

	if (zio->io_reexecute) {
		/*
		 * This is a logical I/O that wants to reexecute.
		 *
		 * Reexecute is top-down.  When an i/o fails, if it's not
		 * the root, it simply notifies its parent and sticks around.
		 * The parent, seeing that it still has children in zio_done(),
		 * does the same.  This percolates all the way up to the root.
		 * The root i/o will reexecute or suspend the entire tree.
		 *
		 * This approach ensures that zio_reexecute() honors
		 * all the original i/o dependency relationships, e.g.
		 * parents not executing until children are ready.
		 */
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		zio->io_gang_leader = NULL;

		mutex_enter(&zio->io_lock);
		zio->io_state[ZIO_WAIT_DONE] = 1;
		mutex_exit(&zio->io_lock);

		/*
		 * "The Godfather" I/O monitors its children but is
		 * not a true parent to them. It will track them through
		 * the pipeline but severs its ties whenever they get into
		 * trouble (e.g. suspended). This allows "The Godfather"
		 * I/O to return status without blocking.
		 */
		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
			zio_link_t *zl = zio->io_walk_link;
			pio_next = zio_walk_parents(zio);

			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
				zio_remove_child(pio, zio, zl);
				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
			}
		}

		if ((pio = zio_unique_parent(zio)) != NULL) {
			/*
			 * We're not a root i/o, so there's nothing to do
			 * but notify our parent.  Don't propagate errors
			 * upward since we haven't permanently failed yet.
			 */
			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
			/*
			 * We'd fail again if we reexecuted now, so suspend
			 * until conditions improve (e.g. device comes online).
			 */
			zio_suspend(spa, zio);
		} else {
			/*
			 * Reexecution is potentially a huge amount of work.
			 * Hand it off to the otherwise-unused claim taskq.
			 */
#if defined(illumos) || !defined(_KERNEL)
			ASSERT(zio->io_tqent.tqent_next == NULL);
#else
			ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
#endif
			(void) taskq_dispatch_ent(
			    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
			    (task_func_t *)zio_reexecute, zio, 0,
			    &zio->io_tqent);
		}
		return (ZIO_PIPELINE_STOP);
	}

	ASSERT(zio->io_child_count == 0);
	ASSERT(zio->io_reexecute == 0);
	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));

	/*
	 * Report any checksum errors, since the I/O is complete.
	 */
	while (zio->io_cksum_report != NULL) {
		zio_cksum_report_t *zcr = zio->io_cksum_report;
		zio->io_cksum_report = zcr->zcr_next;
		zcr->zcr_next = NULL;
		zcr->zcr_finish(zcr, NULL);
		zfs_ereport_free_checksum(zcr);
	}

	/*
	 * It is the responsibility of the done callback to ensure that this
	 * particular zio is no longer discoverable for adoption, and as
	 * such, cannot acquire any new parents.
	 */
	if (zio->io_done)
		zio->io_done(zio);

	/*
	 * Publish the DONE state under io_lock before notifying parents.
	 */
	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_DONE] = 1;
	mutex_exit(&zio->io_lock);

	/*
	 * Sever every remaining parent link, telling each parent that
	 * this child is done.
	 */
	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
		zio_link_t *zl = zio->io_walk_link;
		pio_next = zio_walk_parents(zio);
		zio_remove_child(pio, zio, zl);
		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
	}

	if (zio->io_waiter != NULL) {
		/*
		 * Somebody is blocked on this zio; wake them and let
		 * them reap it.
		 */
		mutex_enter(&zio->io_lock);
		zio->io_executor = NULL;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		/* Nobody is waiting, so the zio is ours to free. */
		zio_destroy(zio);
	}

	return (ZIO_PIPELINE_STOP);
}

/*
 * ==========================================================================
 * I/O pipeline definition
 * ==========================================================================
 */
/*
 * NOTE(review): entry order appears to mirror the zio_stage enumeration
 * (the leading NULL slot corresponding to the open stage) -- confirm
 * against zio_impl.h before reordering anything here.
 */
static zio_pipe_stage_t *zio_pipeline[] = {
	NULL,
	zio_read_bp_init,
	zio_free_bp_init,
	zio_issue_async,
	zio_write_bp_init,
	zio_checksum_generate,
	zio_nop_write,
	zio_ddt_read_start,
	zio_ddt_read_done,
	zio_ddt_write,
	zio_ddt_free,
	zio_gang_assemble,
	zio_gang_issue,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_checksum_verify,
	zio_done
};

/*
 * Decide whether bookmark zb1 (at any level) sorts at-or-before the
 * level-0 bookmark zb2 in traversal order.  Both bookmarks must refer
 * to the same objset (asserted below).
 *
 * dnp is the dnode for zb1->zb_object.
 */
boolean_t
zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
    const zbookmark_t *zb2)
{
	uint64_t zb1nextL0, zb2thisobj;

	ASSERT(zb1->zb_objset == zb2->zb_objset);
	ASSERT(zb2->zb_level == 0);

	/*
	 * A bookmark in the deadlist is considered to be after
	 * everything else.
	 */
	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
		return (B_TRUE);

	/* The objset_phys_t isn't before anything. */
	if (dnp == NULL)
		return (B_FALSE);

	/*
	 * First level-0 blkid beyond the subtree rooted at zb1's block:
	 * each level adds (dn_indblkshift - SPA_BLKPTRSHIFT) bits of fanout.
	 */
	zb1nextL0 = (zb1->zb_blkid + 1) <<
	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));

	/*
	 * A zb_object of 0 denotes a meta-dnode block; map its blkid to
	 * the first object number that block covers.
	 */
	zb2thisobj = zb2->zb_object ? zb2->zb_object :
	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);

	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
		/* Convert zb1's next L0 blkid into an object number. */
		uint64_t nextobj = zb1nextL0 *
		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
		return (nextobj <= zb2thisobj);
	}

	if (zb1->zb_object < zb2thisobj)
		return (B_TRUE);
	if (zb1->zb_object > zb2thisobj)
		return (B_FALSE);
	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
		return (B_FALSE);
	return (zb1nextL0 <= zb2->zb_blkid);
}