/* zio.c revision 285001 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
#if defined(__amd64__)
static int zio_use_uma = 1;
#else
static int zio_use_uma = 0;
#endif
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
    "Use uma(9) for ZIO allocations");
static int zio_exclude_metadata = 0;
TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN,
    &zio_exclude_metadata, 0,
    "Exclude metadata buffers from dumps as well");

zio_trim_stats_t zio_trim_stats = {
	{ "bytes",		KSTAT_DATA_UINT64,
	    "Number of bytes successfully TRIMmed" },
	{ "success",		KSTAT_DATA_UINT64,
	    "Number of successful TRIM requests" },
	{ "unsupported",	KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed because TRIM is not supported" },
	{ "failed",		KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed for reasons other than not supported" },
};

static kstat_t *zio_trim_ksp;

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif
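
/*
 * Illustrative sketch (not part of the original file): how the sync-pass
 * tunables above combine during spa_sync(), assuming the default values.
 * zio_free() and zio_write_bp_init() below perform exactly these
 * comparisons.
 */
#if 0
	int pass = spa_sync_pass(spa);
	boolean_t defer_frees = (pass >= zfs_sync_pass_deferred_free);	/* >= 2 */
	boolean_t may_compress = (pass < zfs_sync_pass_dont_compress);	/* <  5 */
	boolean_t force_rewrite = (pass >= zfs_sync_pass_rewrite);	/* >= 2 */
#endif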

void
zio_init(void)
{
	size_t c;
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (!zio_use_uma)
		goto out;

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
	 * for each quarter-power of 2.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		while (!ISP2(p2))
			p2 &= p2 - 1;

#ifdef illumos
#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
#endif /* illumos */
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = MIN(p2 >> 2, PAGESIZE);
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL,
			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}
out:

	zio_inject_init();

	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof(zio_trim_stats) / sizeof(kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zio_trim_ksp != NULL) {
		zio_trim_ksp->ks_data = &zio_trim_stats;
		kstat_install(zio_trim_ksp);
	}
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();

	if (zio_trim_ksp != NULL) {
		kstat_delete(zio_trim_ksp);
		zio_trim_ksp = NULL;
	}
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP|flags));
}
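
/*
 * Usage sketch (illustrative, not from the original source): the cache index
 * is derived from the size, so a buffer must be freed with the same size it
 * was allocated with.  With SPA_MINBLOCKSHIFT == 9, a 4K request maps to
 * cache index (4096 - 1) >> 9 == 7, i.e. the "zio_buf_4096" cache.
 */
#if 0
	void *buf = zio_buf_alloc(4096);
	/* ... use buf ... */
	zio_buf_free(buf, 4096);
#endif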

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump, thus reducing the
 * amount of kernel heap dumped to disk when the kernel panics.
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_data_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}
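
/*
 * Sketch (illustrative): zio_read_bp_init() below layers decompression this
 * way -- the read lands in a scratch buffer of the physical size, and
 * zio_pop_transforms() later runs zio_decompress() into the original buffer
 * and frees the scratch buffer (because zt_bufsize is nonzero).
 */
#if 0
	void *cbuf = zio_buf_alloc(psize);
	zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
#endif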

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 *        continue calling these functions until they return NULL.
 *        Otherwise, the next caller will pick up the list walk in
 *        some indeterminate state.  (Otherwise every caller would
 *        have to pass in a cookie to keep the state represented by
 *        io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);

	(*countp)--;

	if (*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}
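
/*
 * Usage sketch (illustrative): per the NOTE above, a walk must always run
 * to completion so io_walk_link is left NULL for the next caller.  This is
 * roughly the pattern used to notify all parents of a finished child.
 */
#if 0
	zio_t *pio;
	while ((pio = zio_walk_parents(cio)) != NULL)
		zio_notify_parent(pio, cio, ZIO_WAIT_DONE);
#endif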

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}
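
/*
 * Usage sketch (illustrative): a root zio gathers a set of child i/os so
 * the caller can wait for all of them at once.  Identifiers here are
 * placeholders.
 */
#if 0
	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	zio_nowait(zio_read(rio, spa, bp, buf, size, NULL, NULL,
	    ZIO_PRIORITY_SYNC_READ, 0, zb));
	error = zio_wait(rio);		/* also waits for the child read */
#endif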

void
zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
{
	if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
		zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
		    bp, (longlong_t)BP_GET_TYPE(bp));
	}
	if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
	    BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
		zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
		    bp, (longlong_t)BP_GET_CHECKSUM(bp));
	}
	if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
	    BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
		zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
		    bp, (longlong_t)BP_GET_COMPRESS(bp));
	}
	if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
		zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
		    bp, (longlong_t)BP_GET_LSIZE(bp));
	}
	if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
		zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
		    bp, (longlong_t)BP_GET_PSIZE(bp));
	}

	if (BP_IS_EMBEDDED(bp)) {
		if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
			zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
			    bp, (longlong_t)BPE_GET_ETYPE(bp));
		}
	}

	/*
	 * Pool-specific checks.
	 *
	 * Note: it would be nice to verify that the blk_birth and
	 * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
	 * allows the birth time of log blocks (and dmu_sync()-ed blocks
	 * that are in the log) to be arbitrarily large.
	 */
	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
		uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
		if (vdevid >= spa->spa_root_vdev->vdev_children) {
			zfs_panic_recover("blkptr at %p DVA %u has invalid "
			    "VDEV %llu",
			    bp, i, (longlong_t)vdevid);
			continue;
		}
		vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
		if (vd == NULL) {
			zfs_panic_recover("blkptr at %p DVA %u has invalid "
			    "VDEV %llu",
			    bp, i, (longlong_t)vdevid);
			continue;
		}
		if (vd->vdev_ops == &vdev_hole_ops) {
			zfs_panic_recover("blkptr at %p DVA %u has hole "
			    "VDEV %llu",
			    bp, i, (longlong_t)vdevid);
			continue;
		}
		if (vd->vdev_ops == &vdev_missing_ops) {
			/*
			 * "missing" vdevs are valid during import, but we
			 * don't have their detailed info (e.g. asize), so
			 * we can't perform any more checks on them.
			 */
			continue;
		}
		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
		uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
		if (BP_IS_GANG(bp))
			asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
		if (offset + asize > vd->vdev_asize) {
			zfs_panic_recover("blkptr at %p DVA %u has invalid "
			    "OFFSET %llu",
			    bp, i, (longlong_t)offset);
		}
	}
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	zfs_blkptr_verify(spa, bp);

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
    void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_physdone = physdone;
	zio->io_prop = *zp;

	/*
	 * Data can be NULL if we are going to call zio_write_override() to
	 * provide the already-allocated BP.  But we may need the data to
	 * verify a dedup hit (if requested).  In this case, don't try to
	 * dedup (just take the already-allocated BP verbatim).
	 */
	if (data == NULL && zio->io_prop.zp_dedup_verify) {
		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
	}

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync(), keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}
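
/*
 * Sketch (illustrative): roughly how a dmu_sync()-style consumer uses the
 * override -- create the write zio, then substitute the block pointer that
 * was already written out.  Identifiers are placeholders.
 */
#if 0
	zio_t *wzio = zio_write(pio, spa, txg, bp, data, size, &zp,
	    ready_cb, NULL, done_cb, arg, ZIO_PRIORITY_ASYNC_WRITE, 0, zb);
	zio_write_override(wzio, &overridden_by, copies, B_FALSE);
#endif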

void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{

	/*
	 * The check for EMBEDDED is a performance optimization.  We
	 * process the free here (by ignoring it) rather than
	 * putting it on the list and then processing it in zio_free_sync().
	 */
	if (BP_IS_EMBEDDED(bp))
		return;
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
		    BP_GET_PSIZE(bp), 0)));
	}
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    uint64_t size, enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	if (zfs_trim_enabled)
		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
		    ZIO_STAGE_VDEV_IO_ASSESS;
	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	flags |= ZIO_FLAG_DONT_QUEUE;

	zio = zio_create(pio, spa, txg, bp, NULL, size,
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}
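
/*
 * Sketch (illustrative): frees deferred by zio_free() above are drained in
 * spa_sync(), which walks the per-txg bplist and issues each entry roughly
 * like this under a common parent:
 */
#if 0
	zio_nowait(zio_free_sync(pio, spa, txg, bp, BP_GET_PSIZE(bp), 0));
#endif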

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	dprintf_bp(bp, "claiming in txg %llu", txg);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
		    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    offset, size, done, private, priority, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_eck) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}
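
/*
 * Usage sketch (illustrative): vdev label reads go through zio_read_phys()
 * with labels == B_TRUE, which enables the label-boundary assertion above.
 * Identifiers are placeholders.
 */
#if 0
	zio_nowait(zio_read_phys(rio, vd, offset, size, buf,
	    ZIO_CHECKSUM_LABEL, done_cb, arg, ZIO_PRIORITY_SYNC_READ,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, B_TRUE));
#endif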

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, zio_priority_t priority,
    enum zio_flag flags, zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	/* Not all IO types require the vdev io done stage, e.g. free */
	if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
		pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	zio->io_physdone = pio->io_physdone;
	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
		zio->io_logical->io_phys_children++;

	return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, zio_priority_t priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

zio_t *
zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
{

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL,
	    ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
}

void
zio_shrink(zio_t *zio, uint64_t size)
{
	ASSERT(zio->io_executor == NULL);
	ASSERT(zio->io_orig_size == zio->io_size);
	ASSERT(size <= zio->io_size);

	/*
	 * We don't shrink for raidz because of problems with the
	 * reconstruction when reading back less than the block size.
	 * Note, BP_IS_RAIDZ() assumes no compression.
	 */
	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	if (!BP_IS_RAIDZ(zio->io_bp))
		zio->io_orig_size = zio->io_size = size;
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize =
		    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
		decode_embedded_bp_compressed(bp, zio->io_data);
	} else {
		ASSERT(!BP_IS_EMBEDDED(bp));
	}

	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		if (BP_IS_EMBEDDED(bp))
			return (ZIO_PIPELINE_CONTINUE);

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
			encode_embedded_bp_compressed(bp,
			    cbuf, compress, lsize, psize);
			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
			BP_SET_TYPE(bp, zio->io_prop.zp_type);
			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
			zio_buf_free(cbuf, lsize);
			bp->blk_birth = zio->io_txg;
			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
			ASSERT(spa_feature_is_active(spa,
			    SPA_FEATURE_EMBEDDED_DATA));
			return (ZIO_PIPELINE_CONTINUE);
		} else {
			/*
			 * Round the compressed size up to the ashift
			 * of the smallest-ashift device, and zero the tail.
			 * This ensures that the compressed size of the BP
			 * (and thus compressratio property) are correct,
			 * in that we charge for the padding used to fill out
			 * the last sector.
			 */
			ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
			size_t rounded = (size_t)P2ROUNDUP(psize,
			    1ULL << spa->spa_min_ashift);
			if (rounded >= lsize) {
				compress = ZIO_COMPRESS_OFF;
				zio_buf_free(cbuf, lsize);
				psize = lsize;
			} else {
				bzero((char *)cbuf + psize, rounded - psize);
				psize = rounded;
				zio_push_transform(zio, cbuf,
				    psize, lsize, NULL);
			}
		}
	}
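
	/*
	 * Worked example (illustrative): on a pool whose smallest ashift is
	 * 12, a 5000-byte compressed buffer is padded to P2ROUNDUP(5000,
	 * 4096) == 8192 bytes; had lsize been 8192, rounded >= lsize and the
	 * write would have fallen back to ZIO_COMPRESS_OFF instead.
	 */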

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
	    BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		if (zio->io_bp_orig.blk_birth != 0 &&
		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, zp->zp_type);
			BP_SET_LEVEL(bp, zp->zp_level);
			BP_SET_BIRTH(bp, zio->io_txg, 0);
		}
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = (cutinline ? TQ_FRONT : 0);

	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq if
	 * available.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);

	/*
	 * NB: We are assuming that the zio can only be dispatched
	 * to a single taskq at a time.  It would be a grievous error
	 * to dispatch the zio to another taskq at the same time.
	 */
#if defined(illumos) || !defined(_KERNEL)
	ASSERT(zio->io_tqent.tqent_next == NULL);
#else
	ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
#endif
	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
	    flags, &zio->io_tqent);
}

static boolean_t
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
		uint_t i;
		for (i = 0; i < tqs->stqs_count; i++) {
			if (taskq_member(tqs->stqs_taskq[i], executor))
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}

static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 *
 *	(1) the I/O completes
 *	(2) the pipeline stalls waiting for dependent child I/Os
 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
 *	(4) the I/O is delegated by vdev-level caching or aggregation
 *	(5) the I/O is deferred due to vdev-level queueing
 *	(6) the I/O is handed off to another thread.
 *
 * In all cases, the pipeline stops whenever there's no CPU work; it never
 * burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[highbit64(stage) - 1](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_root_zio "Godfather" I/O which
		 * will ensure they complete prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
	}

	zio_execute(zio);
}
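
/*
 * Usage sketch (illustrative): synchronous versus fire-and-forget issue.
 * Identifiers are placeholders.
 */
#if 0
	/* Synchronous: block until the i/o completes, get its error. */
	error = zio_wait(zio_read(NULL, spa, bp, buf, size, NULL, NULL,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, zb));

	/* Asynchronous: completion is reported through the done callback. */
	zio_nowait(zio_read(rio, spa, bp, buf, size, done_cb, arg,
	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, zb));
#endif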

/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_flags |= ZIO_FLAG_REEXECUTED;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */
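
/*
 * Sketch (illustrative): shape of a two-level gang tree.  The leader and
 * interior nodes are one-sector gang headers; leaves hold the user data.
 *
 *                 gang leader bp (BP_IS_GANG)
 *                           |
 *                    zio_gbh_phys_t
 *                   /       |       \
 *             data bp   data bp   gang bp
 *                                    |
 *                             zio_gbh_phys_t
 *                             /          \
 *                        data bp      data bp
 */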

static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	if (gn != NULL)
		return (pio);

	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
	    &pio->io_bookmark));
}

zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here.  The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
1766 * (Presently, nothing actually uses interior data checksums; 1767 * this is just good hygiene.) 1768 */ 1769 if (gn != pio->io_gang_leader->io_gang_tree) { 1770 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 1771 data, BP_GET_PSIZE(bp)); 1772 } 1773 /* 1774 * If we are here to damage data for testing purposes, 1775 * leave the GBH alone so that we can detect the damage. 1776 */ 1777 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 1778 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 1779 } else { 1780 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1781 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 1782 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1783 } 1784 1785 return (zio); 1786} 1787 1788/* ARGSUSED */ 1789zio_t * 1790zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1791{ 1792 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 1793 BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), 1794 ZIO_GANG_CHILD_FLAGS(pio))); 1795} 1796 1797/* ARGSUSED */ 1798zio_t * 1799zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1800{ 1801 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 1802 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 1803} 1804 1805static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 1806 NULL, 1807 zio_read_gang, 1808 zio_rewrite_gang, 1809 zio_free_gang, 1810 zio_claim_gang, 1811 NULL 1812}; 1813 1814static void zio_gang_tree_assemble_done(zio_t *zio); 1815 1816static zio_gang_node_t * 1817zio_gang_node_alloc(zio_gang_node_t **gnpp) 1818{ 1819 zio_gang_node_t *gn; 1820 1821 ASSERT(*gnpp == NULL); 1822 1823 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 1824 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 1825 *gnpp = gn; 1826 1827 return (gn); 1828} 1829 1830static void 1831zio_gang_node_free(zio_gang_node_t **gnpp) 1832{ 1833 zio_gang_node_t *gn = *gnpp; 1834 1835 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1836 ASSERT(gn->gn_child[g] == NULL); 1837 1838 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 1839 kmem_free(gn, sizeof (*gn)); 1840 *gnpp = NULL; 1841} 1842 1843static void 1844zio_gang_tree_free(zio_gang_node_t **gnpp) 1845{ 1846 zio_gang_node_t *gn = *gnpp; 1847 1848 if (gn == NULL) 1849 return; 1850 1851 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1852 zio_gang_tree_free(&gn->gn_child[g]); 1853 1854 zio_gang_node_free(gnpp); 1855} 1856 1857static void 1858zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 1859{ 1860 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 1861 1862 ASSERT(gio->io_gang_leader == gio); 1863 ASSERT(BP_IS_GANG(bp)); 1864 1865 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 1866 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 1867 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 1868} 1869 1870static void 1871zio_gang_tree_assemble_done(zio_t *zio) 1872{ 1873 zio_t *gio = zio->io_gang_leader; 1874 zio_gang_node_t *gn = zio->io_private; 1875 blkptr_t *bp = zio->io_bp; 1876 1877 ASSERT(gio == zio_unique_parent(zio)); 1878 ASSERT(zio->io_child_count == 0); 1879 1880 if (zio->io_error) 1881 return; 1882 1883 if (BP_SHOULD_BYTESWAP(bp)) 1884 byteswap_uint64_array(zio->io_data, zio->io_size); 1885 1886 ASSERT(zio->io_data == gn->gn_gbh); 1887 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1888 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1889 1890 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1891 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1892 if (!BP_IS_GANG(gbp)) 1893 continue; 1894 zio_gang_tree_assemble(gio, gbp, 
&gn->gn_child[g]); 1895 } 1896} 1897 1898static void 1899zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1900{ 1901 zio_t *gio = pio->io_gang_leader; 1902 zio_t *zio; 1903 1904 ASSERT(BP_IS_GANG(bp) == !!gn); 1905 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 1906 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 1907 1908 /* 1909 * If you're a gang header, your data is in gn->gn_gbh. 1910 * If you're a gang member, your data is in 'data' and gn == NULL. 1911 */ 1912 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1913 1914 if (gn != NULL) { 1915 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1916 1917 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1918 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1919 if (BP_IS_HOLE(gbp)) 1920 continue; 1921 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 1922 data = (char *)data + BP_GET_PSIZE(gbp); 1923 } 1924 } 1925 1926 if (gn == gio->io_gang_tree && gio->io_data != NULL) 1927 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 1928 1929 if (zio != pio) 1930 zio_nowait(zio); 1931} 1932 1933static int 1934zio_gang_assemble(zio_t *zio) 1935{ 1936 blkptr_t *bp = zio->io_bp; 1937 1938 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 1939 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1940 1941 zio->io_gang_leader = zio; 1942 1943 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1944 1945 return (ZIO_PIPELINE_CONTINUE); 1946} 1947 1948static int 1949zio_gang_issue(zio_t *zio) 1950{ 1951 blkptr_t *bp = zio->io_bp; 1952 1953 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 1954 return (ZIO_PIPELINE_STOP); 1955 1956 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 1957 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1958 1959 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 1960 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 1961 else 1962 zio_gang_tree_free(&zio->io_gang_tree); 1963 1964 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1965 1966 return (ZIO_PIPELINE_CONTINUE); 1967} 1968 1969static void 1970zio_write_gang_member_ready(zio_t *zio) 1971{ 1972 zio_t *pio = zio_unique_parent(zio); 1973 zio_t *gio = zio->io_gang_leader; 1974 dva_t *cdva = zio->io_bp->blk_dva; 1975 dva_t *pdva = pio->io_bp->blk_dva; 1976 uint64_t asize; 1977 1978 if (BP_IS_HOLE(zio->io_bp)) 1979 return; 1980 1981 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1982 1983 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 1984 ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 1985 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 1986 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 1987 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 1988 1989 mutex_enter(&pio->io_lock); 1990 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 1991 ASSERT(DVA_GET_GANG(&pdva[d])); 1992 asize = DVA_GET_ASIZE(&pdva[d]); 1993 asize += DVA_GET_ASIZE(&cdva[d]); 1994 DVA_SET_ASIZE(&pdva[d], asize); 1995 } 1996 mutex_exit(&pio->io_lock); 1997} 1998 1999static int 2000zio_write_gang_block(zio_t *pio) 2001{ 2002 spa_t *spa = pio->io_spa; 2003 blkptr_t *bp = pio->io_bp; 2004 zio_t *gio = pio->io_gang_leader; 2005 zio_t *zio; 2006 zio_gang_node_t *gn, **gnpp; 2007 zio_gbh_phys_t *gbh; 2008 uint64_t txg = pio->io_txg; 2009 uint64_t resid = pio->io_size; 2010 uint64_t lsize; 2011 int copies = gio->io_prop.zp_copies; 2012 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 2013 zio_prop_t zp; 2014 int error; 2015 2016 error = 
metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 2017 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 2018 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 2019 if (error) { 2020 pio->io_error = error; 2021 return (ZIO_PIPELINE_CONTINUE); 2022 } 2023 2024 if (pio == gio) { 2025 gnpp = &gio->io_gang_tree; 2026 } else { 2027 gnpp = pio->io_private; 2028 ASSERT(pio->io_ready == zio_write_gang_member_ready); 2029 } 2030 2031 gn = zio_gang_node_alloc(gnpp); 2032 gbh = gn->gn_gbh; 2033 bzero(gbh, SPA_GANGBLOCKSIZE); 2034 2035 /* 2036 * Create the gang header. 2037 */ 2038 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 2039 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2040 2041 /* 2042 * Create and nowait the gang children. 2043 */ 2044 for (int g = 0; resid != 0; resid -= lsize, g++) { 2045 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 2046 SPA_MINBLOCKSIZE); 2047 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 2048 2049 zp.zp_checksum = gio->io_prop.zp_checksum; 2050 zp.zp_compress = ZIO_COMPRESS_OFF; 2051 zp.zp_type = DMU_OT_NONE; 2052 zp.zp_level = 0; 2053 zp.zp_copies = gio->io_prop.zp_copies; 2054 zp.zp_dedup = B_FALSE; 2055 zp.zp_dedup_verify = B_FALSE; 2056 zp.zp_nopwrite = B_FALSE; 2057 2058 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 2059 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 2060 zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g], 2061 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 2062 &pio->io_bookmark)); 2063 } 2064 2065 /* 2066 * Set pio's pipeline to just wait for zio to finish. 2067 */ 2068 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2069 2070 zio_nowait(zio); 2071 2072 return (ZIO_PIPELINE_CONTINUE); 2073} 2074 2075/* 2076 * The zio_nop_write stage in the pipeline determines if allocating 2077 * a new bp is necessary. By leveraging a cryptographically secure checksum, 2078 * such as SHA256, we can compare the checksums of the new data and the old 2079 * to determine if allocating a new block is required. The nopwrite 2080 * feature can handle writes in either syncing or open context (i.e. zil 2081 * writes) and as a result is mutually exclusive with dedup. 2082 */ 2083static int 2084zio_nop_write(zio_t *zio) 2085{ 2086 blkptr_t *bp = zio->io_bp; 2087 blkptr_t *bp_orig = &zio->io_bp_orig; 2088 zio_prop_t *zp = &zio->io_prop; 2089 2090 ASSERT(BP_GET_LEVEL(bp) == 0); 2091 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 2092 ASSERT(zp->zp_nopwrite); 2093 ASSERT(!zp->zp_dedup); 2094 ASSERT(zio->io_bp_override == NULL); 2095 ASSERT(IO_IS_ALLOCATING(zio)); 2096 2097 /* 2098 * Check to see if the original bp and the new bp have matching 2099 * characteristics (i.e. same checksum, compression algorithms, etc). 2100 * If they don't then just continue with the pipeline which will 2101 * allocate a new bp. 2102 */ 2103 if (BP_IS_HOLE(bp_orig) || 2104 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || 2105 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 2106 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 2107 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 2108 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 2109 return (ZIO_PIPELINE_CONTINUE); 2110 2111 /* 2112 * If the checksums match then reset the pipeline so that we 2113 * avoid allocating a new bp and issuing any I/O. 
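 *
 * Illustrative scenario (not taken from a specific caller): a file
 * block is rewritten with byte-identical contents under
 * checksum=sha256.  The properties above all match and the checksums
 * compare equal, so we copy *bp_orig back into *bp, tag the zio with
 * ZIO_FLAG_NOPWRITE, and the "write" completes without allocating a
 * block or issuing any device I/O.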
2114 */ 2115 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 2116 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); 2117 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 2118 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 2119 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 2120 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 2121 sizeof (uint64_t)) == 0); 2122 2123 *bp = *bp_orig; 2124 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2125 zio->io_flags |= ZIO_FLAG_NOPWRITE; 2126 } 2127 2128 return (ZIO_PIPELINE_CONTINUE); 2129} 2130 2131/* 2132 * ========================================================================== 2133 * Dedup 2134 * ========================================================================== 2135 */ 2136static void 2137zio_ddt_child_read_done(zio_t *zio) 2138{ 2139 blkptr_t *bp = zio->io_bp; 2140 ddt_entry_t *dde = zio->io_private; 2141 ddt_phys_t *ddp; 2142 zio_t *pio = zio_unique_parent(zio); 2143 2144 mutex_enter(&pio->io_lock); 2145 ddp = ddt_phys_select(dde, bp); 2146 if (zio->io_error == 0) 2147 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 2148 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 2149 dde->dde_repair_data = zio->io_data; 2150 else 2151 zio_buf_free(zio->io_data, zio->io_size); 2152 mutex_exit(&pio->io_lock); 2153} 2154 2155static int 2156zio_ddt_read_start(zio_t *zio) 2157{ 2158 blkptr_t *bp = zio->io_bp; 2159 2160 ASSERT(BP_GET_DEDUP(bp)); 2161 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2162 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2163 2164 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2165 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2166 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 2167 ddt_phys_t *ddp = dde->dde_phys; 2168 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 2169 blkptr_t blk; 2170 2171 ASSERT(zio->io_vsd == NULL); 2172 zio->io_vsd = dde; 2173 2174 if (ddp_self == NULL) 2175 return (ZIO_PIPELINE_CONTINUE); 2176 2177 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2178 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 2179 continue; 2180 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2181 &blk); 2182 zio_nowait(zio_read(zio, zio->io_spa, &blk, 2183 zio_buf_alloc(zio->io_size), zio->io_size, 2184 zio_ddt_child_read_done, dde, zio->io_priority, 2185 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 2186 &zio->io_bookmark)); 2187 } 2188 return (ZIO_PIPELINE_CONTINUE); 2189 } 2190 2191 zio_nowait(zio_read(zio, zio->io_spa, bp, 2192 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 2193 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2194 2195 return (ZIO_PIPELINE_CONTINUE); 2196} 2197 2198static int 2199zio_ddt_read_done(zio_t *zio) 2200{ 2201 blkptr_t *bp = zio->io_bp; 2202 2203 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 2204 return (ZIO_PIPELINE_STOP); 2205 2206 ASSERT(BP_GET_DEDUP(bp)); 2207 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2208 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2209 2210 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2211 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2212 ddt_entry_t *dde = zio->io_vsd; 2213 if (ddt == NULL) { 2214 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2215 return (ZIO_PIPELINE_CONTINUE); 2216 } 2217 if (dde == NULL) { 2218 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2219 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2220 return (ZIO_PIPELINE_STOP); 2221 } 2222 if (dde->dde_repair_data != NULL) { 2223 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 2224 zio->io_child_error[ZIO_CHILD_DDT] = 0; 
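			/*
			 * The data read from another copy tracked by this
			 * DDT entry (see zio_ddt_child_read_done()) satisfies
			 * the failed read; clearing the child error lets the
			 * pipeline complete successfully.
			 */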
2225 } 2226 ddt_repair_done(ddt, dde); 2227 zio->io_vsd = NULL; 2228 } 2229 2230 ASSERT(zio->io_vsd == NULL); 2231 2232 return (ZIO_PIPELINE_CONTINUE); 2233} 2234 2235static boolean_t 2236zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2237{ 2238 spa_t *spa = zio->io_spa; 2239 2240 /* 2241 * Note: we compare the original data, not the transformed data, 2242 * because when zio->io_bp is an override bp, we will not have 2243 * pushed the I/O transforms. That's an important optimization 2244 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2245 */ 2246 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2247 zio_t *lio = dde->dde_lead_zio[p]; 2248 2249 if (lio != NULL) { 2250 return (lio->io_orig_size != zio->io_orig_size || 2251 bcmp(zio->io_orig_data, lio->io_orig_data, 2252 zio->io_orig_size) != 0); 2253 } 2254 } 2255 2256 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2257 ddt_phys_t *ddp = &dde->dde_phys[p]; 2258 2259 if (ddp->ddp_phys_birth != 0) { 2260 arc_buf_t *abuf = NULL; 2261 arc_flags_t aflags = ARC_FLAG_WAIT; 2262 blkptr_t blk = *zio->io_bp; 2263 int error; 2264 2265 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2266 2267 ddt_exit(ddt); 2268 2269 error = arc_read(NULL, spa, &blk, 2270 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2271 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2272 &aflags, &zio->io_bookmark); 2273 2274 if (error == 0) { 2275 if (arc_buf_size(abuf) != zio->io_orig_size || 2276 bcmp(abuf->b_data, zio->io_orig_data, 2277 zio->io_orig_size) != 0) 2278 error = SET_ERROR(EEXIST); 2279 VERIFY(arc_buf_remove_ref(abuf, &abuf)); 2280 } 2281 2282 ddt_enter(ddt); 2283 return (error != 0); 2284 } 2285 } 2286 2287 return (B_FALSE); 2288} 2289 2290static void 2291zio_ddt_child_write_ready(zio_t *zio) 2292{ 2293 int p = zio->io_prop.zp_copies; 2294 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2295 ddt_entry_t *dde = zio->io_private; 2296 ddt_phys_t *ddp = &dde->dde_phys[p]; 2297 zio_t *pio; 2298 2299 if (zio->io_error) 2300 return; 2301 2302 ddt_enter(ddt); 2303 2304 ASSERT(dde->dde_lead_zio[p] == zio); 2305 2306 ddt_phys_fill(ddp, zio->io_bp); 2307 2308 while ((pio = zio_walk_parents(zio)) != NULL) 2309 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2310 2311 ddt_exit(ddt); 2312} 2313 2314static void 2315zio_ddt_child_write_done(zio_t *zio) 2316{ 2317 int p = zio->io_prop.zp_copies; 2318 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2319 ddt_entry_t *dde = zio->io_private; 2320 ddt_phys_t *ddp = &dde->dde_phys[p]; 2321 2322 ddt_enter(ddt); 2323 2324 ASSERT(ddp->ddp_refcnt == 0); 2325 ASSERT(dde->dde_lead_zio[p] == zio); 2326 dde->dde_lead_zio[p] = NULL; 2327 2328 if (zio->io_error == 0) { 2329 while (zio_walk_parents(zio) != NULL) 2330 ddt_phys_addref(ddp); 2331 } else { 2332 ddt_phys_clear(ddp); 2333 } 2334 2335 ddt_exit(ddt); 2336} 2337 2338static void 2339zio_ddt_ditto_write_done(zio_t *zio) 2340{ 2341 int p = DDT_PHYS_DITTO; 2342 zio_prop_t *zp = &zio->io_prop; 2343 blkptr_t *bp = zio->io_bp; 2344 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2345 ddt_entry_t *dde = zio->io_private; 2346 ddt_phys_t *ddp = &dde->dde_phys[p]; 2347 ddt_key_t *ddk = &dde->dde_key; 2348 2349 ddt_enter(ddt); 2350 2351 ASSERT(ddp->ddp_refcnt == 0); 2352 ASSERT(dde->dde_lead_zio[p] == zio); 2353 dde->dde_lead_zio[p] = NULL; 2354 2355 if (zio->io_error == 0) { 2356 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2357 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2358 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2359 if 
(ddp->ddp_phys_birth != 0) 2360 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2361 ddt_phys_fill(ddp, bp); 2362 } 2363 2364 ddt_exit(ddt); 2365} 2366 2367static int 2368zio_ddt_write(zio_t *zio) 2369{ 2370 spa_t *spa = zio->io_spa; 2371 blkptr_t *bp = zio->io_bp; 2372 uint64_t txg = zio->io_txg; 2373 zio_prop_t *zp = &zio->io_prop; 2374 int p = zp->zp_copies; 2375 int ditto_copies; 2376 zio_t *cio = NULL; 2377 zio_t *dio = NULL; 2378 ddt_t *ddt = ddt_select(spa, bp); 2379 ddt_entry_t *dde; 2380 ddt_phys_t *ddp; 2381 2382 ASSERT(BP_GET_DEDUP(bp)); 2383 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2384 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2385 2386 ddt_enter(ddt); 2387 dde = ddt_lookup(ddt, bp, B_TRUE); 2388 ddp = &dde->dde_phys[p]; 2389 2390 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2391 /* 2392 * If we're using a weak checksum, upgrade to a strong checksum 2393 * and try again. If we're already using a strong checksum, 2394 * we can't resolve it, so just convert to an ordinary write. 2395 * (And automatically e-mail a paper to Nature?) 2396 */ 2397 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 2398 zp->zp_checksum = spa_dedup_checksum(spa); 2399 zio_pop_transforms(zio); 2400 zio->io_stage = ZIO_STAGE_OPEN; 2401 BP_ZERO(bp); 2402 } else { 2403 zp->zp_dedup = B_FALSE; 2404 } 2405 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2406 ddt_exit(ddt); 2407 return (ZIO_PIPELINE_CONTINUE); 2408 } 2409 2410 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2411 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2412 2413 if (ditto_copies > ddt_ditto_copies_present(dde) && 2414 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2415 zio_prop_t czp = *zp; 2416 2417 czp.zp_copies = ditto_copies; 2418 2419 /* 2420 * If we arrived here with an override bp, we won't have run 2421 * the transform stack, so we won't have the data we need to 2422 * generate a child i/o. So, toss the override bp and restart. 2423 * This is safe, because using the override bp is just an 2424 * optimization; and it's rare, so the cost doesn't matter. 
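 *
 * (After the restart below, the i/o rewinds to ZIO_STAGE_OPEN and runs
 * the full ZIO_WRITE_PIPELINE again, this time producing the
 * transformed data that the ditto child i/o needs.)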
2425 */ 2426 if (zio->io_bp_override) { 2427 zio_pop_transforms(zio); 2428 zio->io_stage = ZIO_STAGE_OPEN; 2429 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2430 zio->io_bp_override = NULL; 2431 BP_ZERO(bp); 2432 ddt_exit(ddt); 2433 return (ZIO_PIPELINE_CONTINUE); 2434 } 2435 2436 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2437 zio->io_orig_size, &czp, NULL, NULL, 2438 zio_ddt_ditto_write_done, dde, zio->io_priority, 2439 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2440 2441 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2442 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2443 } 2444 2445 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2446 if (ddp->ddp_phys_birth != 0) 2447 ddt_bp_fill(ddp, bp, txg); 2448 if (dde->dde_lead_zio[p] != NULL) 2449 zio_add_child(zio, dde->dde_lead_zio[p]); 2450 else 2451 ddt_phys_addref(ddp); 2452 } else if (zio->io_bp_override) { 2453 ASSERT(bp->blk_birth == txg); 2454 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2455 ddt_phys_fill(ddp, bp); 2456 ddt_phys_addref(ddp); 2457 } else { 2458 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2459 zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, 2460 zio_ddt_child_write_done, dde, zio->io_priority, 2461 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2462 2463 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2464 dde->dde_lead_zio[p] = cio; 2465 } 2466 2467 ddt_exit(ddt); 2468 2469 if (cio) 2470 zio_nowait(cio); 2471 if (dio) 2472 zio_nowait(dio); 2473 2474 return (ZIO_PIPELINE_CONTINUE); 2475} 2476 2477ddt_entry_t *freedde; /* for debugging */ 2478 2479static int 2480zio_ddt_free(zio_t *zio) 2481{ 2482 spa_t *spa = zio->io_spa; 2483 blkptr_t *bp = zio->io_bp; 2484 ddt_t *ddt = ddt_select(spa, bp); 2485 ddt_entry_t *dde; 2486 ddt_phys_t *ddp; 2487 2488 ASSERT(BP_GET_DEDUP(bp)); 2489 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2490 2491 ddt_enter(ddt); 2492 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2493 ddp = ddt_phys_select(dde, bp); 2494 ddt_phys_decref(ddp); 2495 ddt_exit(ddt); 2496 2497 return (ZIO_PIPELINE_CONTINUE); 2498} 2499 2500/* 2501 * ========================================================================== 2502 * Allocate and free blocks 2503 * ========================================================================== 2504 */ 2505static int 2506zio_dva_allocate(zio_t *zio) 2507{ 2508 spa_t *spa = zio->io_spa; 2509 metaslab_class_t *mc = spa_normal_class(spa); 2510 blkptr_t *bp = zio->io_bp; 2511 int error; 2512 int flags = 0; 2513 2514 if (zio->io_gang_leader == NULL) { 2515 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2516 zio->io_gang_leader = zio; 2517 } 2518 2519 ASSERT(BP_IS_HOLE(bp)); 2520 ASSERT0(BP_GET_NDVAS(bp)); 2521 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2522 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2523 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2524 2525 /* 2526 * The dump device does not support gang blocks so allocation on 2527 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2528 * the "fast" gang feature. 2529 */ 2530 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2531 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
2532 METASLAB_GANG_CHILD : 0; 2533 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2534 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2535 2536 if (error) { 2537 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2538 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2539 error); 2540 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2541 return (zio_write_gang_block(zio)); 2542 zio->io_error = error; 2543 } 2544 2545 return (ZIO_PIPELINE_CONTINUE); 2546} 2547 2548static int 2549zio_dva_free(zio_t *zio) 2550{ 2551 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2552 2553 return (ZIO_PIPELINE_CONTINUE); 2554} 2555 2556static int 2557zio_dva_claim(zio_t *zio) 2558{ 2559 int error; 2560 2561 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2562 if (error) 2563 zio->io_error = error; 2564 2565 return (ZIO_PIPELINE_CONTINUE); 2566} 2567 2568/* 2569 * Undo an allocation. This is used by zio_done() when an I/O fails 2570 * and we want to give back the block we just allocated. 2571 * This handles both normal blocks and gang blocks. 2572 */ 2573static void 2574zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2575{ 2576 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2577 ASSERT(zio->io_bp_override == NULL); 2578 2579 if (!BP_IS_HOLE(bp)) 2580 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2581 2582 if (gn != NULL) { 2583 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2584 zio_dva_unallocate(zio, gn->gn_child[g], 2585 &gn->gn_gbh->zg_blkptr[g]); 2586 } 2587 } 2588} 2589 2590/* 2591 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2592 */ 2593int 2594zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2595 uint64_t size, boolean_t use_slog) 2596{ 2597 int error = 1; 2598 2599 ASSERT(txg > spa_syncing_txg(spa)); 2600 2601 /* 2602 * ZIL blocks are always contiguous (i.e. not gang blocks) so we 2603 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2604 * when allocating them. 2605 */ 2606 if (use_slog) { 2607 error = metaslab_alloc(spa, spa_log_class(spa), size, 2608 new_bp, 1, txg, old_bp, 2609 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2610 } 2611 2612 if (error) { 2613 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2614 new_bp, 1, txg, old_bp, 2615 METASLAB_HINTBP_AVOID); 2616 } 2617 2618 if (error == 0) { 2619 BP_SET_LSIZE(new_bp, size); 2620 BP_SET_PSIZE(new_bp, size); 2621 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2622 BP_SET_CHECKSUM(new_bp, 2623 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2624 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2625 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2626 BP_SET_LEVEL(new_bp, 0); 2627 BP_SET_DEDUP(new_bp, 0); 2628 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2629 } 2630 2631 return (error); 2632} 2633 2634/* 2635 * Free an intent log block. 
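 * (Context, for illustration: the ZIL frees a log block once its
 * contents have been committed to the main pool by a txg sync, so the
 * block is no longer needed for replay.)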
2636 */ 2637void 2638zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2639{ 2640 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2641 ASSERT(!BP_IS_GANG(bp)); 2642 2643 zio_free(spa, txg, bp); 2644} 2645 2646/* 2647 * ========================================================================== 2648 * Read, write and delete to physical devices 2649 * ========================================================================== 2650 */ 2651static int 2652zio_vdev_io_start(zio_t *zio) 2653{ 2654 vdev_t *vd = zio->io_vd; 2655 uint64_t align; 2656 spa_t *spa = zio->io_spa; 2657 int ret; 2658 2659 ASSERT(zio->io_error == 0); 2660 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2661 2662 if (vd == NULL) { 2663 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2664 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2665 2666 /* 2667 * The mirror_ops handle multiple DVAs in a single BP. 2668 */ 2669 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2670 } 2671 2672 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && 2673 zio->io_priority == ZIO_PRIORITY_NOW) { 2674 trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); 2675 return (ZIO_PIPELINE_CONTINUE); 2676 } 2677 2678 /* 2679 * We keep track of time-sensitive I/Os so that the scan thread 2680 * can quickly react to certain workloads. In particular, we care 2681 * about non-scrubbing, top-level reads and writes with the following 2682 * characteristics: 2683 * - synchronous writes of user data to non-slog devices 2684 * - any reads of user data 2685 * When these conditions are met, adjust the timestamp of spa_last_io 2686 * which allows the scan thread to adjust its workload accordingly. 2687 */ 2688 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2689 vd == vd->vdev_top && !vd->vdev_islog && 2690 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2691 zio->io_txg != spa_syncing_txg(spa)) { 2692 uint64_t old = spa->spa_last_io; 2693 uint64_t new = ddi_get_lbolt64(); 2694 if (old != new) 2695 (void) atomic_cas_64(&spa->spa_last_io, old, new); 2696 } 2697 2698 align = 1ULL << vd->vdev_top->vdev_ashift; 2699 2700 if ((!(zio->io_flags & ZIO_FLAG_PHYSICAL) || 2701 (vd->vdev_top->vdev_physical_ashift > SPA_MINBLOCKSHIFT)) && 2702 P2PHASE(zio->io_size, align) != 0) { 2703 /* Transform logical writes to be a full physical block size. */ 2704 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2705 char *abuf = NULL; 2706 if (zio->io_type == ZIO_TYPE_READ || 2707 zio->io_type == ZIO_TYPE_WRITE) 2708 abuf = zio_buf_alloc(asize); 2709 ASSERT(vd == vd->vdev_top); 2710 if (zio->io_type == ZIO_TYPE_WRITE) { 2711 bcopy(zio->io_data, abuf, zio->io_size); 2712 bzero(abuf + zio->io_size, asize - zio->io_size); 2713 } 2714 zio_push_transform(zio, abuf, asize, abuf ? asize : 0, 2715 zio_subblock); 2716 } 2717 2718 /* 2719 * If this is not a physical io, make sure that it is properly aligned 2720 * before proceeding. 2721 */ 2722 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { 2723 ASSERT0(P2PHASE(zio->io_offset, align)); 2724 ASSERT0(P2PHASE(zio->io_size, align)); 2725 } else { 2726 /* 2727 * For physical writes, we allow 512b aligned writes and assume 2728 * the device will perform a read-modify-write as necessary. 
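 * (Illustration: a 1KB ZIO_FLAG_PHYSICAL write to a top-level vdev with
 * vdev_ashift 12 but vdev_physical_ashift 9 is issued unpadded; it only
 * has to satisfy the SPA_MINBLOCKSIZE asserts below.)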
2729 */ 2730 ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); 2731 ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); 2732 } 2733 2734 VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); 2735 2736 /* 2737 * If this is a repair I/O, and there's no self-healing involved -- 2738 * that is, we're just resilvering what we expect to resilver -- 2739 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2740 * This prevents spurious resilvering with nested replication. 2741 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2742 * A is out of date, we'll read from C+D, then use the data to 2743 * resilver A+B -- but we don't actually want to resilver B, just A. 2744 * The top-level mirror has no way to know this, so instead we just 2745 * discard unnecessary repairs as we work our way down the vdev tree. 2746 * The same logic applies to any form of nested replication: 2747 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 2748 */ 2749 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2750 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2751 zio->io_txg != 0 && /* not a delegated i/o */ 2752 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2753 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2754 zio_vdev_io_bypass(zio); 2755 return (ZIO_PIPELINE_CONTINUE); 2756 } 2757 2758 if (vd->vdev_ops->vdev_op_leaf) { 2759 switch (zio->io_type) { 2760 case ZIO_TYPE_READ: 2761 if (vdev_cache_read(zio)) 2762 return (ZIO_PIPELINE_CONTINUE); 2763 /* FALLTHROUGH */ 2764 case ZIO_TYPE_WRITE: 2765 case ZIO_TYPE_FREE: 2766 if ((zio = vdev_queue_io(zio)) == NULL) 2767 return (ZIO_PIPELINE_STOP); 2768 2769 if (!vdev_accessible(vd, zio)) { 2770 zio->io_error = SET_ERROR(ENXIO); 2771 zio_interrupt(zio); 2772 return (ZIO_PIPELINE_STOP); 2773 } 2774 break; 2775 } 2776 /* 2777 * Note that we ignore repair writes for TRIM because they can 2778 * conflict with normal writes. This isn't an issue because, by 2779 * definition, we only repair blocks that aren't freed. 2780 */ 2781 if (zio->io_type == ZIO_TYPE_WRITE && 2782 !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2783 !trim_map_write_start(zio)) 2784 return (ZIO_PIPELINE_STOP); 2785 } 2786 2787 ret = vd->vdev_ops->vdev_op_io_start(zio); 2788 ASSERT(ret == ZIO_PIPELINE_STOP); 2789 2790 return (ret); 2791} 2792 2793static int 2794zio_vdev_io_done(zio_t *zio) 2795{ 2796 vdev_t *vd = zio->io_vd; 2797 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 2798 boolean_t unexpected_error = B_FALSE; 2799 2800 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2801 return (ZIO_PIPELINE_STOP); 2802 2803 ASSERT(zio->io_type == ZIO_TYPE_READ || 2804 zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); 2805 2806 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2807 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || 2808 zio->io_type == ZIO_TYPE_FREE)) { 2809 2810 if (zio->io_type == ZIO_TYPE_WRITE && 2811 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 2812 trim_map_write_done(zio); 2813 2814 vdev_queue_io_done(zio); 2815 2816 if (zio->io_type == ZIO_TYPE_WRITE) 2817 vdev_cache_write(zio); 2818 2819 if (zio_injection_enabled && zio->io_error == 0) 2820 zio->io_error = zio_handle_device_injection(vd, 2821 zio, EIO); 2822 2823 if (zio_injection_enabled && zio->io_error == 0) 2824 zio->io_error = zio_handle_label_injection(zio, EIO); 2825 2826 if (zio->io_error) { 2827 if (zio->io_error == ENOTSUP && 2828 zio->io_type == ZIO_TYPE_FREE) { 2829 /* Not all devices support TRIM. 
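			 * The ENOTSUP is deliberately left in place;
			 * zio_vdev_io_assess() later counts it as an
			 * "unsupported" TRIM request rather than treating
			 * it as an unexpected device error.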
*/ 2830 } else if (!vdev_accessible(vd, zio)) { 2831 zio->io_error = SET_ERROR(ENXIO); 2832 } else { 2833 unexpected_error = B_TRUE; 2834 } 2835 } 2836 } 2837 2838 ops->vdev_op_io_done(zio); 2839 2840 if (unexpected_error) 2841 VERIFY(vdev_probe(vd, zio) == NULL); 2842 2843 return (ZIO_PIPELINE_CONTINUE); 2844} 2845 2846/* 2847 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2848 * disk, and use that to finish the checksum ereport later. 2849 */ 2850static void 2851zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2852 const void *good_buf) 2853{ 2854 /* no processing needed */ 2855 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2856} 2857 2858/*ARGSUSED*/ 2859void 2860zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2861{ 2862 void *buf = zio_buf_alloc(zio->io_size); 2863 2864 bcopy(zio->io_data, buf, zio->io_size); 2865 2866 zcr->zcr_cbinfo = zio->io_size; 2867 zcr->zcr_cbdata = buf; 2868 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2869 zcr->zcr_free = zio_buf_free; 2870} 2871 2872static int 2873zio_vdev_io_assess(zio_t *zio) 2874{ 2875 vdev_t *vd = zio->io_vd; 2876 2877 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2878 return (ZIO_PIPELINE_STOP); 2879 2880 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2881 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2882 2883 if (zio->io_vsd != NULL) { 2884 zio->io_vsd_ops->vsd_free(zio); 2885 zio->io_vsd = NULL; 2886 } 2887 2888 if (zio_injection_enabled && zio->io_error == 0) 2889 zio->io_error = zio_handle_fault_injection(zio, EIO); 2890 2891 if (zio->io_type == ZIO_TYPE_FREE && 2892 zio->io_priority != ZIO_PRIORITY_NOW) { 2893 switch (zio->io_error) { 2894 case 0: 2895 ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 2896 ZIO_TRIM_STAT_BUMP(success); 2897 break; 2898 case EOPNOTSUPP: 2899 ZIO_TRIM_STAT_BUMP(unsupported); 2900 break; 2901 default: 2902 ZIO_TRIM_STAT_BUMP(failed); 2903 break; 2904 } 2905 } 2906 2907 /* 2908 * If the I/O failed, determine whether we should attempt to retry it. 2909 * 2910 * On retry, we cut in line in the issue queue, since we don't want 2911 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 2912 */ 2913 if (zio->io_error && vd == NULL && 2914 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2915 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2916 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2917 zio->io_error = 0; 2918 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2919 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2920 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2921 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2922 zio_requeue_io_start_cut_in_line); 2923 return (ZIO_PIPELINE_STOP); 2924 } 2925 2926 /* 2927 * If we got an error on a leaf device, convert it to ENXIO 2928 * if the device is not accessible at all. 2929 */ 2930 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2931 !vdev_accessible(vd, zio)) 2932 zio->io_error = SET_ERROR(ENXIO); 2933 2934 /* 2935 * If we can't write to an interior vdev (mirror or RAID-Z), 2936 * set vdev_cant_write so that we stop trying to allocate from it. 
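 * (E.g. if every leaf of a mirror fails a write with ENXIO, the mirror
 * vdev itself reports ENXIO here and is marked cant_write, steering
 * future DVA allocations toward healthier top-level vdevs.)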
2937 */ 2938 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2939 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 2940 vd->vdev_cant_write = B_TRUE; 2941 } 2942 2943 if (zio->io_error) 2944 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2945 2946 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2947 zio->io_physdone != NULL) { 2948 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 2949 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 2950 zio->io_physdone(zio->io_logical); 2951 } 2952 2953 return (ZIO_PIPELINE_CONTINUE); 2954} 2955 2956void 2957zio_vdev_io_reissue(zio_t *zio) 2958{ 2959 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2960 ASSERT(zio->io_error == 0); 2961 2962 zio->io_stage >>= 1; 2963} 2964 2965void 2966zio_vdev_io_redone(zio_t *zio) 2967{ 2968 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2969 2970 zio->io_stage >>= 1; 2971} 2972 2973void 2974zio_vdev_io_bypass(zio_t *zio) 2975{ 2976 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2977 ASSERT(zio->io_error == 0); 2978 2979 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2980 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2981} 2982 2983/* 2984 * ========================================================================== 2985 * Generate and verify checksums 2986 * ========================================================================== 2987 */ 2988static int 2989zio_checksum_generate(zio_t *zio) 2990{ 2991 blkptr_t *bp = zio->io_bp; 2992 enum zio_checksum checksum; 2993 2994 if (bp == NULL) { 2995 /* 2996 * This is zio_write_phys(). 2997 * We're either generating a label checksum, or none at all. 2998 */ 2999 checksum = zio->io_prop.zp_checksum; 3000 3001 if (checksum == ZIO_CHECKSUM_OFF) 3002 return (ZIO_PIPELINE_CONTINUE); 3003 3004 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 3005 } else { 3006 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 3007 ASSERT(!IO_IS_ALLOCATING(zio)); 3008 checksum = ZIO_CHECKSUM_GANG_HEADER; 3009 } else { 3010 checksum = BP_GET_CHECKSUM(bp); 3011 } 3012 } 3013 3014 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 3015 3016 return (ZIO_PIPELINE_CONTINUE); 3017} 3018 3019static int 3020zio_checksum_verify(zio_t *zio) 3021{ 3022 zio_bad_cksum_t info; 3023 blkptr_t *bp = zio->io_bp; 3024 int error; 3025 3026 ASSERT(zio->io_vd != NULL); 3027 3028 if (bp == NULL) { 3029 /* 3030 * This is zio_read_phys(). 3031 * We're either verifying a label checksum, or nothing at all. 3032 */ 3033 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 3034 return (ZIO_PIPELINE_CONTINUE); 3035 3036 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 3037 } 3038 3039 if ((error = zio_checksum_error(zio, &info)) != 0) { 3040 zio->io_error = error; 3041 if (error == ECKSUM && 3042 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3043 zfs_ereport_start_checksum(zio->io_spa, 3044 zio->io_vd, zio, zio->io_offset, 3045 zio->io_size, NULL, &info); 3046 } 3047 } 3048 3049 return (ZIO_PIPELINE_CONTINUE); 3050} 3051 3052/* 3053 * Called by RAID-Z to ensure we don't compute the checksum twice. 3054 */ 3055void 3056zio_checksum_verified(zio_t *zio) 3057{ 3058 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 3059} 3060 3061/* 3062 * ========================================================================== 3063 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 3064 * An error of 0 indicates success. ENXIO indicates whole-device failure, 3065 * which may be transient (e.g. unplugged) or permanent. 
ECKSUM and EIO 3066 * indicate errors that are specific to one I/O, and most likely permanent. 3067 * Any other error is presumed to be worse because we weren't expecting it. 3068 * ========================================================================== 3069 */ 3070int 3071zio_worst_error(int e1, int e2) 3072{ 3073 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 3074 int r1, r2; 3075 3076 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 3077 if (e1 == zio_error_rank[r1]) 3078 break; 3079 3080 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 3081 if (e2 == zio_error_rank[r2]) 3082 break; 3083 3084 return (r1 > r2 ? e1 : e2); 3085} 3086 3087/* 3088 * ========================================================================== 3089 * I/O completion 3090 * ========================================================================== 3091 */ 3092static int 3093zio_ready(zio_t *zio) 3094{ 3095 blkptr_t *bp = zio->io_bp; 3096 zio_t *pio, *pio_next; 3097 3098 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 3099 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 3100 return (ZIO_PIPELINE_STOP); 3101 3102 if (zio->io_ready) { 3103 ASSERT(IO_IS_ALLOCATING(zio)); 3104 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 3105 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 3106 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 3107 3108 zio->io_ready(zio); 3109 } 3110 3111 if (bp != NULL && bp != &zio->io_bp_copy) 3112 zio->io_bp_copy = *bp; 3113 3114 if (zio->io_error) 3115 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3116 3117 mutex_enter(&zio->io_lock); 3118 zio->io_state[ZIO_WAIT_READY] = 1; 3119 pio = zio_walk_parents(zio); 3120 mutex_exit(&zio->io_lock); 3121 3122 /* 3123 * As we notify zio's parents, new parents could be added. 3124 * New parents go to the head of zio's io_parent_list, however, 3125 * so we will (correctly) not notify them. The remainder of zio's 3126 * io_parent_list, from 'pio_next' onward, cannot change because 3127 * all parents must wait for us to be done before they can be done. 3128 */ 3129 for (; pio != NULL; pio = pio_next) { 3130 pio_next = zio_walk_parents(zio); 3131 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 3132 } 3133 3134 if (zio->io_flags & ZIO_FLAG_NODATA) { 3135 if (BP_IS_GANG(bp)) { 3136 zio->io_flags &= ~ZIO_FLAG_NODATA; 3137 } else { 3138 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 3139 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 3140 } 3141 } 3142 3143 if (zio_injection_enabled && 3144 zio->io_spa->spa_syncing_txg == zio->io_txg) 3145 zio_handle_ignored_writes(zio); 3146 3147 return (ZIO_PIPELINE_CONTINUE); 3148} 3149 3150static int 3151zio_done(zio_t *zio) 3152{ 3153 spa_t *spa = zio->io_spa; 3154 zio_t *lio = zio->io_logical; 3155 blkptr_t *bp = zio->io_bp; 3156 vdev_t *vd = zio->io_vd; 3157 uint64_t psize = zio->io_size; 3158 zio_t *pio, *pio_next; 3159 3160 /* 3161 * If our children haven't all completed, 3162 * wait for them and then repeat this pipeline stage. 
3163 */ 3164 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 3165 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 3166 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 3167 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 3168 return (ZIO_PIPELINE_STOP); 3169 3170 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 3171 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3172 ASSERT(zio->io_children[c][w] == 0); 3173 3174 if (bp != NULL && !BP_IS_EMBEDDED(bp)) { 3175 ASSERT(bp->blk_pad[0] == 0); 3176 ASSERT(bp->blk_pad[1] == 0); 3177 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 3178 (bp == zio_unique_parent(zio)->io_bp)); 3179 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 3180 zio->io_bp_override == NULL && 3181 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 3182 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 3183 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 3184 ASSERT(BP_COUNT_GANG(bp) == 0 || 3185 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 3186 } 3187 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 3188 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 3189 } 3190 3191 /* 3192 * If there were child vdev/gang/ddt errors, they apply to us now. 3193 */ 3194 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 3195 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 3196 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 3197 3198 /* 3199 * If the I/O on the transformed data was successful, generate any 3200 * checksum reports now while we still have the transformed data. 3201 */ 3202 if (zio->io_error == 0) { 3203 while (zio->io_cksum_report != NULL) { 3204 zio_cksum_report_t *zcr = zio->io_cksum_report; 3205 uint64_t align = zcr->zcr_align; 3206 uint64_t asize = P2ROUNDUP(psize, align); 3207 char *abuf = zio->io_data; 3208 3209 if (asize != psize) { 3210 abuf = zio_buf_alloc(asize); 3211 bcopy(zio->io_data, abuf, psize); 3212 bzero(abuf + psize, asize - psize); 3213 } 3214 3215 zio->io_cksum_report = zcr->zcr_next; 3216 zcr->zcr_next = NULL; 3217 zcr->zcr_finish(zcr, abuf); 3218 zfs_ereport_free_checksum(zcr); 3219 3220 if (asize != psize) 3221 zio_buf_free(abuf, asize); 3222 } 3223 } 3224 3225 zio_pop_transforms(zio); /* note: may set zio->io_error */ 3226 3227 vdev_stat_update(zio, psize); 3228 3229 if (zio->io_error) { 3230 /* 3231 * If this I/O is attached to a particular vdev, 3232 * generate an error message describing the I/O failure 3233 * at the block level. We ignore these errors if the 3234 * device is currently unavailable. 3235 */ 3236 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3237 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3238 3239 if ((zio->io_error == EIO || !(zio->io_flags & 3240 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3241 zio == lio) { 3242 /* 3243 * For logical I/O requests, tell the SPA to log the 3244 * error and generate a logical data ereport. 3245 */ 3246 spa_log_error(spa, zio); 3247 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3248 0, 0); 3249 } 3250 } 3251 3252 if (zio->io_error && zio == lio) { 3253 /* 3254 * Determine whether zio should be reexecuted. This will 3255 * propagate all the way to the root via zio_notify_parent(). 
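 * (ZIO_REEXECUTE_NOW makes the root i/o retry immediately, via the
 * claim taskq below, while ZIO_REEXECUTE_SUSPEND parks the tree on
 * spa_suspend_zio_root via zio_suspend() until zio_resume() is called
 * on the pool.)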
3256 */ 3257 ASSERT(vd == NULL && bp != NULL); 3258 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3259 3260 if (IO_IS_ALLOCATING(zio) && 3261 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3262 if (zio->io_error != ENOSPC) 3263 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3264 else 3265 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3266 } 3267 3268 if ((zio->io_type == ZIO_TYPE_READ || 3269 zio->io_type == ZIO_TYPE_FREE) && 3270 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3271 zio->io_error == ENXIO && 3272 spa_load_state(spa) == SPA_LOAD_NONE && 3273 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3274 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3275 3276 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3277 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3278 3279 /* 3280 * Here is a possibly good place to attempt to do 3281 * either combinatorial reconstruction or error correction 3282 * based on checksums. It also might be a good place 3283 * to send out preliminary ereports before we suspend 3284 * processing. 3285 */ 3286 } 3287 3288 /* 3289 * If there were logical child errors, they apply to us now. 3290 * We defer this until now to avoid conflating logical child 3291 * errors with errors that happened to the zio itself when 3292 * updating vdev stats and reporting FMA events above. 3293 */ 3294 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3295 3296 if ((zio->io_error || zio->io_reexecute) && 3297 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3298 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3299 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3300 3301 zio_gang_tree_free(&zio->io_gang_tree); 3302 3303 /* 3304 * Godfather I/Os should never suspend. 3305 */ 3306 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3307 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3308 zio->io_reexecute = 0; 3309 3310 if (zio->io_reexecute) { 3311 /* 3312 * This is a logical I/O that wants to reexecute. 3313 * 3314 * Reexecute is top-down. When an i/o fails, if it's not 3315 * the root, it simply notifies its parent and sticks around. 3316 * The parent, seeing that it still has children in zio_done(), 3317 * does the same. This percolates all the way up to the root. 3318 * The root i/o will reexecute or suspend the entire tree. 3319 * 3320 * This approach ensures that zio_reexecute() honors 3321 * all the original i/o dependency relationships, e.g. 3322 * parents not executing until children are ready. 3323 */ 3324 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3325 3326 zio->io_gang_leader = NULL; 3327 3328 mutex_enter(&zio->io_lock); 3329 zio->io_state[ZIO_WAIT_DONE] = 1; 3330 mutex_exit(&zio->io_lock); 3331 3332 /* 3333 * "The Godfather" I/O monitors its children but is 3334 * not a true parent to them. It will track them through 3335 * the pipeline but severs its ties whenever they get into 3336 * trouble (e.g. suspended). This allows "The Godfather" 3337 * I/O to return status without blocking. 3338 */ 3339 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3340 zio_link_t *zl = zio->io_walk_link; 3341 pio_next = zio_walk_parents(zio); 3342 3343 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3344 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3345 zio_remove_child(pio, zio, zl); 3346 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3347 } 3348 } 3349 3350 if ((pio = zio_unique_parent(zio)) != NULL) { 3351 /* 3352 * We're not a root i/o, so there's nothing to do 3353 * but notify our parent. 
Don't propagate errors 3354 * upward since we haven't permanently failed yet. 3355 */ 3356 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3357 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3358 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3359 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3360 /* 3361 * We'd fail again if we reexecuted now, so suspend 3362 * until conditions improve (e.g. device comes online). 3363 */ 3364 zio_suspend(spa, zio); 3365 } else { 3366 /* 3367 * Reexecution is potentially a huge amount of work. 3368 * Hand it off to the otherwise-unused claim taskq. 3369 */ 3370#if defined(illumos) || !defined(_KERNEL) 3371 ASSERT(zio->io_tqent.tqent_next == NULL); 3372#else 3373 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 3374#endif 3375 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3376 ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3377 0, &zio->io_tqent); 3378 } 3379 return (ZIO_PIPELINE_STOP); 3380 } 3381 3382 ASSERT(zio->io_child_count == 0); 3383 ASSERT(zio->io_reexecute == 0); 3384 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3385 3386 /* 3387 * Report any checksum errors, since the I/O is complete. 3388 */ 3389 while (zio->io_cksum_report != NULL) { 3390 zio_cksum_report_t *zcr = zio->io_cksum_report; 3391 zio->io_cksum_report = zcr->zcr_next; 3392 zcr->zcr_next = NULL; 3393 zcr->zcr_finish(zcr, NULL); 3394 zfs_ereport_free_checksum(zcr); 3395 } 3396 3397 /* 3398 * It is the responsibility of the done callback to ensure that this 3399 * particular zio is no longer discoverable for adoption, and as 3400 * such, cannot acquire any new parents. 3401 */ 3402 if (zio->io_done) 3403 zio->io_done(zio); 3404 3405 mutex_enter(&zio->io_lock); 3406 zio->io_state[ZIO_WAIT_DONE] = 1; 3407 mutex_exit(&zio->io_lock); 3408 3409 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3410 zio_link_t *zl = zio->io_walk_link; 3411 pio_next = zio_walk_parents(zio); 3412 zio_remove_child(pio, zio, zl); 3413 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3414 } 3415 3416 if (zio->io_waiter != NULL) { 3417 mutex_enter(&zio->io_lock); 3418 zio->io_executor = NULL; 3419 cv_broadcast(&zio->io_cv); 3420 mutex_exit(&zio->io_lock); 3421 } else { 3422 zio_destroy(zio); 3423 } 3424 3425 return (ZIO_PIPELINE_STOP); 3426} 3427 3428/* 3429 * ========================================================================== 3430 * I/O pipeline definition 3431 * ========================================================================== 3432 */ 3433static zio_pipe_stage_t *zio_pipeline[] = { 3434 NULL, 3435 zio_read_bp_init, 3436 zio_free_bp_init, 3437 zio_issue_async, 3438 zio_write_bp_init, 3439 zio_checksum_generate, 3440 zio_nop_write, 3441 zio_ddt_read_start, 3442 zio_ddt_read_done, 3443 zio_ddt_write, 3444 zio_ddt_free, 3445 zio_gang_assemble, 3446 zio_gang_issue, 3447 zio_dva_allocate, 3448 zio_dva_free, 3449 zio_dva_claim, 3450 zio_ready, 3451 zio_vdev_io_start, 3452 zio_vdev_io_done, 3453 zio_vdev_io_assess, 3454 zio_checksum_verify, 3455 zio_done 3456}; 3457 3458/* dnp is the dnode for zb1->zb_object */ 3459boolean_t 3460zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1, 3461 const zbookmark_phys_t *zb2) 3462{ 3463 uint64_t zb1nextL0, zb2thisobj; 3464 3465 ASSERT(zb1->zb_objset == zb2->zb_objset); 3466 ASSERT(zb2->zb_level == 0); 3467 3468 /* The objset_phys_t isn't before anything. 
*/ 3469 if (dnp == NULL) 3470 return (B_FALSE); 3471 3472 zb1nextL0 = (zb1->zb_blkid + 1) << 3473 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 3474 3475 zb2thisobj = zb2->zb_object ? zb2->zb_object : 3476 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 3477 3478 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3479 uint64_t nextobj = zb1nextL0 * 3480 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 3481 return (nextobj <= zb2thisobj); 3482 } 3483 3484 if (zb1->zb_object < zb2thisobj) 3485 return (B_TRUE); 3486 if (zb1->zb_object > zb2thisobj) 3487 return (B_FALSE); 3488 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 3489 return (B_FALSE); 3490 return (zb1nextL0 <= zb2->zb_blkid); 3491} 3492
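
/*
 * Worked example for zbookmark_is_before(), using illustrative values:
 * with dn_indblkshift = 17 (128K indirect blocks) and SPA_BLKPTRSHIFT = 7
 * (128-byte blkptrs), each level-1 block covers 2^(17 - 7) = 1024 level-0
 * blocks.  For zb1 = <object, level 1, blkid 5>, zb1nextL0 = (5 + 1) << 10
 * = 6144, so within the same object zb1 is "before" a level-0 bookmark zb2
 * only when zb2->zb_blkid >= 6144.
 */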