zio.c revision 288543
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 
 */

#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>

/* Tunables and sysctls under vfs.zfs.zio. */
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
/*
 * When nonzero, zio_init() builds the per-size kmem caches and the buffer
 * allocators below use them; when zero they fall back to kmem_alloc() and
 * kmem_free().  Defaults to 1 on amd64 only.
 */
#if defined(__amd64__)
static int zio_use_uma = 1;
#else
static int zio_use_uma = 0;
#endif
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
    "Use uma(9) for ZIO allocations");
/*
 * When nonzero, zio_buf_alloc() adds KM_NODEBUG to the flags it passes to
 * kmem_alloc() (only the non-UMA path consults this variable).
 */
static int zio_exclude_metadata = 0;
TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
    "Exclude metadata buffers from dumps as well");

/*
 * Named kstat counters for TRIM requests.  zio_init() publishes this
 * structure as the data of the "zfs" / "zio_trim" kstat.
 */
zio_trim_stats_t zio_trim_stats = {
	{ "bytes",		KSTAT_DATA_UINT64,
	  "Number of bytes successfully TRIMmed" },
	{ "success",		KSTAT_DATA_UINT64,
	  "Number of successful TRIM requests" },
	{ "unsupported",	KSTAT_DATA_UINT64,
	  "Number of TRIM requests that failed because TRIM is not supported" },
	{ "failed",		KSTAT_DATA_UINT64,
	  "Number of TRIM requests that failed for reasons other than not supported" },
};

/* Handle for the kstat above; created in zio_init(), deleted in zio_fini(). */
static kstat_t *zio_trim_ksp;

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 *
==========================================================================
 */
/* Cache of zio_t objects (created in zio_init() with sizeof (zio_t)). */
kmem_cache_t *zio_cache;
/* Cache of zio_link_t parent/child link objects. */
kmem_cache_t *zio_link_cache;
/*
 * Per-size buffer caches.  The allocators index both arrays with
 * (size - 1) >> SPA_MINBLOCKSHIFT.
 */
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance.  Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
107 */ 108int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ 109TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free); 110SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, 111 &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); 112int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ 113TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress); 114SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, 115 &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); 116int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ 117TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite); 118SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, 119 &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); 120 121/* 122 * An allocating zio is one that either currently has the DVA allocate 123 * stage set or will have it later in its lifetime. 124 */ 125#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 126 127boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; 128 129#ifdef ZFS_DEBUG 130int zio_buf_debug_limit = 16384; 131#else 132int zio_buf_debug_limit = 0; 133#endif 134 135void 136zio_init(void) 137{ 138 size_t c; 139 zio_cache = kmem_cache_create("zio_cache", 140 sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 141 zio_link_cache = kmem_cache_create("zio_link_cache", 142 sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 143 if (!zio_use_uma) 144 goto out; 145 146 /* 147 * For small buffers, we want a cache for each multiple of 148 * SPA_MINBLOCKSIZE. For larger buffers, we want a cache 149 * for each quarter-power of 2. 
150 */ 151 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 152 size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 153 size_t p2 = size; 154 size_t align = 0; 155 size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; 156 157 while (!ISP2(p2)) 158 p2 &= p2 - 1; 159 160#ifdef illumos 161#ifndef _KERNEL 162 /* 163 * If we are using watchpoints, put each buffer on its own page, 164 * to eliminate the performance overhead of trapping to the 165 * kernel when modifying a non-watched buffer that shares the 166 * page with a watched buffer. 167 */ 168 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) 169 continue; 170#endif 171#endif /* illumos */ 172 if (size <= 4 * SPA_MINBLOCKSIZE) { 173 align = SPA_MINBLOCKSIZE; 174 } else if (IS_P2ALIGNED(size, p2 >> 2)) { 175 align = MIN(p2 >> 2, PAGESIZE); 176 } 177 178 if (align != 0) { 179 char name[36]; 180 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 181 zio_buf_cache[c] = kmem_cache_create(name, size, 182 align, NULL, NULL, NULL, NULL, NULL, cflags); 183 184 /* 185 * Since zio_data bufs do not appear in crash dumps, we 186 * pass KMC_NOTOUCH so that no allocator metadata is 187 * stored with the buffers. 
188 */ 189 (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 190 zio_data_buf_cache[c] = kmem_cache_create(name, size, 191 align, NULL, NULL, NULL, NULL, NULL, 192 cflags | KMC_NOTOUCH | KMC_NODEBUG); 193 } 194 } 195 196 while (--c != 0) { 197 ASSERT(zio_buf_cache[c] != NULL); 198 if (zio_buf_cache[c - 1] == NULL) 199 zio_buf_cache[c - 1] = zio_buf_cache[c]; 200 201 ASSERT(zio_data_buf_cache[c] != NULL); 202 if (zio_data_buf_cache[c - 1] == NULL) 203 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 204 } 205out: 206 207 zio_inject_init(); 208 209 zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", 210 KSTAT_TYPE_NAMED, 211 sizeof(zio_trim_stats) / sizeof(kstat_named_t), 212 KSTAT_FLAG_VIRTUAL); 213 214 if (zio_trim_ksp != NULL) { 215 zio_trim_ksp->ks_data = &zio_trim_stats; 216 kstat_install(zio_trim_ksp); 217 } 218} 219 220void 221zio_fini(void) 222{ 223 size_t c; 224 kmem_cache_t *last_cache = NULL; 225 kmem_cache_t *last_data_cache = NULL; 226 227 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 228 if (zio_buf_cache[c] != last_cache) { 229 last_cache = zio_buf_cache[c]; 230 kmem_cache_destroy(zio_buf_cache[c]); 231 } 232 zio_buf_cache[c] = NULL; 233 234 if (zio_data_buf_cache[c] != last_data_cache) { 235 last_data_cache = zio_data_buf_cache[c]; 236 kmem_cache_destroy(zio_data_buf_cache[c]); 237 } 238 zio_data_buf_cache[c] = NULL; 239 } 240 241 kmem_cache_destroy(zio_link_cache); 242 kmem_cache_destroy(zio_cache); 243 244 zio_inject_fini(); 245 246 if (zio_trim_ksp != NULL) { 247 kstat_delete(zio_trim_ksp); 248 zio_trim_ksp = NULL; 249 } 250} 251 252/* 253 * ========================================================================== 254 * Allocate and free I/O buffers 255 * ========================================================================== 256 */ 257 258/* 259 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 260 * crashdump if the kernel panics, so use it judiciously. 
Obviously, it's 261 * useful to inspect ZFS metadata, but if possible, we should avoid keeping 262 * excess / transient data in-core during a crashdump. 263 */ 264void * 265zio_buf_alloc(size_t size) 266{ 267 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 268 int flags = zio_exclude_metadata ? KM_NODEBUG : 0; 269 270 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 271 272 if (zio_use_uma) 273 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 274 else 275 return (kmem_alloc(size, KM_SLEEP|flags)); 276} 277 278/* 279 * Use zio_data_buf_alloc to allocate data. The data will not appear in a 280 * crashdump if the kernel panics. This exists so that we will limit the amount 281 * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 282 * of kernel heap dumped to disk when the kernel panics) 283 */ 284void * 285zio_data_buf_alloc(size_t size) 286{ 287 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 288 289 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 290 291 if (zio_use_uma) 292 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 293 else 294 return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); 295} 296 297void 298zio_buf_free(void *buf, size_t size) 299{ 300 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 301 302 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 303 304 if (zio_use_uma) 305 kmem_cache_free(zio_buf_cache[c], buf); 306 else 307 kmem_free(buf, size); 308} 309 310void 311zio_data_buf_free(void *buf, size_t size) 312{ 313 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 314 315 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 316 317 if (zio_use_uma) 318 kmem_cache_free(zio_data_buf_cache[c], buf); 319 else 320 kmem_free(buf, size); 321} 322 323/* 324 * ========================================================================== 325 * Push and pop I/O transform buffers 326 * ========================================================================== 327 */ 328static void 329zio_push_transform(zio_t *zio, void 
*data, uint64_t size, uint64_t bufsize, 330 zio_transform_func_t *transform) 331{ 332 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 333 334 zt->zt_orig_data = zio->io_data; 335 zt->zt_orig_size = zio->io_size; 336 zt->zt_bufsize = bufsize; 337 zt->zt_transform = transform; 338 339 zt->zt_next = zio->io_transform_stack; 340 zio->io_transform_stack = zt; 341 342 zio->io_data = data; 343 zio->io_size = size; 344} 345 346static void 347zio_pop_transforms(zio_t *zio) 348{ 349 zio_transform_t *zt; 350 351 while ((zt = zio->io_transform_stack) != NULL) { 352 if (zt->zt_transform != NULL) 353 zt->zt_transform(zio, 354 zt->zt_orig_data, zt->zt_orig_size); 355 356 if (zt->zt_bufsize != 0) 357 zio_buf_free(zio->io_data, zt->zt_bufsize); 358 359 zio->io_data = zt->zt_orig_data; 360 zio->io_size = zt->zt_orig_size; 361 zio->io_transform_stack = zt->zt_next; 362 363 kmem_free(zt, sizeof (zio_transform_t)); 364 } 365} 366 367/* 368 * ========================================================================== 369 * I/O transform callbacks for subblocks and decompression 370 * ========================================================================== 371 */ 372static void 373zio_subblock(zio_t *zio, void *data, uint64_t size) 374{ 375 ASSERT(zio->io_size > size); 376 377 if (zio->io_type == ZIO_TYPE_READ) 378 bcopy(zio->io_data, data, size); 379} 380 381static void 382zio_decompress(zio_t *zio, void *data, uint64_t size) 383{ 384 if (zio->io_error == 0 && 385 zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 386 zio->io_data, data, zio->io_size, size) != 0) 387 zio->io_error = SET_ERROR(EIO); 388} 389 390/* 391 * ========================================================================== 392 * I/O parent/child relationships and pipeline interlocks 393 * ========================================================================== 394 */ 395/* 396 * NOTE - Callers to zio_walk_parents() and zio_walk_children must 397 * continue calling these functions until they 
return NULL. 398 * Otherwise, the next caller will pick up the list walk in 399 * some indeterminate state. (Otherwise every caller would 400 * have to pass in a cookie to keep the state represented by 401 * io_walk_link, which gets annoying.) 402 */ 403zio_t * 404zio_walk_parents(zio_t *cio) 405{ 406 zio_link_t *zl = cio->io_walk_link; 407 list_t *pl = &cio->io_parent_list; 408 409 zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); 410 cio->io_walk_link = zl; 411 412 if (zl == NULL) 413 return (NULL); 414 415 ASSERT(zl->zl_child == cio); 416 return (zl->zl_parent); 417} 418 419zio_t * 420zio_walk_children(zio_t *pio) 421{ 422 zio_link_t *zl = pio->io_walk_link; 423 list_t *cl = &pio->io_child_list; 424 425 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); 426 pio->io_walk_link = zl; 427 428 if (zl == NULL) 429 return (NULL); 430 431 ASSERT(zl->zl_parent == pio); 432 return (zl->zl_child); 433} 434 435zio_t * 436zio_unique_parent(zio_t *cio) 437{ 438 zio_t *pio = zio_walk_parents(cio); 439 440 VERIFY(zio_walk_parents(cio) == NULL); 441 return (pio); 442} 443 444void 445zio_add_child(zio_t *pio, zio_t *cio) 446{ 447 zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 448 449 /* 450 * Logical I/Os can have logical, gang, or vdev children. 451 * Gang I/Os can have gang or vdev children. 452 * Vdev I/Os can only have vdev children. 453 * The following ASSERT captures all of these constraints. 
454 */ 455 ASSERT(cio->io_child_type <= pio->io_child_type); 456 457 zl->zl_parent = pio; 458 zl->zl_child = cio; 459 460 mutex_enter(&cio->io_lock); 461 mutex_enter(&pio->io_lock); 462 463 ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 464 465 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 466 pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 467 468 list_insert_head(&pio->io_child_list, zl); 469 list_insert_head(&cio->io_parent_list, zl); 470 471 pio->io_child_count++; 472 cio->io_parent_count++; 473 474 mutex_exit(&pio->io_lock); 475 mutex_exit(&cio->io_lock); 476} 477 478static void 479zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) 480{ 481 ASSERT(zl->zl_parent == pio); 482 ASSERT(zl->zl_child == cio); 483 484 mutex_enter(&cio->io_lock); 485 mutex_enter(&pio->io_lock); 486 487 list_remove(&pio->io_child_list, zl); 488 list_remove(&cio->io_parent_list, zl); 489 490 pio->io_child_count--; 491 cio->io_parent_count--; 492 493 mutex_exit(&pio->io_lock); 494 mutex_exit(&cio->io_lock); 495 496 kmem_cache_free(zio_link_cache, zl); 497} 498 499static boolean_t 500zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) 501{ 502 uint64_t *countp = &zio->io_children[child][wait]; 503 boolean_t waiting = B_FALSE; 504 505 mutex_enter(&zio->io_lock); 506 ASSERT(zio->io_stall == NULL); 507 if (*countp != 0) { 508 zio->io_stage >>= 1; 509 zio->io_stall = countp; 510 waiting = B_TRUE; 511 } 512 mutex_exit(&zio->io_lock); 513 514 return (waiting); 515} 516 517static void 518zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 519{ 520 uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 521 int *errorp = &pio->io_child_error[zio->io_child_type]; 522 523 mutex_enter(&pio->io_lock); 524 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 525 *errorp = zio_worst_error(*errorp, zio->io_error); 526 pio->io_reexecute |= zio->io_reexecute; 527 ASSERT3U(*countp, >, 0); 528 529 (*countp)--; 530 531 if (*countp == 0 
&& pio->io_stall == countp) { 532 pio->io_stall = NULL; 533 mutex_exit(&pio->io_lock); 534 zio_execute(pio); 535 } else { 536 mutex_exit(&pio->io_lock); 537 } 538} 539 540static void 541zio_inherit_child_errors(zio_t *zio, enum zio_child c) 542{ 543 if (zio->io_child_error[c] != 0 && zio->io_error == 0) 544 zio->io_error = zio->io_child_error[c]; 545} 546 547/* 548 * ========================================================================== 549 * Create the various types of I/O (read, write, free, etc) 550 * ========================================================================== 551 */ 552static zio_t * 553zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 554 void *data, uint64_t size, zio_done_func_t *done, void *private, 555 zio_type_t type, zio_priority_t priority, enum zio_flag flags, 556 vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, 557 enum zio_stage stage, enum zio_stage pipeline) 558{ 559 zio_t *zio; 560 561 ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE); 562 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 563 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 564 565 ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 566 ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 567 ASSERT(vd || stage == ZIO_STAGE_OPEN); 568 569 zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 570 bzero(zio, sizeof (zio_t)); 571 572 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 573 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 574 575 list_create(&zio->io_parent_list, sizeof (zio_link_t), 576 offsetof(zio_link_t, zl_parent_node)); 577 list_create(&zio->io_child_list, sizeof (zio_link_t), 578 offsetof(zio_link_t, zl_child_node)); 579 580 if (vd != NULL) 581 zio->io_child_type = ZIO_CHILD_VDEV; 582 else if (flags & ZIO_FLAG_GANG_CHILD) 583 zio->io_child_type = ZIO_CHILD_GANG; 584 else if (flags & ZIO_FLAG_DDT_CHILD) 585 zio->io_child_type = ZIO_CHILD_DDT; 586 else 587 zio->io_child_type = ZIO_CHILD_LOGICAL; 588 589 if (bp != 
NULL) { 590 zio->io_bp = (blkptr_t *)bp; 591 zio->io_bp_copy = *bp; 592 zio->io_bp_orig = *bp; 593 if (type != ZIO_TYPE_WRITE || 594 zio->io_child_type == ZIO_CHILD_DDT) 595 zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 596 if (zio->io_child_type == ZIO_CHILD_LOGICAL) 597 zio->io_logical = zio; 598 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 599 pipeline |= ZIO_GANG_STAGES; 600 } 601 602 zio->io_spa = spa; 603 zio->io_txg = txg; 604 zio->io_done = done; 605 zio->io_private = private; 606 zio->io_type = type; 607 zio->io_priority = priority; 608 zio->io_vd = vd; 609 zio->io_offset = offset; 610 zio->io_orig_data = zio->io_data = data; 611 zio->io_orig_size = zio->io_size = size; 612 zio->io_orig_flags = zio->io_flags = flags; 613 zio->io_orig_stage = zio->io_stage = stage; 614 zio->io_orig_pipeline = zio->io_pipeline = pipeline; 615 616 zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 617 zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 618 619 if (zb != NULL) 620 zio->io_bookmark = *zb; 621 622 if (pio != NULL) { 623 if (zio->io_logical == NULL) 624 zio->io_logical = pio->io_logical; 625 if (zio->io_child_type == ZIO_CHILD_GANG) 626 zio->io_gang_leader = pio->io_gang_leader; 627 zio_add_child(pio, zio); 628 } 629 630 return (zio); 631} 632 633static void 634zio_destroy(zio_t *zio) 635{ 636 list_destroy(&zio->io_parent_list); 637 list_destroy(&zio->io_child_list); 638 mutex_destroy(&zio->io_lock); 639 cv_destroy(&zio->io_cv); 640 kmem_cache_free(zio_cache, zio); 641} 642 643zio_t * 644zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, 645 void *private, enum zio_flag flags) 646{ 647 zio_t *zio; 648 649 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 650 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 651 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 652 653 return (zio); 654} 655 656zio_t * 657zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) 658{ 659 return 
(zio_null(NULL, spa, NULL, done, private, flags)); 660} 661 662void 663zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) 664{ 665 if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { 666 zfs_panic_recover("blkptr at %p has invalid TYPE %llu", 667 bp, (longlong_t)BP_GET_TYPE(bp)); 668 } 669 if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || 670 BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { 671 zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", 672 bp, (longlong_t)BP_GET_CHECKSUM(bp)); 673 } 674 if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || 675 BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { 676 zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", 677 bp, (longlong_t)BP_GET_COMPRESS(bp)); 678 } 679 if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { 680 zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", 681 bp, (longlong_t)BP_GET_LSIZE(bp)); 682 } 683 if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { 684 zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", 685 bp, (longlong_t)BP_GET_PSIZE(bp)); 686 } 687 688 if (BP_IS_EMBEDDED(bp)) { 689 if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { 690 zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", 691 bp, (longlong_t)BPE_GET_ETYPE(bp)); 692 } 693 } 694 695 /* 696 * Pool-specific checks. 697 * 698 * Note: it would be nice to verify that the blk_birth and 699 * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() 700 * allows the birth time of log blocks (and dmu_sync()-ed blocks 701 * that are in the log) to be arbitrarily large. 
702 */ 703 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 704 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); 705 if (vdevid >= spa->spa_root_vdev->vdev_children) { 706 zfs_panic_recover("blkptr at %p DVA %u has invalid " 707 "VDEV %llu", 708 bp, i, (longlong_t)vdevid); 709 continue; 710 } 711 vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; 712 if (vd == NULL) { 713 zfs_panic_recover("blkptr at %p DVA %u has invalid " 714 "VDEV %llu", 715 bp, i, (longlong_t)vdevid); 716 continue; 717 } 718 if (vd->vdev_ops == &vdev_hole_ops) { 719 zfs_panic_recover("blkptr at %p DVA %u has hole " 720 "VDEV %llu", 721 bp, i, (longlong_t)vdevid); 722 continue; 723 } 724 if (vd->vdev_ops == &vdev_missing_ops) { 725 /* 726 * "missing" vdevs are valid during import, but we 727 * don't have their detailed info (e.g. asize), so 728 * we can't perform any more checks on them. 729 */ 730 continue; 731 } 732 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 733 uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); 734 if (BP_IS_GANG(bp)) 735 asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 736 if (offset + asize > vd->vdev_asize) { 737 zfs_panic_recover("blkptr at %p DVA %u has invalid " 738 "OFFSET %llu", 739 bp, i, (longlong_t)offset); 740 } 741 } 742} 743 744zio_t * 745zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 746 void *data, uint64_t size, zio_done_func_t *done, void *private, 747 zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) 748{ 749 zio_t *zio; 750 751 zfs_blkptr_verify(spa, bp); 752 753 zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 754 data, size, done, private, 755 ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 756 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
757 ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 758 759 return (zio); 760} 761 762zio_t * 763zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 764 void *data, uint64_t size, const zio_prop_t *zp, 765 zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, 766 void *private, 767 zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) 768{ 769 zio_t *zio; 770 771 ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 772 zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 773 zp->zp_compress >= ZIO_COMPRESS_OFF && 774 zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 775 DMU_OT_IS_VALID(zp->zp_type) && 776 zp->zp_level < 32 && 777 zp->zp_copies > 0 && 778 zp->zp_copies <= spa_max_replication(spa)); 779 780 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 781 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 782 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 783 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 784 785 zio->io_ready = ready; 786 zio->io_physdone = physdone; 787 zio->io_prop = *zp; 788 789 /* 790 * Data can be NULL if we are going to call zio_write_override() to 791 * provide the already-allocated BP. But we may need the data to 792 * verify a dedup hit (if requested). In this case, don't try to 793 * dedup (just take the already-allocated BP verbatim). 
794 */ 795 if (data == NULL && zio->io_prop.zp_dedup_verify) { 796 zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; 797 } 798 799 return (zio); 800} 801 802zio_t * 803zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, 804 uint64_t size, zio_done_func_t *done, void *private, 805 zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) 806{ 807 zio_t *zio; 808 809 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 810 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 811 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 812 813 return (zio); 814} 815 816void 817zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) 818{ 819 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 820 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 821 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 822 ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 823 824 /* 825 * We must reset the io_prop to match the values that existed 826 * when the bp was first written by dmu_sync() keeping in mind 827 * that nopwrite and dedup are mutually exclusive. 828 */ 829 zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; 830 zio->io_prop.zp_nopwrite = nopwrite; 831 zio->io_prop.zp_copies = copies; 832 zio->io_bp_override = bp; 833} 834 835void 836zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) 837{ 838 839 /* 840 * The check for EMBEDDED is a performance optimization. We 841 * process the free here (by ignoring it) rather than 842 * putting it on the list and then processing it in zio_free_sync(). 843 */ 844 if (BP_IS_EMBEDDED(bp)) 845 return; 846 metaslab_check_free(spa, bp); 847 848 /* 849 * Frees that are for the currently-syncing txg, are not going to be 850 * deferred, and which will not need to do a read (i.e. not GANG or 851 * DEDUP), can be processed immediately. Otherwise, put them on the 852 * in-memory list for later processing. 
853 */ 854 if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || 855 txg != spa->spa_syncing_txg || 856 spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { 857 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); 858 } else { 859 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 860 BP_GET_PSIZE(bp), 0))); 861 } 862} 863 864zio_t * 865zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 866 uint64_t size, enum zio_flag flags) 867{ 868 zio_t *zio; 869 enum zio_stage stage = ZIO_FREE_PIPELINE; 870 871 ASSERT(!BP_IS_HOLE(bp)); 872 ASSERT(spa_syncing_txg(spa) == txg); 873 ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); 874 875 if (BP_IS_EMBEDDED(bp)) 876 return (zio_null(pio, spa, NULL, NULL, NULL, 0)); 877 878 metaslab_check_free(spa, bp); 879 arc_freed(spa, bp); 880 881 if (zfs_trim_enabled) 882 stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | 883 ZIO_STAGE_VDEV_IO_ASSESS; 884 /* 885 * GANG and DEDUP blocks can induce a read (for the gang block header, 886 * or the DDT), so issue them asynchronously so that this thread is 887 * not tied up. 888 */ 889 else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) 890 stage |= ZIO_STAGE_ISSUE_ASYNC; 891 892 flags |= ZIO_FLAG_DONT_QUEUE; 893 894 zio = zio_create(pio, spa, txg, bp, NULL, size, 895 NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, 896 NULL, 0, NULL, ZIO_STAGE_OPEN, stage); 897 898 return (zio); 899} 900 901zio_t * 902zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 903 zio_done_func_t *done, void *private, enum zio_flag flags) 904{ 905 zio_t *zio; 906 907 dprintf_bp(bp, "claiming in txg %llu", txg); 908 909 if (BP_IS_EMBEDDED(bp)) 910 return (zio_null(pio, spa, NULL, NULL, NULL, 0)); 911 912 /* 913 * A claim is an allocation of a specific block. Claims are needed 914 * to support immediate writes in the intent log. The issue is that 915 * immediate writes contain committed data, but in a txg that was 916 * *not* committed. 
Upon opening the pool after an unclean shutdown, 917 * the intent log claims all blocks that contain immediate write data 918 * so that the SPA knows they're in use. 919 * 920 * All claims *must* be resolved in the first txg -- before the SPA 921 * starts allocating blocks -- so that nothing is allocated twice. 922 * If txg == 0 we just verify that the block is claimable. 923 */ 924 ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 925 ASSERT(txg == spa_first_txg(spa) || txg == 0); 926 ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ 927 928 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 929 done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 930 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 931 932 return (zio); 933} 934 935zio_t * 936zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, 937 uint64_t size, zio_done_func_t *done, void *private, 938 zio_priority_t priority, enum zio_flag flags) 939{ 940 zio_t *zio; 941 int c; 942 943 if (vd->vdev_children == 0) { 944 zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private, 945 ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL, 946 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 947 948 zio->io_cmd = cmd; 949 } else { 950 zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 951 952 for (c = 0; c < vd->vdev_children; c++) 953 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 954 offset, size, done, private, priority, flags)); 955 } 956 957 return (zio); 958} 959 960zio_t * 961zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 962 void *data, int checksum, zio_done_func_t *done, void *private, 963 zio_priority_t priority, enum zio_flag flags, boolean_t labels) 964{ 965 zio_t *zio; 966 967 ASSERT(vd->vdev_children == 0); 968 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 969 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 970 ASSERT3U(offset + size, <=, vd->vdev_psize); 971 972 zio = zio_create(pio, 
vd->vdev_spa, 0, NULL, data, size, done, private, 973 ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, 974 NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 975 976 zio->io_prop.zp_checksum = checksum; 977 978 return (zio); 979} 980 981zio_t * 982zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 983 void *data, int checksum, zio_done_func_t *done, void *private, 984 zio_priority_t priority, enum zio_flag flags, boolean_t labels) 985{ 986 zio_t *zio; 987 988 ASSERT(vd->vdev_children == 0); 989 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 990 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 991 ASSERT3U(offset + size, <=, vd->vdev_psize); 992 993 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 994 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, 995 NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 996 997 zio->io_prop.zp_checksum = checksum; 998 999 if (zio_checksum_table[checksum].ci_eck) { 1000 /* 1001 * zec checksums are necessarily destructive -- they modify 1002 * the end of the write buffer to hold the verifier/checksum. 1003 * Therefore, we must make a local copy in case the data is 1004 * being written to multiple places in parallel. 1005 */ 1006 void *wbuf = zio_buf_alloc(size); 1007 bcopy(data, wbuf, size); 1008 zio_push_transform(zio, wbuf, size, size, NULL); 1009 } 1010 1011 return (zio); 1012} 1013 1014/* 1015 * Create a child I/O to do some work for us. 1016 */ 1017zio_t * 1018zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 1019 void *data, uint64_t size, int type, zio_priority_t priority, 1020 enum zio_flag flags, zio_done_func_t *done, void *private) 1021{ 1022 enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 1023 zio_t *zio; 1024 1025 ASSERT(vd->vdev_parent == 1026 (pio->io_vd ? 
pio->io_vd : pio->io_spa->spa_root_vdev)); 1027 1028 if (type == ZIO_TYPE_READ && bp != NULL) { 1029 /* 1030 * If we have the bp, then the child should perform the 1031 * checksum and the parent need not. This pushes error 1032 * detection as close to the leaves as possible and 1033 * eliminates redundant checksums in the interior nodes. 1034 */ 1035 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 1036 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 1037 } 1038 1039 /* Not all IO types require vdev io done stage e.g. free */ 1040 if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE)) 1041 pipeline &= ~ZIO_STAGE_VDEV_IO_DONE; 1042 1043 if (vd->vdev_children == 0) 1044 offset += VDEV_LABEL_START_SIZE; 1045 1046 flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; 1047 1048 /* 1049 * If we've decided to do a repair, the write is not speculative -- 1050 * even if the original read was. 1051 */ 1052 if (flags & ZIO_FLAG_IO_REPAIR) 1053 flags &= ~ZIO_FLAG_SPECULATIVE; 1054 1055 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, 1056 done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 1057 ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 1058 1059 zio->io_physdone = pio->io_physdone; 1060 if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) 1061 zio->io_logical->io_phys_children++; 1062 1063 return (zio); 1064} 1065 1066zio_t * 1067zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, 1068 int type, zio_priority_t priority, enum zio_flag flags, 1069 zio_done_func_t *done, void *private) 1070{ 1071 zio_t *zio; 1072 1073 ASSERT(vd->vdev_ops->vdev_op_leaf); 1074 1075 zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 1076 data, size, done, private, type, priority, 1077 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, 1078 vd, offset, NULL, 1079 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 1080 1081 return (zio); 1082} 1083 1084void 1085zio_flush(zio_t *zio, vdev_t *vd) 1086{ 1087 
zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, 1088 NULL, NULL, ZIO_PRIORITY_NOW, 1089 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 1090} 1091 1092zio_t * 1093zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) 1094{ 1095 1096 ASSERT(vd->vdev_ops->vdev_op_leaf); 1097 1098 return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL, 1099 ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE | 1100 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, 1101 vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE)); 1102} 1103 1104void 1105zio_shrink(zio_t *zio, uint64_t size) 1106{ 1107 ASSERT(zio->io_executor == NULL); 1108 ASSERT(zio->io_orig_size == zio->io_size); 1109 ASSERT(size <= zio->io_size); 1110 1111 /* 1112 * We don't shrink for raidz because of problems with the 1113 * reconstruction when reading back less than the block size. 1114 * Note, BP_IS_RAIDZ() assumes no compression. 1115 */ 1116 ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 1117 if (!BP_IS_RAIDZ(zio->io_bp)) 1118 zio->io_orig_size = zio->io_size = size; 1119} 1120 1121/* 1122 * ========================================================================== 1123 * Prepare to read and write logical blocks 1124 * ========================================================================== 1125 */ 1126 1127static int 1128zio_read_bp_init(zio_t *zio) 1129{ 1130 blkptr_t *bp = zio->io_bp; 1131 1132 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 1133 zio->io_child_type == ZIO_CHILD_LOGICAL && 1134 !(zio->io_flags & ZIO_FLAG_RAW)) { 1135 uint64_t psize = 1136 BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); 1137 void *cbuf = zio_buf_alloc(psize); 1138 1139 zio_push_transform(zio, cbuf, psize, psize, zio_decompress); 1140 } 1141 1142 if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { 1143 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1144 decode_embedded_bp_compressed(bp, zio->io_data); 1145 } else { 1146 ASSERT(!BP_IS_EMBEDDED(bp)); 1147 } 1148 1149 if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) 1150 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1151 1152 if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 1153 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1154 1155 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 1156 zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 1157 1158 return (ZIO_PIPELINE_CONTINUE); 1159} 1160 1161static int 1162zio_write_bp_init(zio_t *zio) 1163{ 1164 spa_t *spa = zio->io_spa; 1165 zio_prop_t *zp = &zio->io_prop; 1166 enum zio_compress compress = zp->zp_compress; 1167 blkptr_t *bp = zio->io_bp; 1168 uint64_t lsize = zio->io_size; 1169 uint64_t psize = lsize; 1170 int pass = 1; 1171 1172 /* 1173 * If our children haven't all reached the ready stage, 1174 * wait for them and then repeat this pipeline stage. 1175 */ 1176 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 1177 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) 1178 return (ZIO_PIPELINE_STOP); 1179 1180 if (!IO_IS_ALLOCATING(zio)) 1181 return (ZIO_PIPELINE_CONTINUE); 1182 1183 ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 1184 1185 if (zio->io_bp_override) { 1186 ASSERT(bp->blk_birth != zio->io_txg); 1187 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 1188 1189 *bp = *zio->io_bp_override; 1190 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1191 1192 if (BP_IS_EMBEDDED(bp)) 1193 return (ZIO_PIPELINE_CONTINUE); 1194 1195 /* 1196 * If we've been overridden and nopwrite is set then 1197 * set the flag accordingly to indicate that a nopwrite 1198 * has already occurred. 
1199 */ 1200 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { 1201 ASSERT(!zp->zp_dedup); 1202 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1203 return (ZIO_PIPELINE_CONTINUE); 1204 } 1205 1206 ASSERT(!zp->zp_nopwrite); 1207 1208 if (BP_IS_HOLE(bp) || !zp->zp_dedup) 1209 return (ZIO_PIPELINE_CONTINUE); 1210 1211 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || 1212 zp->zp_dedup_verify); 1213 1214 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 1215 BP_SET_DEDUP(bp, 1); 1216 zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; 1217 return (ZIO_PIPELINE_CONTINUE); 1218 } 1219 } 1220 1221 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { 1222 /* 1223 * We're rewriting an existing block, which means we're 1224 * working on behalf of spa_sync(). For spa_sync() to 1225 * converge, it must eventually be the case that we don't 1226 * have to allocate new blocks. But compression changes 1227 * the blocksize, which forces a reallocate, and makes 1228 * convergence take longer. Therefore, after the first 1229 * few passes, stop compressing to ensure convergence. 
1230 */ 1231 pass = spa_sync_pass(spa); 1232 1233 ASSERT(zio->io_txg == spa_syncing_txg(spa)); 1234 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1235 ASSERT(!BP_GET_DEDUP(bp)); 1236 1237 if (pass >= zfs_sync_pass_dont_compress) 1238 compress = ZIO_COMPRESS_OFF; 1239 1240 /* Make sure someone doesn't change their mind on overwrites */ 1241 ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), 1242 spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 1243 } 1244 1245 if (compress != ZIO_COMPRESS_OFF) { 1246 void *cbuf = zio_buf_alloc(lsize); 1247 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); 1248 if (psize == 0 || psize == lsize) { 1249 compress = ZIO_COMPRESS_OFF; 1250 zio_buf_free(cbuf, lsize); 1251 } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && 1252 zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && 1253 spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { 1254 encode_embedded_bp_compressed(bp, 1255 cbuf, compress, lsize, psize); 1256 BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); 1257 BP_SET_TYPE(bp, zio->io_prop.zp_type); 1258 BP_SET_LEVEL(bp, zio->io_prop.zp_level); 1259 zio_buf_free(cbuf, lsize); 1260 bp->blk_birth = zio->io_txg; 1261 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1262 ASSERT(spa_feature_is_active(spa, 1263 SPA_FEATURE_EMBEDDED_DATA)); 1264 return (ZIO_PIPELINE_CONTINUE); 1265 } else { 1266 /* 1267 * Round up compressed size up to the ashift 1268 * of the smallest-ashift device, and zero the tail. 1269 * This ensures that the compressed size of the BP 1270 * (and thus compressratio property) are correct, 1271 * in that we charge for the padding used to fill out 1272 * the last sector. 
1273 */ 1274 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 1275 size_t rounded = (size_t)P2ROUNDUP(psize, 1276 1ULL << spa->spa_min_ashift); 1277 if (rounded >= lsize) { 1278 compress = ZIO_COMPRESS_OFF; 1279 zio_buf_free(cbuf, lsize); 1280 psize = lsize; 1281 } else { 1282 bzero((char *)cbuf + psize, rounded - psize); 1283 psize = rounded; 1284 zio_push_transform(zio, cbuf, 1285 psize, lsize, NULL); 1286 } 1287 } 1288 } 1289 1290 /* 1291 * The final pass of spa_sync() must be all rewrites, but the first 1292 * few passes offer a trade-off: allocating blocks defers convergence, 1293 * but newly allocated blocks are sequential, so they can be written 1294 * to disk faster. Therefore, we allow the first few passes of 1295 * spa_sync() to allocate new blocks, but force rewrites after that. 1296 * There should only be a handful of blocks after pass 1 in any case. 1297 */ 1298 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && 1299 BP_GET_PSIZE(bp) == psize && 1300 pass >= zfs_sync_pass_rewrite) { 1301 ASSERT(psize != 0); 1302 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 1303 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; 1304 zio->io_flags |= ZIO_FLAG_IO_REWRITE; 1305 } else { 1306 BP_ZERO(bp); 1307 zio->io_pipeline = ZIO_WRITE_PIPELINE; 1308 } 1309 1310 if (psize == 0) { 1311 if (zio->io_bp_orig.blk_birth != 0 && 1312 spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { 1313 BP_SET_LSIZE(bp, lsize); 1314 BP_SET_TYPE(bp, zp->zp_type); 1315 BP_SET_LEVEL(bp, zp->zp_level); 1316 BP_SET_BIRTH(bp, zio->io_txg, 0); 1317 } 1318 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1319 } else { 1320 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 1321 BP_SET_LSIZE(bp, lsize); 1322 BP_SET_TYPE(bp, zp->zp_type); 1323 BP_SET_LEVEL(bp, zp->zp_level); 1324 BP_SET_PSIZE(bp, psize); 1325 BP_SET_COMPRESS(bp, compress); 1326 BP_SET_CHECKSUM(bp, zp->zp_checksum); 1327 BP_SET_DEDUP(bp, zp->zp_dedup); 1328 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1329 
if (zp->zp_dedup) { 1330 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1331 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1332 zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 1333 } 1334 if (zp->zp_nopwrite) { 1335 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1336 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1337 zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; 1338 } 1339 } 1340 1341 return (ZIO_PIPELINE_CONTINUE); 1342} 1343 1344static int 1345zio_free_bp_init(zio_t *zio) 1346{ 1347 blkptr_t *bp = zio->io_bp; 1348 1349 if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 1350 if (BP_GET_DEDUP(bp)) 1351 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 1352 } 1353 1354 return (ZIO_PIPELINE_CONTINUE); 1355} 1356 1357/* 1358 * ========================================================================== 1359 * Execute the I/O pipeline 1360 * ========================================================================== 1361 */ 1362 1363static void 1364zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) 1365{ 1366 spa_t *spa = zio->io_spa; 1367 zio_type_t t = zio->io_type; 1368 int flags = (cutinline ? TQ_FRONT : 0); 1369 1370 ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT); 1371 1372 /* 1373 * If we're a config writer or a probe, the normal issue and 1374 * interrupt threads may all be blocked waiting for the config lock. 1375 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 1376 */ 1377 if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 1378 t = ZIO_TYPE_NULL; 1379 1380 /* 1381 * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 1382 */ 1383 if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 1384 t = ZIO_TYPE_NULL; 1385 1386 /* 1387 * If this is a high priority I/O, then use the high priority taskq if 1388 * available. 
1389 */ 1390 if (zio->io_priority == ZIO_PRIORITY_NOW && 1391 spa->spa_zio_taskq[t][q + 1].stqs_count != 0) 1392 q++; 1393 1394 ASSERT3U(q, <, ZIO_TASKQ_TYPES); 1395 1396 /* 1397 * NB: We are assuming that the zio can only be dispatched 1398 * to a single taskq at a time. It would be a grievous error 1399 * to dispatch the zio to another taskq at the same time. 1400 */ 1401#if defined(illumos) || !defined(_KERNEL) 1402 ASSERT(zio->io_tqent.tqent_next == NULL); 1403#else 1404 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 1405#endif 1406 spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, 1407 flags, &zio->io_tqent); 1408} 1409 1410static boolean_t 1411zio_taskq_member(zio_t *zio, zio_taskq_type_t q) 1412{ 1413 kthread_t *executor = zio->io_executor; 1414 spa_t *spa = zio->io_spa; 1415 1416 for (zio_type_t t = 0; t < ZIO_TYPES; t++) { 1417 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1418 uint_t i; 1419 for (i = 0; i < tqs->stqs_count; i++) { 1420 if (taskq_member(tqs->stqs_taskq[i], executor)) 1421 return (B_TRUE); 1422 } 1423 } 1424 1425 return (B_FALSE); 1426} 1427 1428static int 1429zio_issue_async(zio_t *zio) 1430{ 1431 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1432 1433 return (ZIO_PIPELINE_STOP); 1434} 1435 1436void 1437zio_interrupt(zio_t *zio) 1438{ 1439 zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); 1440} 1441 1442/* 1443 * Execute the I/O pipeline until one of the following occurs: 1444 * 1445 * (1) the I/O completes 1446 * (2) the pipeline stalls waiting for dependent child I/Os 1447 * (3) the I/O issues, so we're waiting for an I/O completion interrupt 1448 * (4) the I/O is delegated by vdev-level caching or aggregation 1449 * (5) the I/O is deferred due to vdev-level queueing 1450 * (6) the I/O is handed off to another thread. 1451 * 1452 * In all cases, the pipeline stops whenever there's no CPU work; it never 1453 * burns a thread in cv_wait(). 
1454 * 1455 * There's no locking on io_stage because there's no legitimate way 1456 * for multiple threads to be attempting to process the same I/O. 1457 */ 1458static zio_pipe_stage_t *zio_pipeline[]; 1459 1460void 1461zio_execute(zio_t *zio) 1462{ 1463 zio->io_executor = curthread; 1464 1465 while (zio->io_stage < ZIO_STAGE_DONE) { 1466 enum zio_stage pipeline = zio->io_pipeline; 1467 enum zio_stage stage = zio->io_stage; 1468 int rv; 1469 1470 ASSERT(!MUTEX_HELD(&zio->io_lock)); 1471 ASSERT(ISP2(stage)); 1472 ASSERT(zio->io_stall == NULL); 1473 1474 do { 1475 stage <<= 1; 1476 } while ((stage & pipeline) == 0); 1477 1478 ASSERT(stage <= ZIO_STAGE_DONE); 1479 1480 /* 1481 * If we are in interrupt context and this pipeline stage 1482 * will grab a config lock that is held across I/O, 1483 * or may wait for an I/O that needs an interrupt thread 1484 * to complete, issue async to avoid deadlock. 1485 * 1486 * For VDEV_IO_START, we cut in line so that the io will 1487 * be sent to disk promptly. 1488 */ 1489 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 1490 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 1491 boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 
1492 zio_requeue_io_start_cut_in_line : B_FALSE; 1493 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 1494 return; 1495 } 1496 1497 zio->io_stage = stage; 1498 rv = zio_pipeline[highbit64(stage) - 1](zio); 1499 1500 if (rv == ZIO_PIPELINE_STOP) 1501 return; 1502 1503 ASSERT(rv == ZIO_PIPELINE_CONTINUE); 1504 } 1505} 1506 1507/* 1508 * ========================================================================== 1509 * Initiate I/O, either sync or async 1510 * ========================================================================== 1511 */ 1512int 1513zio_wait(zio_t *zio) 1514{ 1515 int error; 1516 1517 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 1518 ASSERT(zio->io_executor == NULL); 1519 1520 zio->io_waiter = curthread; 1521 1522 zio_execute(zio); 1523 1524 mutex_enter(&zio->io_lock); 1525 while (zio->io_executor != NULL) 1526 cv_wait(&zio->io_cv, &zio->io_lock); 1527 mutex_exit(&zio->io_lock); 1528 1529 error = zio->io_error; 1530 zio_destroy(zio); 1531 1532 return (error); 1533} 1534 1535void 1536zio_nowait(zio_t *zio) 1537{ 1538 ASSERT(zio->io_executor == NULL); 1539 1540 if (zio->io_child_type == ZIO_CHILD_LOGICAL && 1541 zio_unique_parent(zio) == NULL) { 1542 /* 1543 * This is a logical async I/O with no parent to wait for it. 1544 * We add it to the spa_async_root_zio "Godfather" I/O which 1545 * will ensure they complete prior to unloading the pool. 
1546 */ 1547 spa_t *spa = zio->io_spa; 1548 1549 zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); 1550 } 1551 1552 zio_execute(zio); 1553} 1554 1555/* 1556 * ========================================================================== 1557 * Reexecute or suspend/resume failed I/O 1558 * ========================================================================== 1559 */ 1560 1561static void 1562zio_reexecute(zio_t *pio) 1563{ 1564 zio_t *cio, *cio_next; 1565 1566 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 1567 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 1568 ASSERT(pio->io_gang_leader == NULL); 1569 ASSERT(pio->io_gang_tree == NULL); 1570 1571 pio->io_flags = pio->io_orig_flags; 1572 pio->io_stage = pio->io_orig_stage; 1573 pio->io_pipeline = pio->io_orig_pipeline; 1574 pio->io_reexecute = 0; 1575 pio->io_flags |= ZIO_FLAG_REEXECUTED; 1576 pio->io_error = 0; 1577 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1578 pio->io_state[w] = 0; 1579 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 1580 pio->io_child_error[c] = 0; 1581 1582 if (IO_IS_ALLOCATING(pio)) 1583 BP_ZERO(pio->io_bp); 1584 1585 /* 1586 * As we reexecute pio's children, new children could be created. 1587 * New children go to the head of pio's io_child_list, however, 1588 * so we will (correctly) not reexecute them. The key is that 1589 * the remainder of pio's io_child_list, from 'cio_next' onward, 1590 * cannot be affected by any side effects of reexecuting 'cio'. 1591 */ 1592 for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { 1593 cio_next = zio_walk_children(pio); 1594 mutex_enter(&pio->io_lock); 1595 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1596 pio->io_children[cio->io_child_type][w]++; 1597 mutex_exit(&pio->io_lock); 1598 zio_reexecute(cio); 1599 } 1600 1601 /* 1602 * Now that all children have been reexecuted, execute the parent. 1603 * We don't reexecute "The Godfather" I/O here as it's the 1604 * responsibility of the caller to wait on him. 
1605 */ 1606 if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) 1607 zio_execute(pio); 1608} 1609 1610void 1611zio_suspend(spa_t *spa, zio_t *zio) 1612{ 1613 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 1614 fm_panic("Pool '%s' has encountered an uncorrectable I/O " 1615 "failure and the failure mode property for this pool " 1616 "is set to panic.", spa_name(spa)); 1617 1618 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 1619 1620 mutex_enter(&spa->spa_suspend_lock); 1621 1622 if (spa->spa_suspend_zio_root == NULL) 1623 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 1624 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 1625 ZIO_FLAG_GODFATHER); 1626 1627 spa->spa_suspended = B_TRUE; 1628 1629 if (zio != NULL) { 1630 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 1631 ASSERT(zio != spa->spa_suspend_zio_root); 1632 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1633 ASSERT(zio_unique_parent(zio) == NULL); 1634 ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1635 zio_add_child(spa->spa_suspend_zio_root, zio); 1636 } 1637 1638 mutex_exit(&spa->spa_suspend_lock); 1639} 1640 1641int 1642zio_resume(spa_t *spa) 1643{ 1644 zio_t *pio; 1645 1646 /* 1647 * Reexecute all previously suspended i/o. 1648 */ 1649 mutex_enter(&spa->spa_suspend_lock); 1650 spa->spa_suspended = B_FALSE; 1651 cv_broadcast(&spa->spa_suspend_cv); 1652 pio = spa->spa_suspend_zio_root; 1653 spa->spa_suspend_zio_root = NULL; 1654 mutex_exit(&spa->spa_suspend_lock); 1655 1656 if (pio == NULL) 1657 return (0); 1658 1659 zio_reexecute(pio); 1660 return (zio_wait(pio)); 1661} 1662 1663void 1664zio_resume_wait(spa_t *spa) 1665{ 1666 mutex_enter(&spa->spa_suspend_lock); 1667 while (spa_suspended(spa)) 1668 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 1669 mutex_exit(&spa->spa_suspend_lock); 1670} 1671 1672/* 1673 * ========================================================================== 1674 * Gang blocks. 
1675 * 1676 * A gang block is a collection of small blocks that looks to the DMU 1677 * like one large block. When zio_dva_allocate() cannot find a block 1678 * of the requested size, due to either severe fragmentation or the pool 1679 * being nearly full, it calls zio_write_gang_block() to construct the 1680 * block from smaller fragments. 1681 * 1682 * A gang block consists of a gang header (zio_gbh_phys_t) and up to 1683 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like 1684 * an indirect block: it's an array of block pointers. It consumes 1685 * only one sector and hence is allocatable regardless of fragmentation. 1686 * The gang header's bps point to its gang members, which hold the data. 1687 * 1688 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 1689 * as the verifier to ensure uniqueness of the SHA256 checksum. 1690 * Critically, the gang block bp's blk_cksum is the checksum of the data, 1691 * not the gang header. This ensures that data block signatures (needed for 1692 * deduplication) are independent of how the block is physically stored. 1693 * 1694 * Gang blocks can be nested: a gang member may itself be a gang block. 1695 * Thus every gang block is a tree in which root and all interior nodes are 1696 * gang headers, and the leaves are normal blocks that contain user data. 1697 * The root of the gang tree is called the gang leader. 1698 * 1699 * To perform any operation (read, rewrite, free, claim) on a gang block, 1700 * zio_gang_assemble() first assembles the gang tree (minus data leaves) 1701 * in the io_gang_tree field of the original logical i/o by recursively 1702 * reading the gang leader and all gang headers below it. This yields 1703 * an in-core tree containing the contents of every gang header and the 1704 * bps for every constituent of the gang block. 1705 * 1706 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 1707 * and invokes a callback on each bp. 
To free a gang block, zio_gang_issue() 1708 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 1709 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 1710 * zio_read_gang() is a wrapper around zio_read() that omits reading gang 1711 * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 1712 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 1713 * of the gang header plus zio_checksum_compute() of the data to update the 1714 * gang header's blk_cksum as described above. 1715 * 1716 * The two-phase assemble/issue model solves the problem of partial failure -- 1717 * what if you'd freed part of a gang block but then couldn't read the 1718 * gang header for another part? Assembling the entire gang tree first 1719 * ensures that all the necessary gang header I/O has succeeded before 1720 * starting the actual work of free, claim, or write. Once the gang tree 1721 * is assembled, free and claim are in-memory operations that cannot fail. 1722 * 1723 * In the event that a gang write fails, zio_dva_unallocate() walks the 1724 * gang tree to immediately free (i.e. insert back into the space map) 1725 * everything we've allocated. This ensures that we don't get ENOSPC 1726 * errors during repeated suspend/resume cycles due to a flaky device. 1727 * 1728 * Gang rewrites only happen during sync-to-convergence. If we can't assemble 1729 * the gang tree, we won't modify the block, so we can safely defer the free 1730 * (knowing that the block is still intact). If we *can* assemble the gang 1731 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 1732 * each constituent bp and we can allocate a new block on the next sync pass. 1733 * 1734 * In all cases, the gang tree allows complete recovery from partial failure. 
1735 * ========================================================================== 1736 */ 1737 1738static zio_t * 1739zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1740{ 1741 if (gn != NULL) 1742 return (pio); 1743 1744 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 1745 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1746 &pio->io_bookmark)); 1747} 1748 1749zio_t * 1750zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1751{ 1752 zio_t *zio; 1753 1754 if (gn != NULL) { 1755 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1756 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 1757 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1758 /* 1759 * As we rewrite each gang header, the pipeline will compute 1760 * a new gang block header checksum for it; but no one will 1761 * compute a new data checksum, so we do that here. The one 1762 * exception is the gang leader: the pipeline already computed 1763 * its data checksum because that stage precedes gang assembly. 1764 * (Presently, nothing actually uses interior data checksums; 1765 * this is just good hygiene.) 1766 */ 1767 if (gn != pio->io_gang_leader->io_gang_tree) { 1768 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 1769 data, BP_GET_PSIZE(bp)); 1770 } 1771 /* 1772 * If we are here to damage data for testing purposes, 1773 * leave the GBH alone so that we can detect the damage. 1774 */ 1775 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 1776 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 1777 } else { 1778 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1779 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 1780 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1781 } 1782 1783 return (zio); 1784} 1785 1786/* ARGSUSED */ 1787zio_t * 1788zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1789{ 1790 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 1791 BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), 1792 ZIO_GANG_CHILD_FLAGS(pio))); 1793} 1794 1795/* ARGSUSED */ 1796zio_t * 1797zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1798{ 1799 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 1800 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 1801} 1802 1803static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 1804 NULL, 1805 zio_read_gang, 1806 zio_rewrite_gang, 1807 zio_free_gang, 1808 zio_claim_gang, 1809 NULL 1810}; 1811 1812static void zio_gang_tree_assemble_done(zio_t *zio); 1813 1814static zio_gang_node_t * 1815zio_gang_node_alloc(zio_gang_node_t **gnpp) 1816{ 1817 zio_gang_node_t *gn; 1818 1819 ASSERT(*gnpp == NULL); 1820 1821 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 1822 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 1823 *gnpp = gn; 1824 1825 return (gn); 1826} 1827 1828static void 1829zio_gang_node_free(zio_gang_node_t **gnpp) 1830{ 1831 zio_gang_node_t *gn = *gnpp; 1832 1833 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1834 ASSERT(gn->gn_child[g] == NULL); 1835 1836 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 1837 kmem_free(gn, sizeof (*gn)); 1838 *gnpp = NULL; 1839} 1840 1841static void 1842zio_gang_tree_free(zio_gang_node_t **gnpp) 1843{ 1844 zio_gang_node_t *gn = *gnpp; 1845 1846 if (gn == NULL) 1847 return; 1848 1849 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1850 zio_gang_tree_free(&gn->gn_child[g]); 1851 1852 zio_gang_node_free(gnpp); 1853} 1854 1855static void 1856zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 1857{ 1858 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 1859 1860 ASSERT(gio->io_gang_leader == gio); 1861 ASSERT(BP_IS_GANG(bp)); 1862 1863 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 1864 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 1865 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 1866} 1867 1868static void 1869zio_gang_tree_assemble_done(zio_t *zio) 1870{ 1871 zio_t *gio = zio->io_gang_leader; 1872 
zio_gang_node_t *gn = zio->io_private; 1873 blkptr_t *bp = zio->io_bp; 1874 1875 ASSERT(gio == zio_unique_parent(zio)); 1876 ASSERT(zio->io_child_count == 0); 1877 1878 if (zio->io_error) 1879 return; 1880 1881 if (BP_SHOULD_BYTESWAP(bp)) 1882 byteswap_uint64_array(zio->io_data, zio->io_size); 1883 1884 ASSERT(zio->io_data == gn->gn_gbh); 1885 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1886 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1887 1888 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1889 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1890 if (!BP_IS_GANG(gbp)) 1891 continue; 1892 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 1893 } 1894} 1895 1896static void 1897zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1898{ 1899 zio_t *gio = pio->io_gang_leader; 1900 zio_t *zio; 1901 1902 ASSERT(BP_IS_GANG(bp) == !!gn); 1903 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 1904 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 1905 1906 /* 1907 * If you're a gang header, your data is in gn->gn_gbh. 1908 * If you're a gang member, your data is in 'data' and gn == NULL. 
1909 */ 1910 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1911 1912 if (gn != NULL) { 1913 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1914 1915 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1916 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1917 if (BP_IS_HOLE(gbp)) 1918 continue; 1919 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 1920 data = (char *)data + BP_GET_PSIZE(gbp); 1921 } 1922 } 1923 1924 if (gn == gio->io_gang_tree && gio->io_data != NULL) 1925 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 1926 1927 if (zio != pio) 1928 zio_nowait(zio); 1929} 1930 1931static int 1932zio_gang_assemble(zio_t *zio) 1933{ 1934 blkptr_t *bp = zio->io_bp; 1935 1936 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 1937 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1938 1939 zio->io_gang_leader = zio; 1940 1941 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1942 1943 return (ZIO_PIPELINE_CONTINUE); 1944} 1945 1946static int 1947zio_gang_issue(zio_t *zio) 1948{ 1949 blkptr_t *bp = zio->io_bp; 1950 1951 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 1952 return (ZIO_PIPELINE_STOP); 1953 1954 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 1955 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1956 1957 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 1958 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 1959 else 1960 zio_gang_tree_free(&zio->io_gang_tree); 1961 1962 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1963 1964 return (ZIO_PIPELINE_CONTINUE); 1965} 1966 1967static void 1968zio_write_gang_member_ready(zio_t *zio) 1969{ 1970 zio_t *pio = zio_unique_parent(zio); 1971 zio_t *gio = zio->io_gang_leader; 1972 dva_t *cdva = zio->io_bp->blk_dva; 1973 dva_t *pdva = pio->io_bp->blk_dva; 1974 uint64_t asize; 1975 1976 if (BP_IS_HOLE(zio->io_bp)) 1977 return; 1978 1979 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1980 1981 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 1982 ASSERT3U(zio->io_prop.zp_copies, ==, 
gio->io_prop.zp_copies); 1983 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 1984 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 1985 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 1986 1987 mutex_enter(&pio->io_lock); 1988 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 1989 ASSERT(DVA_GET_GANG(&pdva[d])); 1990 asize = DVA_GET_ASIZE(&pdva[d]); 1991 asize += DVA_GET_ASIZE(&cdva[d]); 1992 DVA_SET_ASIZE(&pdva[d], asize); 1993 } 1994 mutex_exit(&pio->io_lock); 1995} 1996 1997static int 1998zio_write_gang_block(zio_t *pio) 1999{ 2000 spa_t *spa = pio->io_spa; 2001 blkptr_t *bp = pio->io_bp; 2002 zio_t *gio = pio->io_gang_leader; 2003 zio_t *zio; 2004 zio_gang_node_t *gn, **gnpp; 2005 zio_gbh_phys_t *gbh; 2006 uint64_t txg = pio->io_txg; 2007 uint64_t resid = pio->io_size; 2008 uint64_t lsize; 2009 int copies = gio->io_prop.zp_copies; 2010 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 2011 zio_prop_t zp; 2012 int error; 2013 2014 error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 2015 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 2016 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 2017 if (error) { 2018 pio->io_error = error; 2019 return (ZIO_PIPELINE_CONTINUE); 2020 } 2021 2022 if (pio == gio) { 2023 gnpp = &gio->io_gang_tree; 2024 } else { 2025 gnpp = pio->io_private; 2026 ASSERT(pio->io_ready == zio_write_gang_member_ready); 2027 } 2028 2029 gn = zio_gang_node_alloc(gnpp); 2030 gbh = gn->gn_gbh; 2031 bzero(gbh, SPA_GANGBLOCKSIZE); 2032 2033 /* 2034 * Create the gang header. 2035 */ 2036 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 2037 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2038 2039 /* 2040 * Create and nowait the gang children. 
2041 */ 2042 for (int g = 0; resid != 0; resid -= lsize, g++) { 2043 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 2044 SPA_MINBLOCKSIZE); 2045 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 2046 2047 zp.zp_checksum = gio->io_prop.zp_checksum; 2048 zp.zp_compress = ZIO_COMPRESS_OFF; 2049 zp.zp_type = DMU_OT_NONE; 2050 zp.zp_level = 0; 2051 zp.zp_copies = gio->io_prop.zp_copies; 2052 zp.zp_dedup = B_FALSE; 2053 zp.zp_dedup_verify = B_FALSE; 2054 zp.zp_nopwrite = B_FALSE; 2055 2056 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 2057 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 2058 zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g], 2059 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 2060 &pio->io_bookmark)); 2061 } 2062 2063 /* 2064 * Set pio's pipeline to just wait for zio to finish. 2065 */ 2066 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2067 2068 zio_nowait(zio); 2069 2070 return (ZIO_PIPELINE_CONTINUE); 2071} 2072 2073/* 2074 * The zio_nop_write stage in the pipeline determines if allocating 2075 * a new bp is necessary. By leveraging a cryptographically secure checksum, 2076 * such as SHA256, we can compare the checksums of the new data and the old 2077 * to determine if allocating a new block is required. The nopwrite 2078 * feature can handle writes in either syncing or open context (i.e. zil 2079 * writes) and as a result is mutually exclusive with dedup. 2080 */ 2081static int 2082zio_nop_write(zio_t *zio) 2083{ 2084 blkptr_t *bp = zio->io_bp; 2085 blkptr_t *bp_orig = &zio->io_bp_orig; 2086 zio_prop_t *zp = &zio->io_prop; 2087 2088 ASSERT(BP_GET_LEVEL(bp) == 0); 2089 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 2090 ASSERT(zp->zp_nopwrite); 2091 ASSERT(!zp->zp_dedup); 2092 ASSERT(zio->io_bp_override == NULL); 2093 ASSERT(IO_IS_ALLOCATING(zio)); 2094 2095 /* 2096 * Check to see if the original bp and the new bp have matching 2097 * characteristics (i.e. same checksum, compression algorithms, etc). 
2098 * If they don't then just continue with the pipeline which will 2099 * allocate a new bp. 2100 */ 2101 if (BP_IS_HOLE(bp_orig) || 2102 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || 2103 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 2104 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 2105 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 2106 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 2107 return (ZIO_PIPELINE_CONTINUE); 2108 2109 /* 2110 * If the checksums match then reset the pipeline so that we 2111 * avoid allocating a new bp and issuing any I/O. 2112 */ 2113 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 2114 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); 2115 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 2116 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 2117 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 2118 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 2119 sizeof (uint64_t)) == 0); 2120 2121 *bp = *bp_orig; 2122 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2123 zio->io_flags |= ZIO_FLAG_NOPWRITE; 2124 } 2125 2126 return (ZIO_PIPELINE_CONTINUE); 2127} 2128 2129/* 2130 * ========================================================================== 2131 * Dedup 2132 * ========================================================================== 2133 */ 2134static void 2135zio_ddt_child_read_done(zio_t *zio) 2136{ 2137 blkptr_t *bp = zio->io_bp; 2138 ddt_entry_t *dde = zio->io_private; 2139 ddt_phys_t *ddp; 2140 zio_t *pio = zio_unique_parent(zio); 2141 2142 mutex_enter(&pio->io_lock); 2143 ddp = ddt_phys_select(dde, bp); 2144 if (zio->io_error == 0) 2145 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 2146 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 2147 dde->dde_repair_data = zio->io_data; 2148 else 2149 zio_buf_free(zio->io_data, zio->io_size); 2150 mutex_exit(&pio->io_lock); 2151} 2152 2153static int 2154zio_ddt_read_start(zio_t *zio) 2155{ 2156 blkptr_t *bp = zio->io_bp; 2157 2158 
ASSERT(BP_GET_DEDUP(bp)); 2159 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2160 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2161 2162 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2163 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2164 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 2165 ddt_phys_t *ddp = dde->dde_phys; 2166 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 2167 blkptr_t blk; 2168 2169 ASSERT(zio->io_vsd == NULL); 2170 zio->io_vsd = dde; 2171 2172 if (ddp_self == NULL) 2173 return (ZIO_PIPELINE_CONTINUE); 2174 2175 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2176 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 2177 continue; 2178 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2179 &blk); 2180 zio_nowait(zio_read(zio, zio->io_spa, &blk, 2181 zio_buf_alloc(zio->io_size), zio->io_size, 2182 zio_ddt_child_read_done, dde, zio->io_priority, 2183 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 2184 &zio->io_bookmark)); 2185 } 2186 return (ZIO_PIPELINE_CONTINUE); 2187 } 2188 2189 zio_nowait(zio_read(zio, zio->io_spa, bp, 2190 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 2191 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2192 2193 return (ZIO_PIPELINE_CONTINUE); 2194} 2195 2196static int 2197zio_ddt_read_done(zio_t *zio) 2198{ 2199 blkptr_t *bp = zio->io_bp; 2200 2201 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 2202 return (ZIO_PIPELINE_STOP); 2203 2204 ASSERT(BP_GET_DEDUP(bp)); 2205 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2206 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2207 2208 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2209 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2210 ddt_entry_t *dde = zio->io_vsd; 2211 if (ddt == NULL) { 2212 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2213 return (ZIO_PIPELINE_CONTINUE); 2214 } 2215 if (dde == NULL) { 2216 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2217 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2218 return (ZIO_PIPELINE_STOP); 2219 } 2220 if 
(dde->dde_repair_data != NULL) { 2221 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 2222 zio->io_child_error[ZIO_CHILD_DDT] = 0; 2223 } 2224 ddt_repair_done(ddt, dde); 2225 zio->io_vsd = NULL; 2226 } 2227 2228 ASSERT(zio->io_vsd == NULL); 2229 2230 return (ZIO_PIPELINE_CONTINUE); 2231} 2232 2233static boolean_t 2234zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2235{ 2236 spa_t *spa = zio->io_spa; 2237 2238 /* 2239 * Note: we compare the original data, not the transformed data, 2240 * because when zio->io_bp is an override bp, we will not have 2241 * pushed the I/O transforms. That's an important optimization 2242 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2243 */ 2244 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2245 zio_t *lio = dde->dde_lead_zio[p]; 2246 2247 if (lio != NULL) { 2248 return (lio->io_orig_size != zio->io_orig_size || 2249 bcmp(zio->io_orig_data, lio->io_orig_data, 2250 zio->io_orig_size) != 0); 2251 } 2252 } 2253 2254 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2255 ddt_phys_t *ddp = &dde->dde_phys[p]; 2256 2257 if (ddp->ddp_phys_birth != 0) { 2258 arc_buf_t *abuf = NULL; 2259 arc_flags_t aflags = ARC_FLAG_WAIT; 2260 blkptr_t blk = *zio->io_bp; 2261 int error; 2262 2263 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2264 2265 ddt_exit(ddt); 2266 2267 error = arc_read(NULL, spa, &blk, 2268 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2269 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2270 &aflags, &zio->io_bookmark); 2271 2272 if (error == 0) { 2273 if (arc_buf_size(abuf) != zio->io_orig_size || 2274 bcmp(abuf->b_data, zio->io_orig_data, 2275 zio->io_orig_size) != 0) 2276 error = SET_ERROR(EEXIST); 2277 VERIFY(arc_buf_remove_ref(abuf, &abuf)); 2278 } 2279 2280 ddt_enter(ddt); 2281 return (error != 0); 2282 } 2283 } 2284 2285 return (B_FALSE); 2286} 2287 2288static void 2289zio_ddt_child_write_ready(zio_t *zio) 2290{ 2291 int p = zio->io_prop.zp_copies; 2292 ddt_t *ddt = 
ddt_select(zio->io_spa, zio->io_bp); 2293 ddt_entry_t *dde = zio->io_private; 2294 ddt_phys_t *ddp = &dde->dde_phys[p]; 2295 zio_t *pio; 2296 2297 if (zio->io_error) 2298 return; 2299 2300 ddt_enter(ddt); 2301 2302 ASSERT(dde->dde_lead_zio[p] == zio); 2303 2304 ddt_phys_fill(ddp, zio->io_bp); 2305 2306 while ((pio = zio_walk_parents(zio)) != NULL) 2307 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2308 2309 ddt_exit(ddt); 2310} 2311 2312static void 2313zio_ddt_child_write_done(zio_t *zio) 2314{ 2315 int p = zio->io_prop.zp_copies; 2316 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2317 ddt_entry_t *dde = zio->io_private; 2318 ddt_phys_t *ddp = &dde->dde_phys[p]; 2319 2320 ddt_enter(ddt); 2321 2322 ASSERT(ddp->ddp_refcnt == 0); 2323 ASSERT(dde->dde_lead_zio[p] == zio); 2324 dde->dde_lead_zio[p] = NULL; 2325 2326 if (zio->io_error == 0) { 2327 while (zio_walk_parents(zio) != NULL) 2328 ddt_phys_addref(ddp); 2329 } else { 2330 ddt_phys_clear(ddp); 2331 } 2332 2333 ddt_exit(ddt); 2334} 2335 2336static void 2337zio_ddt_ditto_write_done(zio_t *zio) 2338{ 2339 int p = DDT_PHYS_DITTO; 2340 zio_prop_t *zp = &zio->io_prop; 2341 blkptr_t *bp = zio->io_bp; 2342 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2343 ddt_entry_t *dde = zio->io_private; 2344 ddt_phys_t *ddp = &dde->dde_phys[p]; 2345 ddt_key_t *ddk = &dde->dde_key; 2346 2347 ddt_enter(ddt); 2348 2349 ASSERT(ddp->ddp_refcnt == 0); 2350 ASSERT(dde->dde_lead_zio[p] == zio); 2351 dde->dde_lead_zio[p] = NULL; 2352 2353 if (zio->io_error == 0) { 2354 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2355 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2356 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2357 if (ddp->ddp_phys_birth != 0) 2358 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2359 ddt_phys_fill(ddp, bp); 2360 } 2361 2362 ddt_exit(ddt); 2363} 2364 2365static int 2366zio_ddt_write(zio_t *zio) 2367{ 2368 spa_t *spa = zio->io_spa; 2369 blkptr_t *bp = zio->io_bp; 2370 uint64_t txg = zio->io_txg; 2371 
zio_prop_t *zp = &zio->io_prop; 2372 int p = zp->zp_copies; 2373 int ditto_copies; 2374 zio_t *cio = NULL; 2375 zio_t *dio = NULL; 2376 ddt_t *ddt = ddt_select(spa, bp); 2377 ddt_entry_t *dde; 2378 ddt_phys_t *ddp; 2379 2380 ASSERT(BP_GET_DEDUP(bp)); 2381 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2382 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2383 2384 ddt_enter(ddt); 2385 dde = ddt_lookup(ddt, bp, B_TRUE); 2386 ddp = &dde->dde_phys[p]; 2387 2388 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2389 /* 2390 * If we're using a weak checksum, upgrade to a strong checksum 2391 * and try again. If we're already using a strong checksum, 2392 * we can't resolve it, so just convert to an ordinary write. 2393 * (And automatically e-mail a paper to Nature?) 2394 */ 2395 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 2396 zp->zp_checksum = spa_dedup_checksum(spa); 2397 zio_pop_transforms(zio); 2398 zio->io_stage = ZIO_STAGE_OPEN; 2399 BP_ZERO(bp); 2400 } else { 2401 zp->zp_dedup = B_FALSE; 2402 } 2403 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2404 ddt_exit(ddt); 2405 return (ZIO_PIPELINE_CONTINUE); 2406 } 2407 2408 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2409 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2410 2411 if (ditto_copies > ddt_ditto_copies_present(dde) && 2412 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2413 zio_prop_t czp = *zp; 2414 2415 czp.zp_copies = ditto_copies; 2416 2417 /* 2418 * If we arrived here with an override bp, we won't have run 2419 * the transform stack, so we won't have the data we need to 2420 * generate a child i/o. So, toss the override bp and restart. 2421 * This is safe, because using the override bp is just an 2422 * optimization; and it's rare, so the cost doesn't matter. 
2423 */ 2424 if (zio->io_bp_override) { 2425 zio_pop_transforms(zio); 2426 zio->io_stage = ZIO_STAGE_OPEN; 2427 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2428 zio->io_bp_override = NULL; 2429 BP_ZERO(bp); 2430 ddt_exit(ddt); 2431 return (ZIO_PIPELINE_CONTINUE); 2432 } 2433 2434 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2435 zio->io_orig_size, &czp, NULL, NULL, 2436 zio_ddt_ditto_write_done, dde, zio->io_priority, 2437 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2438 2439 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2440 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2441 } 2442 2443 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2444 if (ddp->ddp_phys_birth != 0) 2445 ddt_bp_fill(ddp, bp, txg); 2446 if (dde->dde_lead_zio[p] != NULL) 2447 zio_add_child(zio, dde->dde_lead_zio[p]); 2448 else 2449 ddt_phys_addref(ddp); 2450 } else if (zio->io_bp_override) { 2451 ASSERT(bp->blk_birth == txg); 2452 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2453 ddt_phys_fill(ddp, bp); 2454 ddt_phys_addref(ddp); 2455 } else { 2456 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2457 zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, 2458 zio_ddt_child_write_done, dde, zio->io_priority, 2459 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2460 2461 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2462 dde->dde_lead_zio[p] = cio; 2463 } 2464 2465 ddt_exit(ddt); 2466 2467 if (cio) 2468 zio_nowait(cio); 2469 if (dio) 2470 zio_nowait(dio); 2471 2472 return (ZIO_PIPELINE_CONTINUE); 2473} 2474 2475ddt_entry_t *freedde; /* for debugging */ 2476 2477static int 2478zio_ddt_free(zio_t *zio) 2479{ 2480 spa_t *spa = zio->io_spa; 2481 blkptr_t *bp = zio->io_bp; 2482 ddt_t *ddt = ddt_select(spa, bp); 2483 ddt_entry_t *dde; 2484 ddt_phys_t *ddp; 2485 2486 ASSERT(BP_GET_DEDUP(bp)); 2487 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2488 2489 ddt_enter(ddt); 2490 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2491 ddp = ddt_phys_select(dde, 
bp); 2492 ddt_phys_decref(ddp); 2493 ddt_exit(ddt); 2494 2495 return (ZIO_PIPELINE_CONTINUE); 2496} 2497 2498/* 2499 * ========================================================================== 2500 * Allocate and free blocks 2501 * ========================================================================== 2502 */ 2503static int 2504zio_dva_allocate(zio_t *zio) 2505{ 2506 spa_t *spa = zio->io_spa; 2507 metaslab_class_t *mc = spa_normal_class(spa); 2508 blkptr_t *bp = zio->io_bp; 2509 int error; 2510 int flags = 0; 2511 2512 if (zio->io_gang_leader == NULL) { 2513 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2514 zio->io_gang_leader = zio; 2515 } 2516 2517 ASSERT(BP_IS_HOLE(bp)); 2518 ASSERT0(BP_GET_NDVAS(bp)); 2519 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2520 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2521 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2522 2523 /* 2524 * The dump device does not support gang blocks so allocation on 2525 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2526 * the "fast" gang feature. 2527 */ 2528 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2529 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
2530 METASLAB_GANG_CHILD : 0; 2531 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2532 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2533 2534 if (error) { 2535 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2536 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2537 error); 2538 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2539 return (zio_write_gang_block(zio)); 2540 zio->io_error = error; 2541 } 2542 2543 return (ZIO_PIPELINE_CONTINUE); 2544} 2545 2546static int 2547zio_dva_free(zio_t *zio) 2548{ 2549 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2550 2551 return (ZIO_PIPELINE_CONTINUE); 2552} 2553 2554static int 2555zio_dva_claim(zio_t *zio) 2556{ 2557 int error; 2558 2559 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2560 if (error) 2561 zio->io_error = error; 2562 2563 return (ZIO_PIPELINE_CONTINUE); 2564} 2565 2566/* 2567 * Undo an allocation. This is used by zio_done() when an I/O fails 2568 * and we want to give back the block we just allocated. 2569 * This handles both normal blocks and gang blocks. 2570 */ 2571static void 2572zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2573{ 2574 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2575 ASSERT(zio->io_bp_override == NULL); 2576 2577 if (!BP_IS_HOLE(bp)) 2578 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2579 2580 if (gn != NULL) { 2581 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2582 zio_dva_unallocate(zio, gn->gn_child[g], 2583 &gn->gn_gbh->zg_blkptr[g]); 2584 } 2585 } 2586} 2587 2588/* 2589 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2590 */ 2591int 2592zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2593 uint64_t size, boolean_t use_slog) 2594{ 2595 int error = 1; 2596 2597 ASSERT(txg > spa_syncing_txg(spa)); 2598 2599 /* 2600 * ZIL blocks are always contiguous (i.e. 
not gang blocks) so we 2601 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2602 * when allocating them. 2603 */ 2604 if (use_slog) { 2605 error = metaslab_alloc(spa, spa_log_class(spa), size, 2606 new_bp, 1, txg, old_bp, 2607 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2608 } 2609 2610 if (error) { 2611 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2612 new_bp, 1, txg, old_bp, 2613 METASLAB_HINTBP_AVOID); 2614 } 2615 2616 if (error == 0) { 2617 BP_SET_LSIZE(new_bp, size); 2618 BP_SET_PSIZE(new_bp, size); 2619 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2620 BP_SET_CHECKSUM(new_bp, 2621 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2622 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2623 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2624 BP_SET_LEVEL(new_bp, 0); 2625 BP_SET_DEDUP(new_bp, 0); 2626 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2627 } 2628 2629 return (error); 2630} 2631 2632/* 2633 * Free an intent log block. 2634 */ 2635void 2636zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2637{ 2638 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2639 ASSERT(!BP_IS_GANG(bp)); 2640 2641 zio_free(spa, txg, bp); 2642} 2643 2644/* 2645 * ========================================================================== 2646 * Read, write and delete to physical devices 2647 * ========================================================================== 2648 */ 2649static int 2650zio_vdev_io_start(zio_t *zio) 2651{ 2652 vdev_t *vd = zio->io_vd; 2653 uint64_t align; 2654 spa_t *spa = zio->io_spa; 2655 int ret; 2656 2657 ASSERT(zio->io_error == 0); 2658 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2659 2660 if (vd == NULL) { 2661 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2662 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2663 2664 /* 2665 * The mirror_ops handle multiple DVAs in a single BP. 
2666 */ 2667 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2668 } 2669 2670 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && 2671 zio->io_priority == ZIO_PRIORITY_NOW) { 2672 trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); 2673 return (ZIO_PIPELINE_CONTINUE); 2674 } 2675 2676 /* 2677 * We keep track of time-sensitive I/Os so that the scan thread 2678 * can quickly react to certain workloads. In particular, we care 2679 * about non-scrubbing, top-level reads and writes with the following 2680 * characteristics: 2681 * - synchronous writes of user data to non-slog devices 2682 * - any reads of user data 2683 * When these conditions are met, adjust the timestamp of spa_last_io 2684 * which allows the scan thread to adjust its workload accordingly. 2685 */ 2686 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2687 vd == vd->vdev_top && !vd->vdev_islog && 2688 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2689 zio->io_txg != spa_syncing_txg(spa)) { 2690 uint64_t old = spa->spa_last_io; 2691 uint64_t new = ddi_get_lbolt64(); 2692 if (old != new) 2693 (void) atomic_cas_64(&spa->spa_last_io, old, new); 2694 } 2695 2696 align = 1ULL << vd->vdev_top->vdev_ashift; 2697 2698 if ((!(zio->io_flags & ZIO_FLAG_PHYSICAL) || 2699 (vd->vdev_top->vdev_physical_ashift > SPA_MINBLOCKSHIFT)) && 2700 P2PHASE(zio->io_size, align) != 0) { 2701 /* Transform logical writes to be a full physical block size. */ 2702 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2703 char *abuf = NULL; 2704 if (zio->io_type == ZIO_TYPE_READ || 2705 zio->io_type == ZIO_TYPE_WRITE) 2706 abuf = zio_buf_alloc(asize); 2707 ASSERT(vd == vd->vdev_top); 2708 if (zio->io_type == ZIO_TYPE_WRITE) { 2709 bcopy(zio->io_data, abuf, zio->io_size); 2710 bzero(abuf + zio->io_size, asize - zio->io_size); 2711 } 2712 zio_push_transform(zio, abuf, asize, abuf ? 
asize : 0, 2713 zio_subblock); 2714 } 2715 2716 /* 2717 * If this is not a physical io, make sure that it is properly aligned 2718 * before proceeding. 2719 */ 2720 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { 2721 ASSERT0(P2PHASE(zio->io_offset, align)); 2722 ASSERT0(P2PHASE(zio->io_size, align)); 2723 } else { 2724 /* 2725 * For physical writes, we allow 512b aligned writes and assume 2726 * the device will perform a read-modify-write as necessary. 2727 */ 2728 ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); 2729 ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); 2730 } 2731 2732 VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); 2733 2734 /* 2735 * If this is a repair I/O, and there's no self-healing involved -- 2736 * that is, we're just resilvering what we expect to resilver -- 2737 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2738 * This prevents spurious resilvering with nested replication. 2739 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2740 * A is out of date, we'll read from C+D, then use the data to 2741 * resilver A+B -- but we don't actually want to resilver B, just A. 2742 * The top-level mirror has no way to know this, so instead we just 2743 * discard unnecessary repairs as we work our way down the vdev tree. 2744 * The same logic applies to any form of nested replication: 2745 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 
2746 */ 2747 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2748 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2749 zio->io_txg != 0 && /* not a delegated i/o */ 2750 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2751 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2752 zio_vdev_io_bypass(zio); 2753 return (ZIO_PIPELINE_CONTINUE); 2754 } 2755 2756 if (vd->vdev_ops->vdev_op_leaf) { 2757 switch (zio->io_type) { 2758 case ZIO_TYPE_READ: 2759 if (vdev_cache_read(zio)) 2760 return (ZIO_PIPELINE_CONTINUE); 2761 /* FALLTHROUGH */ 2762 case ZIO_TYPE_WRITE: 2763 case ZIO_TYPE_FREE: 2764 if ((zio = vdev_queue_io(zio)) == NULL) 2765 return (ZIO_PIPELINE_STOP); 2766 2767 if (!vdev_accessible(vd, zio)) { 2768 zio->io_error = SET_ERROR(ENXIO); 2769 zio_interrupt(zio); 2770 return (ZIO_PIPELINE_STOP); 2771 } 2772 break; 2773 } 2774 /* 2775 * Note that we ignore repair writes for TRIM because they can 2776 * conflict with normal writes. This isn't an issue because, by 2777 * definition, we only repair blocks that aren't freed. 2778 */ 2779 if (zio->io_type == ZIO_TYPE_WRITE && 2780 !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2781 !trim_map_write_start(zio)) 2782 return (ZIO_PIPELINE_STOP); 2783 } 2784 2785 ret = vd->vdev_ops->vdev_op_io_start(zio); 2786 ASSERT(ret == ZIO_PIPELINE_STOP); 2787 2788 return (ret); 2789} 2790 2791static int 2792zio_vdev_io_done(zio_t *zio) 2793{ 2794 vdev_t *vd = zio->io_vd; 2795 vdev_ops_t *ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; 2796 boolean_t unexpected_error = B_FALSE; 2797 2798 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2799 return (ZIO_PIPELINE_STOP); 2800 2801 ASSERT(zio->io_type == ZIO_TYPE_READ || 2802 zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); 2803 2804 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2805 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || 2806 zio->io_type == ZIO_TYPE_FREE)) { 2807 2808 if (zio->io_type == ZIO_TYPE_WRITE && 2809 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 2810 trim_map_write_done(zio); 2811 2812 vdev_queue_io_done(zio); 2813 2814 if (zio->io_type == ZIO_TYPE_WRITE) 2815 vdev_cache_write(zio); 2816 2817 if (zio_injection_enabled && zio->io_error == 0) 2818 zio->io_error = zio_handle_device_injection(vd, 2819 zio, EIO); 2820 2821 if (zio_injection_enabled && zio->io_error == 0) 2822 zio->io_error = zio_handle_label_injection(zio, EIO); 2823 2824 if (zio->io_error) { 2825 if (zio->io_error == ENOTSUP && 2826 zio->io_type == ZIO_TYPE_FREE) { 2827 /* Not all devices support TRIM. */ 2828 } else if (!vdev_accessible(vd, zio)) { 2829 zio->io_error = SET_ERROR(ENXIO); 2830 } else { 2831 unexpected_error = B_TRUE; 2832 } 2833 } 2834 } 2835 2836 ops->vdev_op_io_done(zio); 2837 2838 if (unexpected_error) 2839 VERIFY(vdev_probe(vd, zio) == NULL); 2840 2841 return (ZIO_PIPELINE_CONTINUE); 2842} 2843 2844/* 2845 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2846 * disk, and use that to finish the checksum ereport later. 
2847 */ 2848static void 2849zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2850 const void *good_buf) 2851{ 2852 /* no processing needed */ 2853 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2854} 2855 2856/*ARGSUSED*/ 2857void 2858zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2859{ 2860 void *buf = zio_buf_alloc(zio->io_size); 2861 2862 bcopy(zio->io_data, buf, zio->io_size); 2863 2864 zcr->zcr_cbinfo = zio->io_size; 2865 zcr->zcr_cbdata = buf; 2866 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2867 zcr->zcr_free = zio_buf_free; 2868} 2869 2870static int 2871zio_vdev_io_assess(zio_t *zio) 2872{ 2873 vdev_t *vd = zio->io_vd; 2874 2875 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2876 return (ZIO_PIPELINE_STOP); 2877 2878 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2879 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2880 2881 if (zio->io_vsd != NULL) { 2882 zio->io_vsd_ops->vsd_free(zio); 2883 zio->io_vsd = NULL; 2884 } 2885 2886 if (zio_injection_enabled && zio->io_error == 0) 2887 zio->io_error = zio_handle_fault_injection(zio, EIO); 2888 2889 if (zio->io_type == ZIO_TYPE_FREE && 2890 zio->io_priority != ZIO_PRIORITY_NOW) { 2891 switch (zio->io_error) { 2892 case 0: 2893 ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 2894 ZIO_TRIM_STAT_BUMP(success); 2895 break; 2896 case EOPNOTSUPP: 2897 ZIO_TRIM_STAT_BUMP(unsupported); 2898 break; 2899 default: 2900 ZIO_TRIM_STAT_BUMP(failed); 2901 break; 2902 } 2903 } 2904 2905 /* 2906 * If the I/O failed, determine whether we should attempt to retry it. 2907 * 2908 * On retry, we cut in line in the issue queue, since we don't want 2909 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 
2910 */ 2911 if (zio->io_error && vd == NULL && 2912 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2913 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2914 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2915 zio->io_error = 0; 2916 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2917 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2918 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2919 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2920 zio_requeue_io_start_cut_in_line); 2921 return (ZIO_PIPELINE_STOP); 2922 } 2923 2924 /* 2925 * If we got an error on a leaf device, convert it to ENXIO 2926 * if the device is not accessible at all. 2927 */ 2928 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2929 !vdev_accessible(vd, zio)) 2930 zio->io_error = SET_ERROR(ENXIO); 2931 2932 /* 2933 * If we can't write to an interior vdev (mirror or RAID-Z), 2934 * set vdev_cant_write so that we stop trying to allocate from it. 2935 */ 2936 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2937 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 2938 vd->vdev_cant_write = B_TRUE; 2939 } 2940 2941 if (zio->io_error) 2942 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2943 2944 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2945 zio->io_physdone != NULL) { 2946 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 2947 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 2948 zio->io_physdone(zio->io_logical); 2949 } 2950 2951 return (ZIO_PIPELINE_CONTINUE); 2952} 2953 2954void 2955zio_vdev_io_reissue(zio_t *zio) 2956{ 2957 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2958 ASSERT(zio->io_error == 0); 2959 2960 zio->io_stage >>= 1; 2961} 2962 2963void 2964zio_vdev_io_redone(zio_t *zio) 2965{ 2966 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2967 2968 zio->io_stage >>= 1; 2969} 2970 2971void 2972zio_vdev_io_bypass(zio_t *zio) 2973{ 2974 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2975 ASSERT(zio->io_error == 0); 2976 2977 
zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2978 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2979} 2980 2981/* 2982 * ========================================================================== 2983 * Generate and verify checksums 2984 * ========================================================================== 2985 */ 2986static int 2987zio_checksum_generate(zio_t *zio) 2988{ 2989 blkptr_t *bp = zio->io_bp; 2990 enum zio_checksum checksum; 2991 2992 if (bp == NULL) { 2993 /* 2994 * This is zio_write_phys(). 2995 * We're either generating a label checksum, or none at all. 2996 */ 2997 checksum = zio->io_prop.zp_checksum; 2998 2999 if (checksum == ZIO_CHECKSUM_OFF) 3000 return (ZIO_PIPELINE_CONTINUE); 3001 3002 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 3003 } else { 3004 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 3005 ASSERT(!IO_IS_ALLOCATING(zio)); 3006 checksum = ZIO_CHECKSUM_GANG_HEADER; 3007 } else { 3008 checksum = BP_GET_CHECKSUM(bp); 3009 } 3010 } 3011 3012 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 3013 3014 return (ZIO_PIPELINE_CONTINUE); 3015} 3016 3017static int 3018zio_checksum_verify(zio_t *zio) 3019{ 3020 zio_bad_cksum_t info; 3021 blkptr_t *bp = zio->io_bp; 3022 int error; 3023 3024 ASSERT(zio->io_vd != NULL); 3025 3026 if (bp == NULL) { 3027 /* 3028 * This is zio_read_phys(). 3029 * We're either verifying a label checksum, or nothing at all. 
3030 */ 3031 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 3032 return (ZIO_PIPELINE_CONTINUE); 3033 3034 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 3035 } 3036 3037 if ((error = zio_checksum_error(zio, &info)) != 0) { 3038 zio->io_error = error; 3039 if (error == ECKSUM && 3040 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3041 zfs_ereport_start_checksum(zio->io_spa, 3042 zio->io_vd, zio, zio->io_offset, 3043 zio->io_size, NULL, &info); 3044 } 3045 } 3046 3047 return (ZIO_PIPELINE_CONTINUE); 3048} 3049 3050/* 3051 * Called by RAID-Z to ensure we don't compute the checksum twice. 3052 */ 3053void 3054zio_checksum_verified(zio_t *zio) 3055{ 3056 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 3057} 3058 3059/* 3060 * ========================================================================== 3061 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 3062 * An error of 0 indicates success. ENXIO indicates whole-device failure, 3063 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO 3064 * indicate errors that are specific to one I/O, and most likely permanent. 3065 * Any other error is presumed to be worse because we weren't expecting it. 3066 * ========================================================================== 3067 */ 3068int 3069zio_worst_error(int e1, int e2) 3070{ 3071 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 3072 int r1, r2; 3073 3074 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 3075 if (e1 == zio_error_rank[r1]) 3076 break; 3077 3078 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 3079 if (e2 == zio_error_rank[r2]) 3080 break; 3081 3082 return (r1 > r2 ? 
e1 : e2); 3083} 3084 3085/* 3086 * ========================================================================== 3087 * I/O completion 3088 * ========================================================================== 3089 */ 3090static int 3091zio_ready(zio_t *zio) 3092{ 3093 blkptr_t *bp = zio->io_bp; 3094 zio_t *pio, *pio_next; 3095 3096 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 3097 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 3098 return (ZIO_PIPELINE_STOP); 3099 3100 if (zio->io_ready) { 3101 ASSERT(IO_IS_ALLOCATING(zio)); 3102 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 3103 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 3104 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 3105 3106 zio->io_ready(zio); 3107 } 3108 3109 if (bp != NULL && bp != &zio->io_bp_copy) 3110 zio->io_bp_copy = *bp; 3111 3112 if (zio->io_error) 3113 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3114 3115 mutex_enter(&zio->io_lock); 3116 zio->io_state[ZIO_WAIT_READY] = 1; 3117 pio = zio_walk_parents(zio); 3118 mutex_exit(&zio->io_lock); 3119 3120 /* 3121 * As we notify zio's parents, new parents could be added. 3122 * New parents go to the head of zio's io_parent_list, however, 3123 * so we will (correctly) not notify them. The remainder of zio's 3124 * io_parent_list, from 'pio_next' onward, cannot change because 3125 * all parents must wait for us to be done before they can be done. 
3126 */ 3127 for (; pio != NULL; pio = pio_next) { 3128 pio_next = zio_walk_parents(zio); 3129 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 3130 } 3131 3132 if (zio->io_flags & ZIO_FLAG_NODATA) { 3133 if (BP_IS_GANG(bp)) { 3134 zio->io_flags &= ~ZIO_FLAG_NODATA; 3135 } else { 3136 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 3137 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 3138 } 3139 } 3140 3141 if (zio_injection_enabled && 3142 zio->io_spa->spa_syncing_txg == zio->io_txg) 3143 zio_handle_ignored_writes(zio); 3144 3145 return (ZIO_PIPELINE_CONTINUE); 3146} 3147 3148static int 3149zio_done(zio_t *zio) 3150{ 3151 spa_t *spa = zio->io_spa; 3152 zio_t *lio = zio->io_logical; 3153 blkptr_t *bp = zio->io_bp; 3154 vdev_t *vd = zio->io_vd; 3155 uint64_t psize = zio->io_size; 3156 zio_t *pio, *pio_next; 3157 3158 /* 3159 * If our children haven't all completed, 3160 * wait for them and then repeat this pipeline stage. 3161 */ 3162 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 3163 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 3164 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 3165 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 3166 return (ZIO_PIPELINE_STOP); 3167 3168 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 3169 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3170 ASSERT(zio->io_children[c][w] == 0); 3171 3172 if (bp != NULL && !BP_IS_EMBEDDED(bp)) { 3173 ASSERT(bp->blk_pad[0] == 0); 3174 ASSERT(bp->blk_pad[1] == 0); 3175 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 3176 (bp == zio_unique_parent(zio)->io_bp)); 3177 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 3178 zio->io_bp_override == NULL && 3179 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 3180 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 3181 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 3182 ASSERT(BP_COUNT_GANG(bp) == 0 || 3183 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 3184 } 3185 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 3186 
VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 3187 } 3188 3189 /* 3190 * If there were child vdev/gang/ddt errors, they apply to us now. 3191 */ 3192 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 3193 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 3194 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 3195 3196 /* 3197 * If the I/O on the transformed data was successful, generate any 3198 * checksum reports now while we still have the transformed data. 3199 */ 3200 if (zio->io_error == 0) { 3201 while (zio->io_cksum_report != NULL) { 3202 zio_cksum_report_t *zcr = zio->io_cksum_report; 3203 uint64_t align = zcr->zcr_align; 3204 uint64_t asize = P2ROUNDUP(psize, align); 3205 char *abuf = zio->io_data; 3206 3207 if (asize != psize) { 3208 abuf = zio_buf_alloc(asize); 3209 bcopy(zio->io_data, abuf, psize); 3210 bzero(abuf + psize, asize - psize); 3211 } 3212 3213 zio->io_cksum_report = zcr->zcr_next; 3214 zcr->zcr_next = NULL; 3215 zcr->zcr_finish(zcr, abuf); 3216 zfs_ereport_free_checksum(zcr); 3217 3218 if (asize != psize) 3219 zio_buf_free(abuf, asize); 3220 } 3221 } 3222 3223 zio_pop_transforms(zio); /* note: may set zio->io_error */ 3224 3225 vdev_stat_update(zio, psize); 3226 3227 if (zio->io_error) { 3228 /* 3229 * If this I/O is attached to a particular vdev, 3230 * generate an error message describing the I/O failure 3231 * at the block level. We ignore these errors if the 3232 * device is currently unavailable. 3233 */ 3234 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3235 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3236 3237 if ((zio->io_error == EIO || !(zio->io_flags & 3238 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3239 zio == lio) { 3240 /* 3241 * For logical I/O requests, tell the SPA to log the 3242 * error and generate a logical data ereport. 
3243 */ 3244 spa_log_error(spa, zio); 3245 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3246 0, 0); 3247 } 3248 } 3249 3250 if (zio->io_error && zio == lio) { 3251 /* 3252 * Determine whether zio should be reexecuted. This will 3253 * propagate all the way to the root via zio_notify_parent(). 3254 */ 3255 ASSERT(vd == NULL && bp != NULL); 3256 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3257 3258 if (IO_IS_ALLOCATING(zio) && 3259 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3260 if (zio->io_error != ENOSPC) 3261 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3262 else 3263 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3264 } 3265 3266 if ((zio->io_type == ZIO_TYPE_READ || 3267 zio->io_type == ZIO_TYPE_FREE) && 3268 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3269 zio->io_error == ENXIO && 3270 spa_load_state(spa) == SPA_LOAD_NONE && 3271 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3272 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3273 3274 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3275 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3276 3277 /* 3278 * Here is a possibly good place to attempt to do 3279 * either combinatorial reconstruction or error correction 3280 * based on checksums. It also might be a good place 3281 * to send out preliminary ereports before we suspend 3282 * processing. 3283 */ 3284 } 3285 3286 /* 3287 * If there were logical child errors, they apply to us now. 3288 * We defer this until now to avoid conflating logical child 3289 * errors with errors that happened to the zio itself when 3290 * updating vdev stats and reporting FMA events above. 
3291 */ 3292 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3293 3294 if ((zio->io_error || zio->io_reexecute) && 3295 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3296 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3297 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3298 3299 zio_gang_tree_free(&zio->io_gang_tree); 3300 3301 /* 3302 * Godfather I/Os should never suspend. 3303 */ 3304 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3305 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3306 zio->io_reexecute = 0; 3307 3308 if (zio->io_reexecute) { 3309 /* 3310 * This is a logical I/O that wants to reexecute. 3311 * 3312 * Reexecute is top-down. When an i/o fails, if it's not 3313 * the root, it simply notifies its parent and sticks around. 3314 * The parent, seeing that it still has children in zio_done(), 3315 * does the same. This percolates all the way up to the root. 3316 * The root i/o will reexecute or suspend the entire tree. 3317 * 3318 * This approach ensures that zio_reexecute() honors 3319 * all the original i/o dependency relationships, e.g. 3320 * parents not executing until children are ready. 3321 */ 3322 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3323 3324 zio->io_gang_leader = NULL; 3325 3326 mutex_enter(&zio->io_lock); 3327 zio->io_state[ZIO_WAIT_DONE] = 1; 3328 mutex_exit(&zio->io_lock); 3329 3330 /* 3331 * "The Godfather" I/O monitors its children but is 3332 * not a true parent to them. It will track them through 3333 * the pipeline but severs its ties whenever they get into 3334 * trouble (e.g. suspended). This allows "The Godfather" 3335 * I/O to return status without blocking. 
3336 */ 3337 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3338 zio_link_t *zl = zio->io_walk_link; 3339 pio_next = zio_walk_parents(zio); 3340 3341 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3342 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3343 zio_remove_child(pio, zio, zl); 3344 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3345 } 3346 } 3347 3348 if ((pio = zio_unique_parent(zio)) != NULL) { 3349 /* 3350 * We're not a root i/o, so there's nothing to do 3351 * but notify our parent. Don't propagate errors 3352 * upward since we haven't permanently failed yet. 3353 */ 3354 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3355 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3356 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3357 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3358 /* 3359 * We'd fail again if we reexecuted now, so suspend 3360 * until conditions improve (e.g. device comes online). 3361 */ 3362 zio_suspend(spa, zio); 3363 } else { 3364 /* 3365 * Reexecution is potentially a huge amount of work. 3366 * Hand it off to the otherwise-unused claim taskq. 3367 */ 3368#if defined(illumos) || !defined(_KERNEL) 3369 ASSERT(zio->io_tqent.tqent_next == NULL); 3370#else 3371 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 3372#endif 3373 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3374 ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3375 0, &zio->io_tqent); 3376 } 3377 return (ZIO_PIPELINE_STOP); 3378 } 3379 3380 ASSERT(zio->io_child_count == 0); 3381 ASSERT(zio->io_reexecute == 0); 3382 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3383 3384 /* 3385 * Report any checksum errors, since the I/O is complete. 
3386 */ 3387 while (zio->io_cksum_report != NULL) { 3388 zio_cksum_report_t *zcr = zio->io_cksum_report; 3389 zio->io_cksum_report = zcr->zcr_next; 3390 zcr->zcr_next = NULL; 3391 zcr->zcr_finish(zcr, NULL); 3392 zfs_ereport_free_checksum(zcr); 3393 } 3394 3395 /* 3396 * It is the responsibility of the done callback to ensure that this 3397 * particular zio is no longer discoverable for adoption, and as 3398 * such, cannot acquire any new parents. 3399 */ 3400 if (zio->io_done) 3401 zio->io_done(zio); 3402 3403 mutex_enter(&zio->io_lock); 3404 zio->io_state[ZIO_WAIT_DONE] = 1; 3405 mutex_exit(&zio->io_lock); 3406 3407 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3408 zio_link_t *zl = zio->io_walk_link; 3409 pio_next = zio_walk_parents(zio); 3410 zio_remove_child(pio, zio, zl); 3411 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3412 } 3413 3414 if (zio->io_waiter != NULL) { 3415 mutex_enter(&zio->io_lock); 3416 zio->io_executor = NULL; 3417 cv_broadcast(&zio->io_cv); 3418 mutex_exit(&zio->io_lock); 3419 } else { 3420 zio_destroy(zio); 3421 } 3422 3423 return (ZIO_PIPELINE_STOP); 3424} 3425 3426/* 3427 * ========================================================================== 3428 * I/O pipeline definition 3429 * ========================================================================== 3430 */ 3431static zio_pipe_stage_t *zio_pipeline[] = { 3432 NULL, 3433 zio_read_bp_init, 3434 zio_free_bp_init, 3435 zio_issue_async, 3436 zio_write_bp_init, 3437 zio_checksum_generate, 3438 zio_nop_write, 3439 zio_ddt_read_start, 3440 zio_ddt_read_done, 3441 zio_ddt_write, 3442 zio_ddt_free, 3443 zio_gang_assemble, 3444 zio_gang_issue, 3445 zio_dva_allocate, 3446 zio_dva_free, 3447 zio_dva_claim, 3448 zio_ready, 3449 zio_vdev_io_start, 3450 zio_vdev_io_done, 3451 zio_vdev_io_assess, 3452 zio_checksum_verify, 3453 zio_done 3454}; 3455 3456/* dnp is the dnode for zb1->zb_object */ 3457boolean_t 3458zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t 
*zb1, 3459 const zbookmark_phys_t *zb2) 3460{ 3461 uint64_t zb1nextL0, zb2thisobj; 3462 3463 ASSERT(zb1->zb_objset == zb2->zb_objset); 3464 ASSERT(zb2->zb_level == 0); 3465 3466 /* The objset_phys_t isn't before anything. */ 3467 if (dnp == NULL) 3468 return (B_FALSE); 3469 3470 zb1nextL0 = (zb1->zb_blkid + 1) << 3471 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 3472 3473 zb2thisobj = zb2->zb_object ? zb2->zb_object : 3474 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 3475 3476 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3477 uint64_t nextobj = zb1nextL0 * 3478 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 3479 return (nextobj <= zb2thisobj); 3480 } 3481 3482 if (zb1->zb_object < zb2thisobj) 3483 return (B_TRUE); 3484 if (zb1->zb_object > zb2thisobj) 3485 return (B_FALSE); 3486 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 3487 return (B_FALSE); 3488 return (zb1nextL0 <= zb2->zb_blkid); 3489} 3490