zio.c revision 300039
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
#if defined(__amd64__)
static int zio_use_uma = 1;
#else
static int zio_use_uma = 0;
#endif
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
    "Use uma(9) for ZIO allocations");
static int zio_exclude_metadata = 0;
TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
    "Exclude metadata buffers from dumps as well");

zio_trim_stats_t zio_trim_stats = {
	{ "bytes",		KSTAT_DATA_UINT64,
	    "Number of bytes successfully TRIMmed" },
	{ "success",		KSTAT_DATA_UINT64,
	    "Number of successful TRIM requests" },
	{ "unsupported",	KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed because TRIM is not supported" },
	{ "failed",		KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed for reasons other than not supported" },
};

static kstat_t *zio_trim_ksp;

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

#define	ZIO_PIPELINE_CONTINUE		0x100
#define	ZIO_PIPELINE_STOP		0x101

#define	BP_SPANB(indblkshift, level) \
	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
#define	COMPARE_META_LEVEL	0x80000000ul
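/*
 * Illustrative sketch (added comment, not in the original source), assuming
 * SPA_BLKPTRSHIFT == 7 (128-byte block pointers): with 128K indirect blocks
 * (indblkshift == 17), each indirect level fans out by 1 << (17 - 7) == 1024,
 * so BP_SPANB(17, 2) == 1ULL << 20, i.e. a level-2 block pointer spans about
 * a million level-0 blocks.
 */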
/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef illumos
#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif
#endif

void
zio_init(void)
{
	size_t c;
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (!zio_use_uma)
		goto out;

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
	 * for each quarter-power of 2.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0;

		while (!ISP2(p2))
			p2 &= p2 - 1;
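		/*
		 * Illustrative sketch (added comment, not in the original
		 * source): for size = 12K the loop above leaves p2 == 8K,
		 * the largest power of two <= size; 12K is a multiple of
		 * p2 >> 2 == 2K, so the cache created below gets
		 * MIN(2K, PAGESIZE) alignment rather than its own page.
		 */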
#ifdef illumos
#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
#endif /* illumos */
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = MIN(p2 >> 2, PAGESIZE);
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL,
			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}
out:

	zio_inject_init();

	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof(zio_trim_stats) / sizeof(kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zio_trim_ksp != NULL) {
		zio_trim_ksp->ks_data = &zio_trim_stats;
		kstat_install(zio_trim_ksp);
	}
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();

	if (zio_trim_ksp != NULL) {
		kstat_delete(zio_trim_ksp);
		zio_trim_ksp = NULL;
	}
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP|flags));
}
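/*
 * Illustrative usage sketch (added comment, hypothetical caller): buffers
 * must be released through the matching free routine with the same size that
 * was allocated, e.g.:
 *
 *	void *buf = zio_buf_alloc(lsize);
 *	...
 *	zio_buf_free(buf, lsize);
 */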
/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump.  (Thus reducing the
 * amount of kernel heap dumped to disk when the kernel panics)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_data_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 *        continue calling these functions until they return NULL.
 *        Otherwise, the next caller will pick up the list walk in
 *        some indeterminate state.  (Otherwise every caller would
 *        have to pass in a cookie to keep the state represented by
 *        io_walk_link, which gets annoying.)
 */
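/*
 * Illustrative walk pattern (added comment, hypothetical caller;
 * zio_reexecute() below uses the analogous child walk):
 *
 *	zio_t *pio;
 *	for (pio = zio_walk_parents(cio); pio != NULL;
 *	    pio = zio_walk_parents(cio))
 *		examine(pio);
 */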
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);

	(*countp)--;

	if (*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}
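/*
 * Illustrative sketch of the interlock pattern (added comment; see
 * zio_write_bp_init() below for a real caller): a pipeline stage that must
 * wait for its children does
 *
 *	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
 *		return (ZIO_PIPELINE_STOP);
 *
 * and zio_notify_parent() re-dispatches the stalled parent via zio_execute()
 * once the last child of that type reaches the awaited state.
 */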
/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}
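/*
 * Illustrative usage sketch (added comment, hypothetical caller): several
 * asynchronous I/Os are commonly grouped under a root zio and then waited on
 * as a unit, e.g.:
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	for each bp of interest:
 *		zio_nowait(zio_read(rio, spa, bp, buf, size, NULL, NULL,
 *		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, zb));
 *	error = zio_wait(rio);
 */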
zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}

void
zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
{
	if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
		zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
		    bp, (longlong_t)BP_GET_TYPE(bp));
	}
	if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
	    BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
		zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
		    bp, (longlong_t)BP_GET_CHECKSUM(bp));
	}
	if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
	    BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
		zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
		    bp, (longlong_t)BP_GET_COMPRESS(bp));
	}
	if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
		zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
		    bp, (longlong_t)BP_GET_LSIZE(bp));
	}
	if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
		zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
		    bp, (longlong_t)BP_GET_PSIZE(bp));
	}

	if (BP_IS_EMBEDDED(bp)) {
		if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
			zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
			    bp, (longlong_t)BPE_GET_ETYPE(bp));
		}
	}

	/*
	 * Pool-specific checks.
	 *
	 * Note: it would be nice to verify that the blk_birth and
	 * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
	 * allows the birth time of log blocks (and dmu_sync()-ed blocks
	 * that are in the log) to be arbitrarily large.
	 */
	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
		uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
		if (vdevid >= spa->spa_root_vdev->vdev_children) {
			zfs_panic_recover("blkptr at %p DVA %u has invalid "
			    "VDEV %llu",
			    bp, i, (longlong_t)vdevid);
			continue;
		}
		vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
		if (vd == NULL) {
			zfs_panic_recover("blkptr at %p DVA %u has invalid "
			    "VDEV %llu",
			    bp, i, (longlong_t)vdevid);
			continue;
		}
		if (vd->vdev_ops == &vdev_hole_ops) {
			zfs_panic_recover("blkptr at %p DVA %u has hole "
			    "VDEV %llu",
			    bp, i, (longlong_t)vdevid);
			continue;
		}
		if (vd->vdev_ops == &vdev_missing_ops) {
			/*
			 * "missing" vdevs are valid during import, but we
			 * don't have their detailed info (e.g. asize), so
			 * we can't perform any more checks on them.
			 */
			continue;
		}
		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
		uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
		if (BP_IS_GANG(bp))
			asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
		if (offset + asize > vd->vdev_asize) {
			zfs_panic_recover("blkptr at %p DVA %u has invalid "
			    "OFFSET %llu",
			    bp, i, (longlong_t)offset);
		}
	}
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	zfs_blkptr_verify(spa, bp);

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
    void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_physdone = physdone;
	zio->io_prop = *zp;

	/*
	 * Data can be NULL if we are going to call zio_write_override() to
	 * provide the already-allocated BP.  But we may need the data to
	 * verify a dedup hit (if requested).  In this case, don't try to
	 * dedup (just take the already-allocated BP verbatim).
	 */
	if (data == NULL && zio->io_prop.zp_dedup_verify) {
		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
	}

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync() keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}

void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{

	/*
	 * The check for EMBEDDED is a performance optimization.  We
	 * process the free here (by ignoring it) rather than
	 * putting it on the list and then processing it in zio_free_sync().
	 */
	if (BP_IS_EMBEDDED(bp))
		return;
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
		    BP_GET_PSIZE(bp), 0)));
	}
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    uint64_t size, enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	if (zfs_trim_enabled)
		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
		    ZIO_STAGE_VDEV_IO_ASSESS;
	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	flags |= ZIO_FLAG_DONT_QUEUE;

	zio = zio_create(pio, spa, txg, bp, NULL, size,
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	dprintf_bp(bp, "claiming in txg %llu", txg);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
		    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    offset, size, done, private, priority, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, zio_priority_t priority,
    enum zio_flag flags, zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	/* Not all IO types require vdev io done stage e.g. free */
	if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
		pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	zio->io_physdone = pio->io_physdone;
	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
		zio->io_logical->io_phys_children++;

	return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, zio_priority_t priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

zio_t *
zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
{

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL,
	    ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
}

void
zio_shrink(zio_t *zio, uint64_t size)
{
	ASSERT(zio->io_executor == NULL);
	ASSERT(zio->io_orig_size == zio->io_size);
	ASSERT(size <= zio->io_size);

	/*
	 * We don't shrink for raidz because of problems with the
	 * reconstruction when reading back less than the block size.
	 * Note, BP_IS_RAIDZ() assumes no compression.
	 */
	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	if (!BP_IS_RAIDZ(zio->io_bp))
		zio->io_orig_size = zio->io_size = size;
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize =
		    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
		decode_embedded_bp_compressed(bp, zio->io_data);
	} else {
		ASSERT(!BP_IS_EMBEDDED(bp));
	}

	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		if (BP_IS_EMBEDDED(bp))
			return (ZIO_PIPELINE_CONTINUE);

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
		    ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().
		 * For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
			encode_embedded_bp_compressed(bp,
			    cbuf, compress, lsize, psize);
			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
			BP_SET_TYPE(bp, zio->io_prop.zp_type);
			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
			zio_buf_free(cbuf, lsize);
			bp->blk_birth = zio->io_txg;
			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
			ASSERT(spa_feature_is_active(spa,
			    SPA_FEATURE_EMBEDDED_DATA));
			return (ZIO_PIPELINE_CONTINUE);
		} else {
			/*
			 * Round the compressed size up to the ashift
			 * of the smallest-ashift device, and zero the tail.
			 * This ensures that the compressed size of the BP
			 * (and thus compressratio property) are correct,
			 * in that we charge for the padding used to fill out
			 * the last sector.
			 */
			ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
			size_t rounded = (size_t)P2ROUNDUP(psize,
			    1ULL << spa->spa_min_ashift);
			if (rounded >= lsize) {
				compress = ZIO_COMPRESS_OFF;
				zio_buf_free(cbuf, lsize);
				psize = lsize;
			} else {
				bzero((char *)cbuf + psize, rounded - psize);
				psize = rounded;
				zio_push_transform(zio, cbuf,
				    psize, lsize, NULL);
			}
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
	    BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		if (zio->io_bp_orig.blk_birth != 0 &&
		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, zp->zp_type);
			BP_SET_LEVEL(bp, zp->zp_level);
			BP_SET_BIRTH(bp, zio->io_txg, 0);
		}
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = (cutinline ? TQ_FRONT : 0);

	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq if
	 * available.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);

	/*
	 * NB: We are assuming that the zio can only be dispatched
	 * to a single taskq at a time.  It would be a grievous error
	 * to dispatch the zio to another taskq at the same time.
	 */
#if defined(illumos) || !defined(_KERNEL)
	ASSERT(zio->io_tqent.tqent_next == NULL);
#else
	ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
#endif
	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
	    flags, &zio->io_tqent);
}

static boolean_t
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
		uint_t i;
		for (i = 0; i < tqs->stqs_count; i++) {
			if (taskq_member(tqs->stqs_taskq[i], executor))
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}

static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}

void
zio_delay_interrupt(zio_t *zio)
{
	/*
	 * The timeout_generic() function isn't defined in userspace, so
	 * rather than trying to implement the function, the zio delay
	 * functionality has been disabled for userspace builds.
	 */

#ifdef _KERNEL
	/*
	 * If io_target_timestamp is zero, then no delay has been registered
	 * for this IO, thus jump to the end of this function and "skip" the
	 * delay; issuing it directly to the zio layer.
	 */
	if (zio->io_target_timestamp != 0) {
		hrtime_t now = gethrtime();

		if (now >= zio->io_target_timestamp) {
			/*
			 * This IO has already taken longer than the target
			 * delay to complete, so we don't want to delay it
			 * any longer; we "miss" the delay and issue it
			 * directly to the zio layer.  This is likely due to
			 * the target latency being set to a value less than
			 * the underlying hardware can satisfy (e.g. delay
			 * set to 1ms, but the disks take 10ms to complete an
			 * IO request).
			 */

			DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
			    hrtime_t, now);

			zio_interrupt(zio);
		} else {
			hrtime_t diff = zio->io_target_timestamp - now;

			DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
			    hrtime_t, now, hrtime_t, diff);

			(void) timeout_generic(CALLOUT_NORMAL,
			    (void (*)(void *))zio_interrupt, zio, diff, 1, 0);
		}

		return;
	}
#endif

	DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
	zio_interrupt(zio);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 *
 *	(1) the I/O completes
 *	(2) the pipeline stalls waiting for dependent child I/Os
 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
 *	(4) the I/O is delegated by vdev-level caching or aggregation
 *	(5) the I/O is deferred due to vdev-level queueing
 *	(6) the I/O is handed off to another thread.
 *
 * In all cases, the pipeline stops whenever there's no CPU work; it never
 * burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
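/*
 * Illustrative sketch (added comment, not in the original source): io_stage
 * and io_pipeline are bitmasks of ZIO_STAGE_* values, so zio_execute() simply
 * shifts io_stage left until it hits the next bit set in io_pipeline.  For
 * example,
 *
 *	(void) zio_wait(zio_null(NULL, spa, NULL, NULL, NULL, 0));
 *
 * creates a zio with ZIO_INTERLOCK_PIPELINE, which advances straight from
 * ZIO_STAGE_OPEN to ZIO_STAGE_READY and then ZIO_STAGE_DONE, skipping every
 * vdev-level stage.
 */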
static zio_pipe_stage_t *zio_pipeline[];

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[highbit64(stage) - 1](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_root_zio "Godfather" I/O which
		 * will ensure they complete prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
	}

	zio_execute(zio);
}

/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_flags |= ZIO_FLAG_REEXECUTED;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */
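/*
 * Illustrative sizing sketch (added comment, not in the original source):
 * a gang header holds SPA_GBH_NBLKPTRS (three) block pointers, so a
 * single-level gang block maps at most three fragments; each additional
 * level of nesting multiplies that by three (9, 27, ...), which is how a
 * large logical block can be pieced together from many small allocations
 * on a badly fragmented pool.
 */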
Assembling the entire gang tree first 1782 * ensures that all the necessary gang header I/O has succeeded before 1783 * starting the actual work of free, claim, or write. Once the gang tree 1784 * is assembled, free and claim are in-memory operations that cannot fail. 1785 * 1786 * In the event that a gang write fails, zio_dva_unallocate() walks the 1787 * gang tree to immediately free (i.e. insert back into the space map) 1788 * everything we've allocated. This ensures that we don't get ENOSPC 1789 * errors during repeated suspend/resume cycles due to a flaky device. 1790 * 1791 * Gang rewrites only happen during sync-to-convergence. If we can't assemble 1792 * the gang tree, we won't modify the block, so we can safely defer the free 1793 * (knowing that the block is still intact). If we *can* assemble the gang 1794 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 1795 * each constituent bp and we can allocate a new block on the next sync pass. 1796 * 1797 * In all cases, the gang tree allows complete recovery from partial failure. 1798 * ========================================================================== 1799 */ 1800 1801static zio_t * 1802zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1803{ 1804 if (gn != NULL) 1805 return (pio); 1806 1807 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 1808 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1809 &pio->io_bookmark)); 1810} 1811 1812zio_t * 1813zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1814{ 1815 zio_t *zio; 1816 1817 if (gn != NULL) { 1818 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1819 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 1820 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1821 /* 1822 * As we rewrite each gang header, the pipeline will compute 1823 * a new gang block header checksum for it; but no one will 1824 * compute a new data checksum, so we do that here. The one 1825 * exception is the gang leader: the pipeline already computed 1826 * its data checksum because that stage precedes gang assembly. 1827 * (Presently, nothing actually uses interior data checksums; 1828 * this is just good hygiene.) 1829 */ 1830 if (gn != pio->io_gang_leader->io_gang_tree) { 1831 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 1832 data, BP_GET_PSIZE(bp)); 1833 } 1834 /* 1835 * If we are here to damage data for testing purposes, 1836 * leave the GBH alone so that we can detect the damage. 1837 */ 1838 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 1839 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 1840 } else { 1841 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1842 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 1843 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1844 } 1845 1846 return (zio); 1847} 1848 1849/* ARGSUSED */ 1850zio_t * 1851zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1852{ 1853 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 1854 BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), 1855 ZIO_GANG_CHILD_FLAGS(pio))); 1856} 1857 1858/* ARGSUSED */ 1859zio_t * 1860zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1861{ 1862 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 1863 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 1864} 1865 1866static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 1867 NULL, 1868 zio_read_gang, 1869 zio_rewrite_gang, 1870 zio_free_gang, 1871 zio_claim_gang, 1872 NULL 1873}; 1874 1875static void zio_gang_tree_assemble_done(zio_t *zio); 1876 1877static zio_gang_node_t * 1878zio_gang_node_alloc(zio_gang_node_t **gnpp) 1879{ 1880 zio_gang_node_t *gn; 1881 1882 ASSERT(*gnpp == NULL); 1883 1884 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 1885 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 1886 *gnpp = gn; 1887 1888 return (gn); 1889} 1890 1891static void 1892zio_gang_node_free(zio_gang_node_t **gnpp) 1893{ 1894 zio_gang_node_t *gn = *gnpp; 1895 1896 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1897 ASSERT(gn->gn_child[g] == NULL); 1898 1899 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 1900 kmem_free(gn, sizeof (*gn)); 1901 *gnpp = NULL; 1902} 1903 1904static void 1905zio_gang_tree_free(zio_gang_node_t **gnpp) 1906{ 1907 zio_gang_node_t *gn = *gnpp; 1908 1909 if (gn == NULL) 1910 return; 1911 1912 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1913 zio_gang_tree_free(&gn->gn_child[g]); 1914 1915 zio_gang_node_free(gnpp); 1916} 1917 1918static void 1919zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 1920{ 1921 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 1922 1923 ASSERT(gio->io_gang_leader == gio); 1924 ASSERT(BP_IS_GANG(bp)); 1925 1926 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 1927 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 1928 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 1929} 1930 1931static void 1932zio_gang_tree_assemble_done(zio_t *zio) 1933{ 1934 zio_t *gio = zio->io_gang_leader; 1935 zio_gang_node_t *gn = zio->io_private; 1936 blkptr_t *bp = zio->io_bp; 1937 1938 ASSERT(gio == zio_unique_parent(zio)); 1939 ASSERT(zio->io_child_count == 0); 1940 1941 if (zio->io_error) 1942 return; 1943 1944 if (BP_SHOULD_BYTESWAP(bp)) 1945 byteswap_uint64_array(zio->io_data, zio->io_size); 1946 1947 ASSERT(zio->io_data == gn->gn_gbh); 1948 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1949 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1950 1951 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1952 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1953 if (!BP_IS_GANG(gbp)) 1954 continue; 1955 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 1956 } 1957} 1958 1959static void 1960zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1961{ 1962 zio_t *gio = pio->io_gang_leader; 1963 zio_t *zio; 1964 1965 ASSERT(BP_IS_GANG(bp) == !!gn); 1966 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 1967 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 1968 1969 /* 1970 * If you're a gang header, your data is in gn->gn_gbh. 1971 * If you're a gang member, your data is in 'data' and gn == NULL. 
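 *
 * The callback invoked below comes from the zio_gang_issue_func[] table
 * above, indexed by the gang leader's i/o type: reads use zio_read_gang(),
 * rewrites use zio_rewrite_gang(), frees use zio_free_gang() and claims
 * use zio_claim_gang().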
1972 */ 1973 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1974 1975 if (gn != NULL) { 1976 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1977 1978 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1979 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1980 if (BP_IS_HOLE(gbp)) 1981 continue; 1982 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 1983 data = (char *)data + BP_GET_PSIZE(gbp); 1984 } 1985 } 1986 1987 if (gn == gio->io_gang_tree && gio->io_data != NULL) 1988 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 1989 1990 if (zio != pio) 1991 zio_nowait(zio); 1992} 1993 1994static int 1995zio_gang_assemble(zio_t *zio) 1996{ 1997 blkptr_t *bp = zio->io_bp; 1998 1999 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 2000 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2001 2002 zio->io_gang_leader = zio; 2003 2004 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 2005 2006 return (ZIO_PIPELINE_CONTINUE); 2007} 2008 2009static int 2010zio_gang_issue(zio_t *zio) 2011{ 2012 blkptr_t *bp = zio->io_bp; 2013 2014 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 2015 return (ZIO_PIPELINE_STOP); 2016 2017 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 2018 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2019 2020 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 2021 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 2022 else 2023 zio_gang_tree_free(&zio->io_gang_tree); 2024 2025 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2026 2027 return (ZIO_PIPELINE_CONTINUE); 2028} 2029 2030static void 2031zio_write_gang_member_ready(zio_t *zio) 2032{ 2033 zio_t *pio = zio_unique_parent(zio); 2034 zio_t *gio = zio->io_gang_leader; 2035 dva_t *cdva = zio->io_bp->blk_dva; 2036 dva_t *pdva = pio->io_bp->blk_dva; 2037 uint64_t asize; 2038 2039 if (BP_IS_HOLE(zio->io_bp)) 2040 return; 2041 2042 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 2043 2044 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 2045 ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 2046 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 2047 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 2048 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 2049 2050 mutex_enter(&pio->io_lock); 2051 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 2052 ASSERT(DVA_GET_GANG(&pdva[d])); 2053 asize = DVA_GET_ASIZE(&pdva[d]); 2054 asize += DVA_GET_ASIZE(&cdva[d]); 2055 DVA_SET_ASIZE(&pdva[d], asize); 2056 } 2057 mutex_exit(&pio->io_lock); 2058} 2059 2060static int 2061zio_write_gang_block(zio_t *pio) 2062{ 2063 spa_t *spa = pio->io_spa; 2064 blkptr_t *bp = pio->io_bp; 2065 zio_t *gio = pio->io_gang_leader; 2066 zio_t *zio; 2067 zio_gang_node_t *gn, **gnpp; 2068 zio_gbh_phys_t *gbh; 2069 uint64_t txg = pio->io_txg; 2070 uint64_t resid = pio->io_size; 2071 uint64_t lsize; 2072 int copies = gio->io_prop.zp_copies; 2073 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 2074 zio_prop_t zp; 2075 int error; 2076 2077 error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 2078 bp, gbh_copies, txg, pio == gio ? 
NULL : gio->io_bp, 2079 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 2080 if (error) { 2081 pio->io_error = error; 2082 return (ZIO_PIPELINE_CONTINUE); 2083 } 2084 2085 if (pio == gio) { 2086 gnpp = &gio->io_gang_tree; 2087 } else { 2088 gnpp = pio->io_private; 2089 ASSERT(pio->io_ready == zio_write_gang_member_ready); 2090 } 2091 2092 gn = zio_gang_node_alloc(gnpp); 2093 gbh = gn->gn_gbh; 2094 bzero(gbh, SPA_GANGBLOCKSIZE); 2095 2096 /* 2097 * Create the gang header. 2098 */ 2099 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 2100 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2101 2102 /* 2103 * Create and nowait the gang children. 2104 */ 2105 for (int g = 0; resid != 0; resid -= lsize, g++) { 2106 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 2107 SPA_MINBLOCKSIZE); 2108 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 2109 2110 zp.zp_checksum = gio->io_prop.zp_checksum; 2111 zp.zp_compress = ZIO_COMPRESS_OFF; 2112 zp.zp_type = DMU_OT_NONE; 2113 zp.zp_level = 0; 2114 zp.zp_copies = gio->io_prop.zp_copies; 2115 zp.zp_dedup = B_FALSE; 2116 zp.zp_dedup_verify = B_FALSE; 2117 zp.zp_nopwrite = B_FALSE; 2118 2119 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 2120 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 2121 zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g], 2122 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 2123 &pio->io_bookmark)); 2124 } 2125 2126 /* 2127 * Set pio's pipeline to just wait for zio to finish. 2128 */ 2129 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2130 2131 zio_nowait(zio); 2132 2133 return (ZIO_PIPELINE_CONTINUE); 2134} 2135 2136/* 2137 * The zio_nop_write stage in the pipeline determines if allocating a 2138 * new bp is necessary. The nopwrite feature can handle writes in 2139 * either syncing or open context (i.e. zil writes) and as a result is 2140 * mutually exclusive with dedup. 2141 * 2142 * By leveraging a cryptographically secure checksum, such as SHA256, we 2143 * can compare the checksums of the new data and the old to determine if 2144 * allocating a new block is required. Note that our requirements for 2145 * cryptographic strength are fairly weak: there can't be any accidental 2146 * hash collisions, but we don't need to be secure against intentional 2147 * (malicious) collisions. To trigger a nopwrite, you have to be able 2148 * to write the file to begin with, and triggering an incorrect (hash 2149 * collision) nopwrite is no worse than simply writing to the file. 2150 * That said, there are no known attacks against the checksum algorithms 2151 * used for nopwrite, assuming that the salt and the checksums 2152 * themselves remain secret. 2153 */ 2154static int 2155zio_nop_write(zio_t *zio) 2156{ 2157 blkptr_t *bp = zio->io_bp; 2158 blkptr_t *bp_orig = &zio->io_bp_orig; 2159 zio_prop_t *zp = &zio->io_prop; 2160 2161 ASSERT(BP_GET_LEVEL(bp) == 0); 2162 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 2163 ASSERT(zp->zp_nopwrite); 2164 ASSERT(!zp->zp_dedup); 2165 ASSERT(zio->io_bp_override == NULL); 2166 ASSERT(IO_IS_ALLOCATING(zio)); 2167 2168 /* 2169 * Check to see if the original bp and the new bp have matching 2170 * characteristics (i.e. same checksum, compression algorithms, etc). 2171 * If they don't then just continue with the pipeline which will 2172 * allocate a new bp. 
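 *
 * For example, if the existing block was written with a checksum that
 * does not carry ZCHECKSUM_FLAG_NOPWRITE (i.e. one that is not
 * cryptographically strong enough for this purpose), or the dataset's
 * compression or copies properties have changed since it was written,
 * the two bps are not comparable and we fall through to a normal
 * allocating write.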
2173 */ 2174 if (BP_IS_HOLE(bp_orig) || 2175 !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & 2176 ZCHECKSUM_FLAG_NOPWRITE) || 2177 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 2178 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 2179 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 2180 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 2181 return (ZIO_PIPELINE_CONTINUE); 2182 2183 /* 2184 * If the checksums match then reset the pipeline so that we 2185 * avoid allocating a new bp and issuing any I/O. 2186 */ 2187 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 2188 ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & 2189 ZCHECKSUM_FLAG_NOPWRITE); 2190 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 2191 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 2192 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 2193 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 2194 sizeof (uint64_t)) == 0); 2195 2196 *bp = *bp_orig; 2197 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2198 zio->io_flags |= ZIO_FLAG_NOPWRITE; 2199 } 2200 2201 return (ZIO_PIPELINE_CONTINUE); 2202} 2203 2204/* 2205 * ========================================================================== 2206 * Dedup 2207 * ========================================================================== 2208 */ 2209static void 2210zio_ddt_child_read_done(zio_t *zio) 2211{ 2212 blkptr_t *bp = zio->io_bp; 2213 ddt_entry_t *dde = zio->io_private; 2214 ddt_phys_t *ddp; 2215 zio_t *pio = zio_unique_parent(zio); 2216 2217 mutex_enter(&pio->io_lock); 2218 ddp = ddt_phys_select(dde, bp); 2219 if (zio->io_error == 0) 2220 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 2221 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 2222 dde->dde_repair_data = zio->io_data; 2223 else 2224 zio_buf_free(zio->io_data, zio->io_size); 2225 mutex_exit(&pio->io_lock); 2226} 2227 2228static int 2229zio_ddt_read_start(zio_t *zio) 2230{ 2231 blkptr_t *bp = zio->io_bp; 2232 2233 ASSERT(BP_GET_DEDUP(bp)); 2234 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2235 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2236 2237 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2238 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2239 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 2240 ddt_phys_t *ddp = dde->dde_phys; 2241 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 2242 blkptr_t blk; 2243 2244 ASSERT(zio->io_vsd == NULL); 2245 zio->io_vsd = dde; 2246 2247 if (ddp_self == NULL) 2248 return (ZIO_PIPELINE_CONTINUE); 2249 2250 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2251 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 2252 continue; 2253 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2254 &blk); 2255 zio_nowait(zio_read(zio, zio->io_spa, &blk, 2256 zio_buf_alloc(zio->io_size), zio->io_size, 2257 zio_ddt_child_read_done, dde, zio->io_priority, 2258 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 2259 &zio->io_bookmark)); 2260 } 2261 return (ZIO_PIPELINE_CONTINUE); 2262 } 2263 2264 zio_nowait(zio_read(zio, zio->io_spa, bp, 2265 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 2266 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2267 2268 return (ZIO_PIPELINE_CONTINUE); 2269} 2270 2271static int 2272zio_ddt_read_done(zio_t *zio) 2273{ 2274 blkptr_t *bp = zio->io_bp; 2275 2276 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 2277 return (ZIO_PIPELINE_STOP); 2278 2279 ASSERT(BP_GET_DEDUP(bp)); 2280 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2281 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2282 2283 if 
(zio->io_child_error[ZIO_CHILD_DDT]) { 2284 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2285 ddt_entry_t *dde = zio->io_vsd; 2286 if (ddt == NULL) { 2287 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2288 return (ZIO_PIPELINE_CONTINUE); 2289 } 2290 if (dde == NULL) { 2291 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2292 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2293 return (ZIO_PIPELINE_STOP); 2294 } 2295 if (dde->dde_repair_data != NULL) { 2296 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 2297 zio->io_child_error[ZIO_CHILD_DDT] = 0; 2298 } 2299 ddt_repair_done(ddt, dde); 2300 zio->io_vsd = NULL; 2301 } 2302 2303 ASSERT(zio->io_vsd == NULL); 2304 2305 return (ZIO_PIPELINE_CONTINUE); 2306} 2307 2308static boolean_t 2309zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2310{ 2311 spa_t *spa = zio->io_spa; 2312 2313 /* 2314 * Note: we compare the original data, not the transformed data, 2315 * because when zio->io_bp is an override bp, we will not have 2316 * pushed the I/O transforms. That's an important optimization 2317 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2318 */ 2319 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2320 zio_t *lio = dde->dde_lead_zio[p]; 2321 2322 if (lio != NULL) { 2323 return (lio->io_orig_size != zio->io_orig_size || 2324 bcmp(zio->io_orig_data, lio->io_orig_data, 2325 zio->io_orig_size) != 0); 2326 } 2327 } 2328 2329 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2330 ddt_phys_t *ddp = &dde->dde_phys[p]; 2331 2332 if (ddp->ddp_phys_birth != 0) { 2333 arc_buf_t *abuf = NULL; 2334 arc_flags_t aflags = ARC_FLAG_WAIT; 2335 blkptr_t blk = *zio->io_bp; 2336 int error; 2337 2338 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2339 2340 ddt_exit(ddt); 2341 2342 error = arc_read(NULL, spa, &blk, 2343 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2344 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2345 &aflags, &zio->io_bookmark); 2346 2347 if (error == 0) { 2348 if (arc_buf_size(abuf) != zio->io_orig_size || 2349 bcmp(abuf->b_data, zio->io_orig_data, 2350 zio->io_orig_size) != 0) 2351 error = SET_ERROR(EEXIST); 2352 VERIFY(arc_buf_remove_ref(abuf, &abuf)); 2353 } 2354 2355 ddt_enter(ddt); 2356 return (error != 0); 2357 } 2358 } 2359 2360 return (B_FALSE); 2361} 2362 2363static void 2364zio_ddt_child_write_ready(zio_t *zio) 2365{ 2366 int p = zio->io_prop.zp_copies; 2367 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2368 ddt_entry_t *dde = zio->io_private; 2369 ddt_phys_t *ddp = &dde->dde_phys[p]; 2370 zio_t *pio; 2371 2372 if (zio->io_error) 2373 return; 2374 2375 ddt_enter(ddt); 2376 2377 ASSERT(dde->dde_lead_zio[p] == zio); 2378 2379 ddt_phys_fill(ddp, zio->io_bp); 2380 2381 while ((pio = zio_walk_parents(zio)) != NULL) 2382 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2383 2384 ddt_exit(ddt); 2385} 2386 2387static void 2388zio_ddt_child_write_done(zio_t *zio) 2389{ 2390 int p = zio->io_prop.zp_copies; 2391 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2392 ddt_entry_t *dde = zio->io_private; 2393 ddt_phys_t *ddp = &dde->dde_phys[p]; 2394 2395 ddt_enter(ddt); 2396 2397 ASSERT(ddp->ddp_refcnt == 0); 2398 ASSERT(dde->dde_lead_zio[p] == zio); 2399 dde->dde_lead_zio[p] = NULL; 2400 2401 if (zio->io_error == 0) { 2402 while (zio_walk_parents(zio) != NULL) 2403 ddt_phys_addref(ddp); 2404 } else { 2405 ddt_phys_clear(ddp); 2406 } 2407 2408 ddt_exit(ddt); 2409} 2410 2411static void 2412zio_ddt_ditto_write_done(zio_t *zio) 2413{ 2414 int p = DDT_PHYS_DITTO; 2415 zio_prop_t *zp = 
&zio->io_prop; 2416 blkptr_t *bp = zio->io_bp; 2417 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2418 ddt_entry_t *dde = zio->io_private; 2419 ddt_phys_t *ddp = &dde->dde_phys[p]; 2420 ddt_key_t *ddk = &dde->dde_key; 2421 2422 ddt_enter(ddt); 2423 2424 ASSERT(ddp->ddp_refcnt == 0); 2425 ASSERT(dde->dde_lead_zio[p] == zio); 2426 dde->dde_lead_zio[p] = NULL; 2427 2428 if (zio->io_error == 0) { 2429 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2430 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2431 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2432 if (ddp->ddp_phys_birth != 0) 2433 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2434 ddt_phys_fill(ddp, bp); 2435 } 2436 2437 ddt_exit(ddt); 2438} 2439 2440static int 2441zio_ddt_write(zio_t *zio) 2442{ 2443 spa_t *spa = zio->io_spa; 2444 blkptr_t *bp = zio->io_bp; 2445 uint64_t txg = zio->io_txg; 2446 zio_prop_t *zp = &zio->io_prop; 2447 int p = zp->zp_copies; 2448 int ditto_copies; 2449 zio_t *cio = NULL; 2450 zio_t *dio = NULL; 2451 ddt_t *ddt = ddt_select(spa, bp); 2452 ddt_entry_t *dde; 2453 ddt_phys_t *ddp; 2454 2455 ASSERT(BP_GET_DEDUP(bp)); 2456 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2457 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2458 2459 ddt_enter(ddt); 2460 dde = ddt_lookup(ddt, bp, B_TRUE); 2461 ddp = &dde->dde_phys[p]; 2462 2463 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2464 /* 2465 * If we're using a weak checksum, upgrade to a strong checksum 2466 * and try again. If we're already using a strong checksum, 2467 * we can't resolve it, so just convert to an ordinary write. 2468 * (And automatically e-mail a paper to Nature?) 2469 */ 2470 if (!(zio_checksum_table[zp->zp_checksum].ci_flags & 2471 ZCHECKSUM_FLAG_DEDUP)) { 2472 zp->zp_checksum = spa_dedup_checksum(spa); 2473 zio_pop_transforms(zio); 2474 zio->io_stage = ZIO_STAGE_OPEN; 2475 BP_ZERO(bp); 2476 } else { 2477 zp->zp_dedup = B_FALSE; 2478 } 2479 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2480 ddt_exit(ddt); 2481 return (ZIO_PIPELINE_CONTINUE); 2482 } 2483 2484 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2485 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2486 2487 if (ditto_copies > ddt_ditto_copies_present(dde) && 2488 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2489 zio_prop_t czp = *zp; 2490 2491 czp.zp_copies = ditto_copies; 2492 2493 /* 2494 * If we arrived here with an override bp, we won't have run 2495 * the transform stack, so we won't have the data we need to 2496 * generate a child i/o. So, toss the override bp and restart. 2497 * This is safe, because using the override bp is just an 2498 * optimization; and it's rare, so the cost doesn't matter. 
2499 */ 2500 if (zio->io_bp_override) { 2501 zio_pop_transforms(zio); 2502 zio->io_stage = ZIO_STAGE_OPEN; 2503 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2504 zio->io_bp_override = NULL; 2505 BP_ZERO(bp); 2506 ddt_exit(ddt); 2507 return (ZIO_PIPELINE_CONTINUE); 2508 } 2509 2510 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2511 zio->io_orig_size, &czp, NULL, NULL, 2512 zio_ddt_ditto_write_done, dde, zio->io_priority, 2513 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2514 2515 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2516 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2517 } 2518 2519 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2520 if (ddp->ddp_phys_birth != 0) 2521 ddt_bp_fill(ddp, bp, txg); 2522 if (dde->dde_lead_zio[p] != NULL) 2523 zio_add_child(zio, dde->dde_lead_zio[p]); 2524 else 2525 ddt_phys_addref(ddp); 2526 } else if (zio->io_bp_override) { 2527 ASSERT(bp->blk_birth == txg); 2528 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2529 ddt_phys_fill(ddp, bp); 2530 ddt_phys_addref(ddp); 2531 } else { 2532 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2533 zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, 2534 zio_ddt_child_write_done, dde, zio->io_priority, 2535 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2536 2537 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2538 dde->dde_lead_zio[p] = cio; 2539 } 2540 2541 ddt_exit(ddt); 2542 2543 if (cio) 2544 zio_nowait(cio); 2545 if (dio) 2546 zio_nowait(dio); 2547 2548 return (ZIO_PIPELINE_CONTINUE); 2549} 2550 2551ddt_entry_t *freedde; /* for debugging */ 2552 2553static int 2554zio_ddt_free(zio_t *zio) 2555{ 2556 spa_t *spa = zio->io_spa; 2557 blkptr_t *bp = zio->io_bp; 2558 ddt_t *ddt = ddt_select(spa, bp); 2559 ddt_entry_t *dde; 2560 ddt_phys_t *ddp; 2561 2562 ASSERT(BP_GET_DEDUP(bp)); 2563 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2564 2565 ddt_enter(ddt); 2566 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2567 ddp = ddt_phys_select(dde, bp); 2568 ddt_phys_decref(ddp); 2569 ddt_exit(ddt); 2570 2571 return (ZIO_PIPELINE_CONTINUE); 2572} 2573 2574/* 2575 * ========================================================================== 2576 * Allocate and free blocks 2577 * ========================================================================== 2578 */ 2579static int 2580zio_dva_allocate(zio_t *zio) 2581{ 2582 spa_t *spa = zio->io_spa; 2583 metaslab_class_t *mc = spa_normal_class(spa); 2584 blkptr_t *bp = zio->io_bp; 2585 int error; 2586 int flags = 0; 2587 2588 if (zio->io_gang_leader == NULL) { 2589 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2590 zio->io_gang_leader = zio; 2591 } 2592 2593 ASSERT(BP_IS_HOLE(bp)); 2594 ASSERT0(BP_GET_NDVAS(bp)); 2595 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2596 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2597 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2598 2599 /* 2600 * The dump device does not support gang blocks so allocation on 2601 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2602 * the "fast" gang feature. 2603 */ 2604 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2605 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
2606 METASLAB_GANG_CHILD : 0; 2607 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2608 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2609 2610 if (error) { 2611 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2612 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2613 error); 2614 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2615 return (zio_write_gang_block(zio)); 2616 zio->io_error = error; 2617 } 2618 2619 return (ZIO_PIPELINE_CONTINUE); 2620} 2621 2622static int 2623zio_dva_free(zio_t *zio) 2624{ 2625 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2626 2627 return (ZIO_PIPELINE_CONTINUE); 2628} 2629 2630static int 2631zio_dva_claim(zio_t *zio) 2632{ 2633 int error; 2634 2635 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2636 if (error) 2637 zio->io_error = error; 2638 2639 return (ZIO_PIPELINE_CONTINUE); 2640} 2641 2642/* 2643 * Undo an allocation. This is used by zio_done() when an I/O fails 2644 * and we want to give back the block we just allocated. 2645 * This handles both normal blocks and gang blocks. 2646 */ 2647static void 2648zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2649{ 2650 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2651 ASSERT(zio->io_bp_override == NULL); 2652 2653 if (!BP_IS_HOLE(bp)) 2654 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2655 2656 if (gn != NULL) { 2657 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2658 zio_dva_unallocate(zio, gn->gn_child[g], 2659 &gn->gn_gbh->zg_blkptr[g]); 2660 } 2661 } 2662} 2663 2664/* 2665 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2666 */ 2667int 2668zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2669 uint64_t size, boolean_t use_slog) 2670{ 2671 int error = 1; 2672 2673 ASSERT(txg > spa_syncing_txg(spa)); 2674 2675 /* 2676 * ZIL blocks are always contiguous (i.e. not gang blocks) so we 2677 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2678 * when allocating them. 2679 */ 2680 if (use_slog) { 2681 error = metaslab_alloc(spa, spa_log_class(spa), size, 2682 new_bp, 1, txg, old_bp, 2683 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2684 } 2685 2686 if (error) { 2687 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2688 new_bp, 1, txg, old_bp, 2689 METASLAB_HINTBP_AVOID); 2690 } 2691 2692 if (error == 0) { 2693 BP_SET_LSIZE(new_bp, size); 2694 BP_SET_PSIZE(new_bp, size); 2695 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2696 BP_SET_CHECKSUM(new_bp, 2697 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2698 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2699 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2700 BP_SET_LEVEL(new_bp, 0); 2701 BP_SET_DEDUP(new_bp, 0); 2702 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2703 } 2704 2705 return (error); 2706} 2707 2708/* 2709 * Free an intent log block. 2710 */ 2711void 2712zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2713{ 2714 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2715 ASSERT(!BP_IS_GANG(bp)); 2716 2717 zio_free(spa, txg, bp); 2718} 2719 2720/* 2721 * ========================================================================== 2722 * Read, write and delete to physical devices 2723 * ========================================================================== 2724 */ 2725 2726 2727/* 2728 * Issue an I/O to the underlying vdev. Typically the issue pipeline 2729 * stops after this stage and will resume upon I/O completion. 
2730 * However, there are instances where the vdev layer may need to 2731 * continue the pipeline when an I/O was not issued. Since the I/O 2732 * that was sent to the vdev layer might be different than the one 2733 * currently active in the pipeline (see vdev_queue_io()), we explicitly 2734 * force the underlying vdev layers to call either zio_execute() or 2735 * zio_interrupt() to ensure that the pipeline continues with the correct I/O. 2736 */ 2737static int 2738zio_vdev_io_start(zio_t *zio) 2739{ 2740 vdev_t *vd = zio->io_vd; 2741 uint64_t align; 2742 spa_t *spa = zio->io_spa; 2743 int ret; 2744 2745 ASSERT(zio->io_error == 0); 2746 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2747 2748 if (vd == NULL) { 2749 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2750 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2751 2752 /* 2753 * The mirror_ops handle multiple DVAs in a single BP. 2754 */ 2755 vdev_mirror_ops.vdev_op_io_start(zio); 2756 return (ZIO_PIPELINE_STOP); 2757 } 2758 2759 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && 2760 zio->io_priority == ZIO_PRIORITY_NOW) { 2761 trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); 2762 return (ZIO_PIPELINE_CONTINUE); 2763 } 2764 2765 /* 2766 * We keep track of time-sensitive I/Os so that the scan thread 2767 * can quickly react to certain workloads. In particular, we care 2768 * about non-scrubbing, top-level reads and writes with the following 2769 * characteristics: 2770 * - synchronous writes of user data to non-slog devices 2771 * - any reads of user data 2772 * When these conditions are met, adjust the timestamp of spa_last_io 2773 * which allows the scan thread to adjust its workload accordingly. 2774 */ 2775 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2776 vd == vd->vdev_top && !vd->vdev_islog && 2777 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2778 zio->io_txg != spa_syncing_txg(spa)) { 2779 uint64_t old = spa->spa_last_io; 2780 uint64_t new = ddi_get_lbolt64(); 2781 if (old != new) 2782 (void) atomic_cas_64(&spa->spa_last_io, old, new); 2783 } 2784 2785 align = 1ULL << vd->vdev_top->vdev_ashift; 2786 2787 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 2788 P2PHASE(zio->io_size, align) != 0) { 2789 /* Transform logical writes to be a full physical block size. */ 2790 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2791 char *abuf = NULL; 2792 if (zio->io_type == ZIO_TYPE_READ || 2793 zio->io_type == ZIO_TYPE_WRITE) 2794 abuf = zio_buf_alloc(asize); 2795 ASSERT(vd == vd->vdev_top); 2796 if (zio->io_type == ZIO_TYPE_WRITE) { 2797 bcopy(zio->io_data, abuf, zio->io_size); 2798 bzero(abuf + zio->io_size, asize - zio->io_size); 2799 } 2800 zio_push_transform(zio, abuf, asize, abuf ? asize : 0, 2801 zio_subblock); 2802 } 2803 2804 /* 2805 * If this is not a physical io, make sure that it is properly aligned 2806 * before proceeding. 2807 */ 2808 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { 2809 ASSERT0(P2PHASE(zio->io_offset, align)); 2810 ASSERT0(P2PHASE(zio->io_size, align)); 2811 } else { 2812 /* 2813 * For the physical io we allow alignment 2814 * to a logical block size. 
2815 */ 2816 uint64_t log_align = 2817 1ULL << vd->vdev_top->vdev_logical_ashift; 2818 ASSERT0(P2PHASE(zio->io_offset, log_align)); 2819 ASSERT0(P2PHASE(zio->io_size, log_align)); 2820 } 2821 2822 VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); 2823 2824 /* 2825 * If this is a repair I/O, and there's no self-healing involved -- 2826 * that is, we're just resilvering what we expect to resilver -- 2827 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2828 * This prevents spurious resilvering with nested replication. 2829 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2830 * A is out of date, we'll read from C+D, then use the data to 2831 * resilver A+B -- but we don't actually want to resilver B, just A. 2832 * The top-level mirror has no way to know this, so instead we just 2833 * discard unnecessary repairs as we work our way down the vdev tree. 2834 * The same logic applies to any form of nested replication: 2835 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 2836 */ 2837 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2838 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2839 zio->io_txg != 0 && /* not a delegated i/o */ 2840 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2841 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2842 zio_vdev_io_bypass(zio); 2843 return (ZIO_PIPELINE_CONTINUE); 2844 } 2845 2846 if (vd->vdev_ops->vdev_op_leaf) { 2847 switch (zio->io_type) { 2848 case ZIO_TYPE_READ: 2849 if (vdev_cache_read(zio)) 2850 return (ZIO_PIPELINE_CONTINUE); 2851 /* FALLTHROUGH */ 2852 case ZIO_TYPE_WRITE: 2853 case ZIO_TYPE_FREE: 2854 if ((zio = vdev_queue_io(zio)) == NULL) 2855 return (ZIO_PIPELINE_STOP); 2856 2857 if (!vdev_accessible(vd, zio)) { 2858 zio->io_error = SET_ERROR(ENXIO); 2859 zio_interrupt(zio); 2860 return (ZIO_PIPELINE_STOP); 2861 } 2862 break; 2863 } 2864 /* 2865 * Note that we ignore repair writes for TRIM because they can 2866 * conflict with normal writes. This isn't an issue because, by 2867 * definition, we only repair blocks that aren't freed. 2868 */ 2869 if (zio->io_type == ZIO_TYPE_WRITE && 2870 !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2871 !trim_map_write_start(zio)) 2872 return (ZIO_PIPELINE_STOP); 2873 } 2874 2875 vd->vdev_ops->vdev_op_io_start(zio); 2876 return (ZIO_PIPELINE_STOP); 2877} 2878 2879static int 2880zio_vdev_io_done(zio_t *zio) 2881{ 2882 vdev_t *vd = zio->io_vd; 2883 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 2884 boolean_t unexpected_error = B_FALSE; 2885 2886 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2887 return (ZIO_PIPELINE_STOP); 2888 2889 ASSERT(zio->io_type == ZIO_TYPE_READ || 2890 zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); 2891 2892 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2893 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || 2894 zio->io_type == ZIO_TYPE_FREE)) { 2895 2896 if (zio->io_type == ZIO_TYPE_WRITE && 2897 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 2898 trim_map_write_done(zio); 2899 2900 vdev_queue_io_done(zio); 2901 2902 if (zio->io_type == ZIO_TYPE_WRITE) 2903 vdev_cache_write(zio); 2904 2905 if (zio_injection_enabled && zio->io_error == 0) 2906 zio->io_error = zio_handle_device_injection(vd, 2907 zio, EIO); 2908 2909 if (zio_injection_enabled && zio->io_error == 0) 2910 zio->io_error = zio_handle_label_injection(zio, EIO); 2911 2912 if (zio->io_error) { 2913 if (zio->io_error == ENOTSUP && 2914 zio->io_type == ZIO_TYPE_FREE) { 2915 /* Not all devices support TRIM. 
*/ 2916 } else if (!vdev_accessible(vd, zio)) { 2917 zio->io_error = SET_ERROR(ENXIO); 2918 } else { 2919 unexpected_error = B_TRUE; 2920 } 2921 } 2922 } 2923 2924 ops->vdev_op_io_done(zio); 2925 2926 if (unexpected_error) 2927 VERIFY(vdev_probe(vd, zio) == NULL); 2928 2929 return (ZIO_PIPELINE_CONTINUE); 2930} 2931 2932/* 2933 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2934 * disk, and use that to finish the checksum ereport later. 2935 */ 2936static void 2937zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2938 const void *good_buf) 2939{ 2940 /* no processing needed */ 2941 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2942} 2943 2944/*ARGSUSED*/ 2945void 2946zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2947{ 2948 void *buf = zio_buf_alloc(zio->io_size); 2949 2950 bcopy(zio->io_data, buf, zio->io_size); 2951 2952 zcr->zcr_cbinfo = zio->io_size; 2953 zcr->zcr_cbdata = buf; 2954 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2955 zcr->zcr_free = zio_buf_free; 2956} 2957 2958static int 2959zio_vdev_io_assess(zio_t *zio) 2960{ 2961 vdev_t *vd = zio->io_vd; 2962 2963 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2964 return (ZIO_PIPELINE_STOP); 2965 2966 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2967 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2968 2969 if (zio->io_vsd != NULL) { 2970 zio->io_vsd_ops->vsd_free(zio); 2971 zio->io_vsd = NULL; 2972 } 2973 2974 if (zio_injection_enabled && zio->io_error == 0) 2975 zio->io_error = zio_handle_fault_injection(zio, EIO); 2976 2977 if (zio->io_type == ZIO_TYPE_FREE && 2978 zio->io_priority != ZIO_PRIORITY_NOW) { 2979 switch (zio->io_error) { 2980 case 0: 2981 ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 2982 ZIO_TRIM_STAT_BUMP(success); 2983 break; 2984 case EOPNOTSUPP: 2985 ZIO_TRIM_STAT_BUMP(unsupported); 2986 break; 2987 default: 2988 ZIO_TRIM_STAT_BUMP(failed); 2989 break; 2990 } 2991 } 2992 2993 /* 2994 * If the I/O failed, determine whether we should attempt to retry it. 2995 * 2996 * On retry, we cut in line in the issue queue, since we don't want 2997 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 2998 */ 2999 if (zio->io_error && vd == NULL && 3000 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 3001 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 3002 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 3003 zio->io_error = 0; 3004 zio->io_flags |= ZIO_FLAG_IO_RETRY | 3005 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 3006 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 3007 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 3008 zio_requeue_io_start_cut_in_line); 3009 return (ZIO_PIPELINE_STOP); 3010 } 3011 3012 /* 3013 * If we got an error on a leaf device, convert it to ENXIO 3014 * if the device is not accessible at all. 3015 */ 3016 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 3017 !vdev_accessible(vd, zio)) 3018 zio->io_error = SET_ERROR(ENXIO); 3019 3020 /* 3021 * If we can't write to an interior vdev (mirror or RAID-Z), 3022 * set vdev_cant_write so that we stop trying to allocate from it. 
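 *
 * (ENXIO here came from an interior vdev -- the mirror or raidz group as
 * a whole rather than a single leaf -- so if the group cannot take writes
 * there is no point steering new allocations to it.)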
3023 */ 3024 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 3025 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 3026 vd->vdev_cant_write = B_TRUE; 3027 } 3028 3029 if (zio->io_error) 3030 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3031 3032 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 3033 zio->io_physdone != NULL) { 3034 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 3035 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 3036 zio->io_physdone(zio->io_logical); 3037 } 3038 3039 return (ZIO_PIPELINE_CONTINUE); 3040} 3041 3042void 3043zio_vdev_io_reissue(zio_t *zio) 3044{ 3045 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 3046 ASSERT(zio->io_error == 0); 3047 3048 zio->io_stage >>= 1; 3049} 3050 3051void 3052zio_vdev_io_redone(zio_t *zio) 3053{ 3054 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 3055 3056 zio->io_stage >>= 1; 3057} 3058 3059void 3060zio_vdev_io_bypass(zio_t *zio) 3061{ 3062 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 3063 ASSERT(zio->io_error == 0); 3064 3065 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 3066 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 3067} 3068 3069/* 3070 * ========================================================================== 3071 * Generate and verify checksums 3072 * ========================================================================== 3073 */ 3074static int 3075zio_checksum_generate(zio_t *zio) 3076{ 3077 blkptr_t *bp = zio->io_bp; 3078 enum zio_checksum checksum; 3079 3080 if (bp == NULL) { 3081 /* 3082 * This is zio_write_phys(). 3083 * We're either generating a label checksum, or none at all. 3084 */ 3085 checksum = zio->io_prop.zp_checksum; 3086 3087 if (checksum == ZIO_CHECKSUM_OFF) 3088 return (ZIO_PIPELINE_CONTINUE); 3089 3090 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 3091 } else { 3092 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 3093 ASSERT(!IO_IS_ALLOCATING(zio)); 3094 checksum = ZIO_CHECKSUM_GANG_HEADER; 3095 } else { 3096 checksum = BP_GET_CHECKSUM(bp); 3097 } 3098 } 3099 3100 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 3101 3102 return (ZIO_PIPELINE_CONTINUE); 3103} 3104 3105static int 3106zio_checksum_verify(zio_t *zio) 3107{ 3108 zio_bad_cksum_t info; 3109 blkptr_t *bp = zio->io_bp; 3110 int error; 3111 3112 ASSERT(zio->io_vd != NULL); 3113 3114 if (bp == NULL) { 3115 /* 3116 * This is zio_read_phys(). 3117 * We're either verifying a label checksum, or nothing at all. 3118 */ 3119 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 3120 return (ZIO_PIPELINE_CONTINUE); 3121 3122 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 3123 } 3124 3125 if ((error = zio_checksum_error(zio, &info)) != 0) { 3126 zio->io_error = error; 3127 if (error == ECKSUM && 3128 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3129 zfs_ereport_start_checksum(zio->io_spa, 3130 zio->io_vd, zio, zio->io_offset, 3131 zio->io_size, NULL, &info); 3132 } 3133 } 3134 3135 return (ZIO_PIPELINE_CONTINUE); 3136} 3137 3138/* 3139 * Called by RAID-Z to ensure we don't compute the checksum twice. 3140 */ 3141void 3142zio_checksum_verified(zio_t *zio) 3143{ 3144 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 3145} 3146 3147/* 3148 * ========================================================================== 3149 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 3150 * An error of 0 indicates success. ENXIO indicates whole-device failure, 3151 * which may be transient (e.g. unplugged) or permanent.
ECKSUM and EIO 3152 * indicate errors that are specific to one I/O, and most likely permanent. 3153 * Any other error is presumed to be worse because we weren't expecting it. 3154 * ========================================================================== 3155 */ 3156int 3157zio_worst_error(int e1, int e2) 3158{ 3159 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 3160 int r1, r2; 3161 3162 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 3163 if (e1 == zio_error_rank[r1]) 3164 break; 3165 3166 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 3167 if (e2 == zio_error_rank[r2]) 3168 break; 3169 3170 return (r1 > r2 ? e1 : e2); 3171} 3172 3173/* 3174 * ========================================================================== 3175 * I/O completion 3176 * ========================================================================== 3177 */ 3178static int 3179zio_ready(zio_t *zio) 3180{ 3181 blkptr_t *bp = zio->io_bp; 3182 zio_t *pio, *pio_next; 3183 3184 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 3185 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 3186 return (ZIO_PIPELINE_STOP); 3187 3188 if (zio->io_ready) { 3189 ASSERT(IO_IS_ALLOCATING(zio)); 3190 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 3191 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 3192 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 3193 3194 zio->io_ready(zio); 3195 } 3196 3197 if (bp != NULL && bp != &zio->io_bp_copy) 3198 zio->io_bp_copy = *bp; 3199 3200 if (zio->io_error) 3201 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3202 3203 mutex_enter(&zio->io_lock); 3204 zio->io_state[ZIO_WAIT_READY] = 1; 3205 pio = zio_walk_parents(zio); 3206 mutex_exit(&zio->io_lock); 3207 3208 /* 3209 * As we notify zio's parents, new parents could be added. 3210 * New parents go to the head of zio's io_parent_list, however, 3211 * so we will (correctly) not notify them. The remainder of zio's 3212 * io_parent_list, from 'pio_next' onward, cannot change because 3213 * all parents must wait for us to be done before they can be done. 3214 */ 3215 for (; pio != NULL; pio = pio_next) { 3216 pio_next = zio_walk_parents(zio); 3217 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 3218 } 3219 3220 if (zio->io_flags & ZIO_FLAG_NODATA) { 3221 if (BP_IS_GANG(bp)) { 3222 zio->io_flags &= ~ZIO_FLAG_NODATA; 3223 } else { 3224 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 3225 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 3226 } 3227 } 3228 3229 if (zio_injection_enabled && 3230 zio->io_spa->spa_syncing_txg == zio->io_txg) 3231 zio_handle_ignored_writes(zio); 3232 3233 return (ZIO_PIPELINE_CONTINUE); 3234} 3235 3236static int 3237zio_done(zio_t *zio) 3238{ 3239 spa_t *spa = zio->io_spa; 3240 zio_t *lio = zio->io_logical; 3241 blkptr_t *bp = zio->io_bp; 3242 vdev_t *vd = zio->io_vd; 3243 uint64_t psize = zio->io_size; 3244 zio_t *pio, *pio_next; 3245 3246 /* 3247 * If our children haven't all completed, 3248 * wait for them and then repeat this pipeline stage. 
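 *
 * (Returning ZIO_PIPELINE_STOP below does not lose the i/o: the last
 * completing child of the type we stalled on re-dispatches this zio via
 * zio_notify_parent(), which is how the stage gets repeated.)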
3249 */ 3250 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 3251 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 3252 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 3253 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 3254 return (ZIO_PIPELINE_STOP); 3255 3256 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 3257 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3258 ASSERT(zio->io_children[c][w] == 0); 3259 3260 if (bp != NULL && !BP_IS_EMBEDDED(bp)) { 3261 ASSERT(bp->blk_pad[0] == 0); 3262 ASSERT(bp->blk_pad[1] == 0); 3263 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 3264 (bp == zio_unique_parent(zio)->io_bp)); 3265 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 3266 zio->io_bp_override == NULL && 3267 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 3268 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 3269 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 3270 ASSERT(BP_COUNT_GANG(bp) == 0 || 3271 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 3272 } 3273 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 3274 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 3275 } 3276 3277 /* 3278 * If there were child vdev/gang/ddt errors, they apply to us now. 3279 */ 3280 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 3281 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 3282 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 3283 3284 /* 3285 * If the I/O on the transformed data was successful, generate any 3286 * checksum reports now while we still have the transformed data. 3287 */ 3288 if (zio->io_error == 0) { 3289 while (zio->io_cksum_report != NULL) { 3290 zio_cksum_report_t *zcr = zio->io_cksum_report; 3291 uint64_t align = zcr->zcr_align; 3292 uint64_t asize = P2ROUNDUP(psize, align); 3293 char *abuf = zio->io_data; 3294 3295 if (asize != psize) { 3296 abuf = zio_buf_alloc(asize); 3297 bcopy(zio->io_data, abuf, psize); 3298 bzero(abuf + psize, asize - psize); 3299 } 3300 3301 zio->io_cksum_report = zcr->zcr_next; 3302 zcr->zcr_next = NULL; 3303 zcr->zcr_finish(zcr, abuf); 3304 zfs_ereport_free_checksum(zcr); 3305 3306 if (asize != psize) 3307 zio_buf_free(abuf, asize); 3308 } 3309 } 3310 3311 zio_pop_transforms(zio); /* note: may set zio->io_error */ 3312 3313 vdev_stat_update(zio, psize); 3314 3315 if (zio->io_error) { 3316 /* 3317 * If this I/O is attached to a particular vdev, 3318 * generate an error message describing the I/O failure 3319 * at the block level. We ignore these errors if the 3320 * device is currently unavailable. 3321 */ 3322 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3323 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3324 3325 if ((zio->io_error == EIO || !(zio->io_flags & 3326 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3327 zio == lio) { 3328 /* 3329 * For logical I/O requests, tell the SPA to log the 3330 * error and generate a logical data ereport. 3331 */ 3332 spa_log_error(spa, zio); 3333 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3334 0, 0); 3335 } 3336 } 3337 3338 if (zio->io_error && zio == lio) { 3339 /* 3340 * Determine whether zio should be reexecuted. This will 3341 * propagate all the way to the root via zio_notify_parent(). 
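 *
 * Two flavors are set below: ZIO_REEXECUTE_NOW makes the root retry the
 * whole tree right away (handed off to the claim taskq at the bottom of
 * this function), while ZIO_REEXECUTE_SUSPEND parks the tree on the
 * pool's suspend root via zio_suspend() until zio_resume() is called.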
3342 */ 3343 ASSERT(vd == NULL && bp != NULL); 3344 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3345 3346 if (IO_IS_ALLOCATING(zio) && 3347 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3348 if (zio->io_error != ENOSPC) 3349 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3350 else 3351 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3352 } 3353 3354 if ((zio->io_type == ZIO_TYPE_READ || 3355 zio->io_type == ZIO_TYPE_FREE) && 3356 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3357 zio->io_error == ENXIO && 3358 spa_load_state(spa) == SPA_LOAD_NONE && 3359 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3360 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3361 3362 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3363 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3364 3365 /* 3366 * Here is a possibly good place to attempt to do 3367 * either combinatorial reconstruction or error correction 3368 * based on checksums. It also might be a good place 3369 * to send out preliminary ereports before we suspend 3370 * processing. 3371 */ 3372 } 3373 3374 /* 3375 * If there were logical child errors, they apply to us now. 3376 * We defer this until now to avoid conflating logical child 3377 * errors with errors that happened to the zio itself when 3378 * updating vdev stats and reporting FMA events above. 3379 */ 3380 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3381 3382 if ((zio->io_error || zio->io_reexecute) && 3383 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3384 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3385 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3386 3387 zio_gang_tree_free(&zio->io_gang_tree); 3388 3389 /* 3390 * Godfather I/Os should never suspend. 3391 */ 3392 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3393 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3394 zio->io_reexecute = 0; 3395 3396 if (zio->io_reexecute) { 3397 /* 3398 * This is a logical I/O that wants to reexecute. 3399 * 3400 * Reexecute is top-down. When an i/o fails, if it's not 3401 * the root, it simply notifies its parent and sticks around. 3402 * The parent, seeing that it still has children in zio_done(), 3403 * does the same. This percolates all the way up to the root. 3404 * The root i/o will reexecute or suspend the entire tree. 3405 * 3406 * This approach ensures that zio_reexecute() honors 3407 * all the original i/o dependency relationships, e.g. 3408 * parents not executing until children are ready. 3409 */ 3410 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3411 3412 zio->io_gang_leader = NULL; 3413 3414 mutex_enter(&zio->io_lock); 3415 zio->io_state[ZIO_WAIT_DONE] = 1; 3416 mutex_exit(&zio->io_lock); 3417 3418 /* 3419 * "The Godfather" I/O monitors its children but is 3420 * not a true parent to them. It will track them through 3421 * the pipeline but severs its ties whenever they get into 3422 * trouble (e.g. suspended). This allows "The Godfather" 3423 * I/O to return status without blocking. 3424 */ 3425 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3426 zio_link_t *zl = zio->io_walk_link; 3427 pio_next = zio_walk_parents(zio); 3428 3429 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3430 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3431 zio_remove_child(pio, zio, zl); 3432 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3433 } 3434 } 3435 3436 if ((pio = zio_unique_parent(zio)) != NULL) { 3437 /* 3438 * We're not a root i/o, so there's nothing to do 3439 * but notify our parent. 
Don't propagate errors 3440 * upward since we haven't permanently failed yet. 3441 */ 3442 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3443 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3444 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3445 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3446 /* 3447 * We'd fail again if we reexecuted now, so suspend 3448 * until conditions improve (e.g. device comes online). 3449 */ 3450 zio_suspend(spa, zio); 3451 } else { 3452 /* 3453 * Reexecution is potentially a huge amount of work. 3454 * Hand it off to the otherwise-unused claim taskq. 3455 */ 3456#if defined(illumos) || !defined(_KERNEL) 3457 ASSERT(zio->io_tqent.tqent_next == NULL); 3458#else 3459 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 3460#endif 3461 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3462 ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3463 0, &zio->io_tqent); 3464 } 3465 return (ZIO_PIPELINE_STOP); 3466 } 3467 3468 ASSERT(zio->io_child_count == 0); 3469 ASSERT(zio->io_reexecute == 0); 3470 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3471 3472 /* 3473 * Report any checksum errors, since the I/O is complete. 3474 */ 3475 while (zio->io_cksum_report != NULL) { 3476 zio_cksum_report_t *zcr = zio->io_cksum_report; 3477 zio->io_cksum_report = zcr->zcr_next; 3478 zcr->zcr_next = NULL; 3479 zcr->zcr_finish(zcr, NULL); 3480 zfs_ereport_free_checksum(zcr); 3481 } 3482 3483 /* 3484 * It is the responsibility of the done callback to ensure that this 3485 * particular zio is no longer discoverable for adoption, and as 3486 * such, cannot acquire any new parents. 3487 */ 3488 if (zio->io_done) 3489 zio->io_done(zio); 3490 3491 mutex_enter(&zio->io_lock); 3492 zio->io_state[ZIO_WAIT_DONE] = 1; 3493 mutex_exit(&zio->io_lock); 3494 3495 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3496 zio_link_t *zl = zio->io_walk_link; 3497 pio_next = zio_walk_parents(zio); 3498 zio_remove_child(pio, zio, zl); 3499 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3500 } 3501 3502 if (zio->io_waiter != NULL) { 3503 mutex_enter(&zio->io_lock); 3504 zio->io_executor = NULL; 3505 cv_broadcast(&zio->io_cv); 3506 mutex_exit(&zio->io_lock); 3507 } else { 3508 zio_destroy(zio); 3509 } 3510 3511 return (ZIO_PIPELINE_STOP); 3512} 3513 3514/* 3515 * ========================================================================== 3516 * I/O pipeline definition 3517 * ========================================================================== 3518 */ 3519static zio_pipe_stage_t *zio_pipeline[] = { 3520 NULL, 3521 zio_read_bp_init, 3522 zio_free_bp_init, 3523 zio_issue_async, 3524 zio_write_bp_init, 3525 zio_checksum_generate, 3526 zio_nop_write, 3527 zio_ddt_read_start, 3528 zio_ddt_read_done, 3529 zio_ddt_write, 3530 zio_ddt_free, 3531 zio_gang_assemble, 3532 zio_gang_issue, 3533 zio_dva_allocate, 3534 zio_dva_free, 3535 zio_dva_claim, 3536 zio_ready, 3537 zio_vdev_io_start, 3538 zio_vdev_io_done, 3539 zio_vdev_io_assess, 3540 zio_checksum_verify, 3541 zio_done 3542}; 3543 3544 3545 3546 3547/* 3548 * Compare two zbookmark_phys_t's to see which we would reach first in a 3549 * pre-order traversal of the object tree. 3550 * 3551 * This is simple in every case aside from the meta-dnode object. For all other 3552 * objects, we traverse them in order (object 1 before object 2, and so on). 3553 * However, all of these objects are traversed while traversing object 0, since 3554 * the data it points to is the list of objects. 
Thus, we need to convert to a 3555 * canonical representation so we can compare meta-dnode bookmarks to 3556 * non-meta-dnode bookmarks. 3557 * 3558 * We do this by calculating "equivalents" for each field of the zbookmark. 3559 * zbookmarks outside of the meta-dnode use their own object and level, and 3560 * calculate the level 0 equivalent (the first L0 blkid that is contained in the 3561 * blocks this bookmark refers to) by multiplying their blkid by their span 3562 * (the number of L0 blocks contained within one block at their level). 3563 * zbookmarks inside the meta-dnode calculate their object equivalent 3564 * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use 3565 * level + 1<<31 (any value larger than a level could ever be) for their level. 3566 * This causes them to always compare before a bookmark in their object 3567 * equivalent, compare appropriately to bookmarks in other objects, and to 3568 * compare appropriately to other bookmarks in the meta-dnode. 3569 */ 3570int 3571zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, 3572 const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) 3573{ 3574 /* 3575 * These variables represent the "equivalent" values for the zbookmark, 3576 * after converting zbookmarks inside the meta dnode to their 3577 * normal-object equivalents. 3578 */ 3579 uint64_t zb1obj, zb2obj; 3580 uint64_t zb1L0, zb2L0; 3581 uint64_t zb1level, zb2level; 3582 3583 if (zb1->zb_object == zb2->zb_object && 3584 zb1->zb_level == zb2->zb_level && 3585 zb1->zb_blkid == zb2->zb_blkid) 3586 return (0); 3587 3588 /* 3589 * BP_SPANB calculates the span in blocks. 3590 */ 3591 zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); 3592 zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); 3593 3594 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3595 zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); 3596 zb1L0 = 0; 3597 zb1level = zb1->zb_level + COMPARE_META_LEVEL; 3598 } else { 3599 zb1obj = zb1->zb_object; 3600 zb1level = zb1->zb_level; 3601 } 3602 3603 if (zb2->zb_object == DMU_META_DNODE_OBJECT) { 3604 zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); 3605 zb2L0 = 0; 3606 zb2level = zb2->zb_level + COMPARE_META_LEVEL; 3607 } else { 3608 zb2obj = zb2->zb_object; 3609 zb2level = zb2->zb_level; 3610 } 3611 3612 /* Now that we have a canonical representation, do the comparison. */ 3613 if (zb1obj != zb2obj) 3614 return (zb1obj < zb2obj ? -1 : 1); 3615 else if (zb1L0 != zb2L0) 3616 return (zb1L0 < zb2L0 ? -1 : 1); 3617 else if (zb1level != zb2level) 3618 return (zb1level > zb2level ? -1 : 1); 3619 /* 3620 * This can (theoretically) happen if the bookmarks have the same object 3621 * and level, but different blkids, if the block sizes are not the same. 3622 * There is presently no way to change the indirect block sizes 3623 */ 3624 return (0); 3625} 3626 3627/* 3628 * This function checks the following: given that last_block is the place that 3629 * our traversal stopped last time, does that guarantee that we've visited 3630 * every node under subtree_root? Therefore, we can't just use the raw output 3631 * of zbookmark_compare. We have to pass in a modified version of 3632 * subtree_root; by incrementing the block id, and then checking whether 3633 * last_block is before or equal to that, we can tell whether or not having 3634 * visited last_block implies that all of subtree_root's children have been 3635 * visited. 
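 *
 * A worked example (the numbers assume 128K indirect blocks, i.e. 1024
 * block pointers per indirect; the real span comes from the dnode's
 * indirect block shift): for subtree_root = <object 5, level 1, blkid 3>,
 * the subtree covers L0 blkids 3072-4095.  Incrementing gives blkid 4,
 * whose first L0 equivalent is 4096.  If last_block (always level 0) is
 * at or beyond L0 blkid 4096 of object 5, or in a later object, every
 * leaf under the original blkid 3 must already have been visited, so we
 * return B_TRUE.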
3636 */ 3637boolean_t 3638zbookmark_subtree_completed(const dnode_phys_t *dnp, 3639 const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) 3640{ 3641 zbookmark_phys_t mod_zb = *subtree_root; 3642 mod_zb.zb_blkid++; 3643 ASSERT(last_block->zb_level == 0); 3644 3645 /* The objset_phys_t isn't before anything. */ 3646 if (dnp == NULL) 3647 return (B_FALSE); 3648 3649 /* 3650 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the 3651 * data block size in sectors, because that variable is only used if 3652 * the bookmark refers to a block in the meta-dnode. Since we don't 3653 * know what object it refers to without examining it, and there's no 3654 * harm in passing in this value in other cases, we always pass it in. 3655 * 3656 * We pass in 0 for the indirect block size shift because zb2 must be 3657 * level 0. The indirect block size is only used to calculate the span 3658 * of the bookmark, but since the bookmark must be level 0, the span is 3659 * always 1, so the math works out. 3660 * 3661 * If you make changes to how the zbookmark_compare code works, be sure 3662 * that this code still works afterwards. 3663 */ 3664 return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 3665 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, 3666 last_block) <= 0); 3667} 3668