zio.c revision 307266
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28#include <sys/sysmacros.h> 29#include <sys/zfs_context.h> 30#include <sys/fm/fs/zfs.h> 31#include <sys/spa.h> 32#include <sys/txg.h> 33#include <sys/spa_impl.h> 34#include <sys/vdev_impl.h> 35#include <sys/zio_impl.h> 36#include <sys/zio_compress.h> 37#include <sys/zio_checksum.h> 38#include <sys/dmu_objset.h> 39#include <sys/arc.h> 40#include <sys/ddt.h> 41#include <sys/trim_map.h> 42#include <sys/blkptr.h> 43#include <sys/zfeature.h> 44 45SYSCTL_DECL(_vfs_zfs); 46SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); 47#if defined(__amd64__) 48static int zio_use_uma = 1; 49#else 50static int zio_use_uma = 0; 51#endif 52TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma); 53SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, 54 "Use uma(9) for ZIO allocations"); 55static int zio_exclude_metadata = 0; 56TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata); 57SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, 58 "Exclude metadata buffers from dumps as well"); 59 60zio_trim_stats_t zio_trim_stats = { 61 { "bytes", KSTAT_DATA_UINT64, 62 "Number of bytes successfully TRIMmed" }, 63 { "success", KSTAT_DATA_UINT64, 64 "Number of successful TRIM requests" }, 65 { "unsupported", KSTAT_DATA_UINT64, 66 "Number of TRIM requests that failed because TRIM is not supported" }, 67 { "failed", KSTAT_DATA_UINT64, 68 "Number of TRIM requests that failed for reasons other than not supported" }, 69}; 70 71static kstat_t *zio_trim_ksp; 72 73/* 74 * ========================================================================== 75 * I/O type descriptions 76 * ========================================================================== 77 */ 78const char *zio_type_name[ZIO_TYPES] = { 79 "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", 80 "zio_ioctl" 81}; 82 83/* 84 * ========================================================================== 85 * I/O kmem caches 86 * ========================================================================== 87 */ 88kmem_cache_t *zio_cache; 89kmem_cache_t *zio_link_cache; 90kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 91kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 92 93#ifdef _KERNEL 94extern vmem_t *zio_alloc_arena; 95#endif 96 97#define ZIO_PIPELINE_CONTINUE 0x100 98#define ZIO_PIPELINE_STOP 0x101 99 100#define BP_SPANB(indblkshift, 
level) \ 101 (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) 102#define COMPARE_META_LEVEL 0x80000000ul 103/* 104 * The following actions directly effect the spa's sync-to-convergence logic. 105 * The values below define the sync pass when we start performing the action. 106 * Care should be taken when changing these values as they directly impact 107 * spa_sync() performance. Tuning these values may introduce subtle performance 108 * pathologies and should only be done in the context of performance analysis. 109 * These tunables will eventually be removed and replaced with #defines once 110 * enough analysis has been done to determine optimal values. 111 * 112 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that 113 * regular blocks are not deferred. 114 */ 115int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ 116TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free); 117SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, 118 &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); 119int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ 120TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress); 121SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, 122 &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); 123int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ 124TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite); 125SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, 126 &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); 127 128/* 129 * An allocating zio is one that either currently has the DVA allocate 130 * stage set or will have it later in its lifetime. 131 */ 132#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 133 134boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; 135 136#ifdef illumos 137#ifdef ZFS_DEBUG 138int zio_buf_debug_limit = 16384; 139#else 140int zio_buf_debug_limit = 0; 141#endif 142#endif 143 144void 145zio_init(void) 146{ 147 size_t c; 148 zio_cache = kmem_cache_create("zio_cache", 149 sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 150 zio_link_cache = kmem_cache_create("zio_link_cache", 151 sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 152 if (!zio_use_uma) 153 goto out; 154 155 /* 156 * For small buffers, we want a cache for each multiple of 157 * SPA_MINBLOCKSIZE. For larger buffers, we want a cache 158 * for each quarter-power of 2. 159 */ 160 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 161 size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 162 size_t p2 = size; 163 size_t align = 0; 164 int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0; 165 166 while (!ISP2(p2)) 167 p2 &= p2 - 1; 168 169#ifdef illumos 170#ifndef _KERNEL 171 /* 172 * If we are using watchpoints, put each buffer on its own page, 173 * to eliminate the performance overhead of trapping to the 174 * kernel when modifying a non-watched buffer that shares the 175 * page with a watched buffer. 
176 */ 177 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) 178 continue; 179#endif 180#endif /* illumos */ 181 if (size <= 4 * SPA_MINBLOCKSIZE) { 182 align = SPA_MINBLOCKSIZE; 183 } else if (IS_P2ALIGNED(size, p2 >> 2)) { 184 align = MIN(p2 >> 2, PAGESIZE); 185 } 186 187 if (align != 0) { 188 char name[36]; 189 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 190 zio_buf_cache[c] = kmem_cache_create(name, size, 191 align, NULL, NULL, NULL, NULL, NULL, cflags); 192 193 /* 194 * Since zio_data bufs do not appear in crash dumps, we 195 * pass KMC_NOTOUCH so that no allocator metadata is 196 * stored with the buffers. 197 */ 198 (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 199 zio_data_buf_cache[c] = kmem_cache_create(name, size, 200 align, NULL, NULL, NULL, NULL, NULL, 201 cflags | KMC_NOTOUCH | KMC_NODEBUG); 202 } 203 } 204 205 while (--c != 0) { 206 ASSERT(zio_buf_cache[c] != NULL); 207 if (zio_buf_cache[c - 1] == NULL) 208 zio_buf_cache[c - 1] = zio_buf_cache[c]; 209 210 ASSERT(zio_data_buf_cache[c] != NULL); 211 if (zio_data_buf_cache[c - 1] == NULL) 212 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 213 } 214out: 215 216 zio_inject_init(); 217 218 zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", 219 KSTAT_TYPE_NAMED, 220 sizeof(zio_trim_stats) / sizeof(kstat_named_t), 221 KSTAT_FLAG_VIRTUAL); 222 223 if (zio_trim_ksp != NULL) { 224 zio_trim_ksp->ks_data = &zio_trim_stats; 225 kstat_install(zio_trim_ksp); 226 } 227} 228 229void 230zio_fini(void) 231{ 232 size_t c; 233 kmem_cache_t *last_cache = NULL; 234 kmem_cache_t *last_data_cache = NULL; 235 236 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 237 if (zio_buf_cache[c] != last_cache) { 238 last_cache = zio_buf_cache[c]; 239 kmem_cache_destroy(zio_buf_cache[c]); 240 } 241 zio_buf_cache[c] = NULL; 242 243 if (zio_data_buf_cache[c] != last_data_cache) { 244 last_data_cache = zio_data_buf_cache[c]; 245 kmem_cache_destroy(zio_data_buf_cache[c]); 246 } 247 zio_data_buf_cache[c] = NULL; 248 } 249 250 kmem_cache_destroy(zio_link_cache); 251 kmem_cache_destroy(zio_cache); 252 253 zio_inject_fini(); 254 255 if (zio_trim_ksp != NULL) { 256 kstat_delete(zio_trim_ksp); 257 zio_trim_ksp = NULL; 258 } 259} 260 261/* 262 * ========================================================================== 263 * Allocate and free I/O buffers 264 * ========================================================================== 265 */ 266 267/* 268 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 269 * crashdump if the kernel panics, so use it judiciously. Obviously, it's 270 * useful to inspect ZFS metadata, but if possible, we should avoid keeping 271 * excess / transient data in-core during a crashdump. 272 */ 273void * 274zio_buf_alloc(size_t size) 275{ 276 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 277 int flags = zio_exclude_metadata ? KM_NODEBUG : 0; 278 279 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 280 281 if (zio_use_uma) 282 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 283 else 284 return (kmem_alloc(size, KM_SLEEP|flags)); 285} 286 287/* 288 * Use zio_data_buf_alloc to allocate data. The data will not appear in a 289 * crashdump if the kernel panics. This exists so that we will limit the amount 290 * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount 291 * of kernel heap dumped to disk when the kernel panics) 292 */ 293void * 294zio_data_buf_alloc(size_t size) 295{ 296 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 297 298 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 299 300 if (zio_use_uma) 301 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 302 else 303 return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); 304} 305 306void 307zio_buf_free(void *buf, size_t size) 308{ 309 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 310 311 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 312 313 if (zio_use_uma) 314 kmem_cache_free(zio_buf_cache[c], buf); 315 else 316 kmem_free(buf, size); 317} 318 319void 320zio_data_buf_free(void *buf, size_t size) 321{ 322 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 323 324 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 325 326 if (zio_use_uma) 327 kmem_cache_free(zio_data_buf_cache[c], buf); 328 else 329 kmem_free(buf, size); 330} 331 332/* 333 * ========================================================================== 334 * Push and pop I/O transform buffers 335 * ========================================================================== 336 */ 337void 338zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 339 zio_transform_func_t *transform) 340{ 341 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 342 343 zt->zt_orig_data = zio->io_data; 344 zt->zt_orig_size = zio->io_size; 345 zt->zt_bufsize = bufsize; 346 zt->zt_transform = transform; 347 348 zt->zt_next = zio->io_transform_stack; 349 zio->io_transform_stack = zt; 350 351 zio->io_data = data; 352 zio->io_size = size; 353} 354 355void 356zio_pop_transforms(zio_t *zio) 357{ 358 zio_transform_t *zt; 359 360 while ((zt = zio->io_transform_stack) != NULL) { 361 if (zt->zt_transform != NULL) 362 zt->zt_transform(zio, 363 zt->zt_orig_data, zt->zt_orig_size); 364 365 if (zt->zt_bufsize != 0) 366 zio_buf_free(zio->io_data, zt->zt_bufsize); 367 368 zio->io_data = zt->zt_orig_data; 369 zio->io_size = zt->zt_orig_size; 370 zio->io_transform_stack = zt->zt_next; 371 372 kmem_free(zt, sizeof (zio_transform_t)); 373 } 374} 375 376/* 377 * ========================================================================== 378 * I/O transform callbacks for subblocks and decompression 379 * ========================================================================== 380 */ 381static void 382zio_subblock(zio_t *zio, void *data, uint64_t size) 383{ 384 ASSERT(zio->io_size > size); 385 386 if (zio->io_type == ZIO_TYPE_READ) 387 bcopy(zio->io_data, data, size); 388} 389 390static void 391zio_decompress(zio_t *zio, void *data, uint64_t size) 392{ 393 if (zio->io_error == 0 && 394 zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 395 zio->io_data, data, zio->io_size, size) != 0) 396 zio->io_error = SET_ERROR(EIO); 397} 398 399/* 400 * ========================================================================== 401 * I/O parent/child relationships and pipeline interlocks 402 * ========================================================================== 403 */ 404/* 405 * NOTE - Callers to zio_walk_parents() and zio_walk_children must 406 * continue calling these functions until they return NULL. 407 * Otherwise, the next caller will pick up the list walk in 408 * some indeterminate state. (Otherwise every caller would 409 * have to pass in a cookie to keep the state represented by 410 * io_walk_link, which gets annoying.) 
411 */ 412zio_t * 413zio_walk_parents(zio_t *cio) 414{ 415 zio_link_t *zl = cio->io_walk_link; 416 list_t *pl = &cio->io_parent_list; 417 418 zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); 419 cio->io_walk_link = zl; 420 421 if (zl == NULL) 422 return (NULL); 423 424 ASSERT(zl->zl_child == cio); 425 return (zl->zl_parent); 426} 427 428zio_t * 429zio_walk_children(zio_t *pio) 430{ 431 zio_link_t *zl = pio->io_walk_link; 432 list_t *cl = &pio->io_child_list; 433 434 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); 435 pio->io_walk_link = zl; 436 437 if (zl == NULL) 438 return (NULL); 439 440 ASSERT(zl->zl_parent == pio); 441 return (zl->zl_child); 442} 443 444zio_t * 445zio_unique_parent(zio_t *cio) 446{ 447 zio_t *pio = zio_walk_parents(cio); 448 449 VERIFY(zio_walk_parents(cio) == NULL); 450 return (pio); 451} 452 453void 454zio_add_child(zio_t *pio, zio_t *cio) 455{ 456 zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 457 458 /* 459 * Logical I/Os can have logical, gang, or vdev children. 460 * Gang I/Os can have gang or vdev children. 461 * Vdev I/Os can only have vdev children. 462 * The following ASSERT captures all of these constraints. 463 */ 464 ASSERT(cio->io_child_type <= pio->io_child_type); 465 466 zl->zl_parent = pio; 467 zl->zl_child = cio; 468 469 mutex_enter(&cio->io_lock); 470 mutex_enter(&pio->io_lock); 471 472 ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 473 474 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 475 pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 476 477 list_insert_head(&pio->io_child_list, zl); 478 list_insert_head(&cio->io_parent_list, zl); 479 480 pio->io_child_count++; 481 cio->io_parent_count++; 482 483 mutex_exit(&pio->io_lock); 484 mutex_exit(&cio->io_lock); 485} 486 487static void 488zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) 489{ 490 ASSERT(zl->zl_parent == pio); 491 ASSERT(zl->zl_child == cio); 492 493 mutex_enter(&cio->io_lock); 494 mutex_enter(&pio->io_lock); 495 496 list_remove(&pio->io_child_list, zl); 497 list_remove(&cio->io_parent_list, zl); 498 499 pio->io_child_count--; 500 cio->io_parent_count--; 501 502 mutex_exit(&pio->io_lock); 503 mutex_exit(&cio->io_lock); 504 505 kmem_cache_free(zio_link_cache, zl); 506} 507 508static boolean_t 509zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) 510{ 511 uint64_t *countp = &zio->io_children[child][wait]; 512 boolean_t waiting = B_FALSE; 513 514 mutex_enter(&zio->io_lock); 515 ASSERT(zio->io_stall == NULL); 516 if (*countp != 0) { 517 zio->io_stage >>= 1; 518 zio->io_stall = countp; 519 waiting = B_TRUE; 520 } 521 mutex_exit(&zio->io_lock); 522 523 return (waiting); 524} 525 526static void 527zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 528{ 529 uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 530 int *errorp = &pio->io_child_error[zio->io_child_type]; 531 532 mutex_enter(&pio->io_lock); 533 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 534 *errorp = zio_worst_error(*errorp, zio->io_error); 535 pio->io_reexecute |= zio->io_reexecute; 536 ASSERT3U(*countp, >, 0); 537 538 (*countp)--; 539 540 if (*countp == 0 && pio->io_stall == countp) { 541 pio->io_stall = NULL; 542 mutex_exit(&pio->io_lock); 543 zio_execute(pio); 544 } else { 545 mutex_exit(&pio->io_lock); 546 } 547} 548 549static void 550zio_inherit_child_errors(zio_t *zio, enum zio_child c) 551{ 552 if (zio->io_child_error[c] != 0 && zio->io_error == 0) 553 zio->io_error = zio->io_child_error[c]; 554} 555 556/* 557 
* ========================================================================== 558 * Create the various types of I/O (read, write, free, etc) 559 * ========================================================================== 560 */ 561static zio_t * 562zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 563 void *data, uint64_t size, zio_done_func_t *done, void *private, 564 zio_type_t type, zio_priority_t priority, enum zio_flag flags, 565 vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, 566 enum zio_stage stage, enum zio_stage pipeline) 567{ 568 zio_t *zio; 569 570 ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE); 571 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 572 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 573 574 ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 575 ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 576 ASSERT(vd || stage == ZIO_STAGE_OPEN); 577 578 zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 579 bzero(zio, sizeof (zio_t)); 580 581 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 582 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 583 584 list_create(&zio->io_parent_list, sizeof (zio_link_t), 585 offsetof(zio_link_t, zl_parent_node)); 586 list_create(&zio->io_child_list, sizeof (zio_link_t), 587 offsetof(zio_link_t, zl_child_node)); 588 589 if (vd != NULL) 590 zio->io_child_type = ZIO_CHILD_VDEV; 591 else if (flags & ZIO_FLAG_GANG_CHILD) 592 zio->io_child_type = ZIO_CHILD_GANG; 593 else if (flags & ZIO_FLAG_DDT_CHILD) 594 zio->io_child_type = ZIO_CHILD_DDT; 595 else 596 zio->io_child_type = ZIO_CHILD_LOGICAL; 597 598 if (bp != NULL) { 599 zio->io_bp = (blkptr_t *)bp; 600 zio->io_bp_copy = *bp; 601 zio->io_bp_orig = *bp; 602 if (type != ZIO_TYPE_WRITE || 603 zio->io_child_type == ZIO_CHILD_DDT) 604 zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 605 if (zio->io_child_type == ZIO_CHILD_LOGICAL) 606 zio->io_logical = zio; 607 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 608 pipeline |= ZIO_GANG_STAGES; 609 } 610 611 zio->io_spa = spa; 612 zio->io_txg = txg; 613 zio->io_done = done; 614 zio->io_private = private; 615 zio->io_type = type; 616 zio->io_priority = priority; 617 zio->io_vd = vd; 618 zio->io_offset = offset; 619 zio->io_orig_data = zio->io_data = data; 620 zio->io_orig_size = zio->io_size = size; 621 zio->io_orig_flags = zio->io_flags = flags; 622 zio->io_orig_stage = zio->io_stage = stage; 623 zio->io_orig_pipeline = zio->io_pipeline = pipeline; 624 625 zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 626 zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 627 628 if (zb != NULL) 629 zio->io_bookmark = *zb; 630 631 if (pio != NULL) { 632 if (zio->io_logical == NULL) 633 zio->io_logical = pio->io_logical; 634 if (zio->io_child_type == ZIO_CHILD_GANG) 635 zio->io_gang_leader = pio->io_gang_leader; 636 zio_add_child(pio, zio); 637 } 638 639 return (zio); 640} 641 642static void 643zio_destroy(zio_t *zio) 644{ 645 list_destroy(&zio->io_parent_list); 646 list_destroy(&zio->io_child_list); 647 mutex_destroy(&zio->io_lock); 648 cv_destroy(&zio->io_cv); 649 kmem_cache_free(zio_cache, zio); 650} 651 652zio_t * 653zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, 654 void *private, enum zio_flag flags) 655{ 656 zio_t *zio; 657 658 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 659 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 660 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 661 662 return (zio); 663} 664 665zio_t * 666zio_root(spa_t *spa, 
zio_done_func_t *done, void *private, enum zio_flag flags) 667{ 668 return (zio_null(NULL, spa, NULL, done, private, flags)); 669} 670 671void 672zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) 673{ 674 if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { 675 zfs_panic_recover("blkptr at %p has invalid TYPE %llu", 676 bp, (longlong_t)BP_GET_TYPE(bp)); 677 } 678 if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || 679 BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { 680 zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", 681 bp, (longlong_t)BP_GET_CHECKSUM(bp)); 682 } 683 if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || 684 BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { 685 zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", 686 bp, (longlong_t)BP_GET_COMPRESS(bp)); 687 } 688 if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { 689 zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", 690 bp, (longlong_t)BP_GET_LSIZE(bp)); 691 } 692 if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { 693 zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", 694 bp, (longlong_t)BP_GET_PSIZE(bp)); 695 } 696 697 if (BP_IS_EMBEDDED(bp)) { 698 if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { 699 zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", 700 bp, (longlong_t)BPE_GET_ETYPE(bp)); 701 } 702 } 703 704 /* 705 * Pool-specific checks. 706 * 707 * Note: it would be nice to verify that the blk_birth and 708 * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() 709 * allows the birth time of log blocks (and dmu_sync()-ed blocks 710 * that are in the log) to be arbitrarily large. 711 */ 712 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 713 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); 714 if (vdevid >= spa->spa_root_vdev->vdev_children) { 715 zfs_panic_recover("blkptr at %p DVA %u has invalid " 716 "VDEV %llu", 717 bp, i, (longlong_t)vdevid); 718 continue; 719 } 720 vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; 721 if (vd == NULL) { 722 zfs_panic_recover("blkptr at %p DVA %u has invalid " 723 "VDEV %llu", 724 bp, i, (longlong_t)vdevid); 725 continue; 726 } 727 if (vd->vdev_ops == &vdev_hole_ops) { 728 zfs_panic_recover("blkptr at %p DVA %u has hole " 729 "VDEV %llu", 730 bp, i, (longlong_t)vdevid); 731 continue; 732 } 733 if (vd->vdev_ops == &vdev_missing_ops) { 734 /* 735 * "missing" vdevs are valid during import, but we 736 * don't have their detailed info (e.g. asize), so 737 * we can't perform any more checks on them. 738 */ 739 continue; 740 } 741 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 742 uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); 743 if (BP_IS_GANG(bp)) 744 asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 745 if (offset + asize > vd->vdev_asize) { 746 zfs_panic_recover("blkptr at %p DVA %u has invalid " 747 "OFFSET %llu", 748 bp, i, (longlong_t)offset); 749 } 750 } 751} 752 753zio_t * 754zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 755 void *data, uint64_t size, zio_done_func_t *done, void *private, 756 zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) 757{ 758 zio_t *zio; 759 760 zfs_blkptr_verify(spa, bp); 761 762 zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 763 data, size, done, private, 764 ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 765 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
766 ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 767 768 return (zio); 769} 770 771zio_t * 772zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 773 void *data, uint64_t size, const zio_prop_t *zp, 774 zio_done_func_t *ready, zio_done_func_t *children_ready, 775 zio_done_func_t *physdone, zio_done_func_t *done, 776 void *private, zio_priority_t priority, enum zio_flag flags, 777 const zbookmark_phys_t *zb) 778{ 779 zio_t *zio; 780 781 ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 782 zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 783 zp->zp_compress >= ZIO_COMPRESS_OFF && 784 zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 785 DMU_OT_IS_VALID(zp->zp_type) && 786 zp->zp_level < 32 && 787 zp->zp_copies > 0 && 788 zp->zp_copies <= spa_max_replication(spa)); 789 790 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 791 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 792 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 793 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 794 795 zio->io_ready = ready; 796 zio->io_children_ready = children_ready; 797 zio->io_physdone = physdone; 798 zio->io_prop = *zp; 799 800 /* 801 * Data can be NULL if we are going to call zio_write_override() to 802 * provide the already-allocated BP. But we may need the data to 803 * verify a dedup hit (if requested). In this case, don't try to 804 * dedup (just take the already-allocated BP verbatim). 805 */ 806 if (data == NULL && zio->io_prop.zp_dedup_verify) { 807 zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; 808 } 809 810 return (zio); 811} 812 813zio_t * 814zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, 815 uint64_t size, zio_done_func_t *done, void *private, 816 zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) 817{ 818 zio_t *zio; 819 820 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 821 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 822 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 823 824 return (zio); 825} 826 827void 828zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) 829{ 830 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 831 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 832 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 833 ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 834 835 /* 836 * We must reset the io_prop to match the values that existed 837 * when the bp was first written by dmu_sync() keeping in mind 838 * that nopwrite and dedup are mutually exclusive. 839 */ 840 zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; 841 zio->io_prop.zp_nopwrite = nopwrite; 842 zio->io_prop.zp_copies = copies; 843 zio->io_bp_override = bp; 844} 845 846void 847zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) 848{ 849 850 /* 851 * The check for EMBEDDED is a performance optimization. We 852 * process the free here (by ignoring it) rather than 853 * putting it on the list and then processing it in zio_free_sync(). 854 */ 855 if (BP_IS_EMBEDDED(bp)) 856 return; 857 metaslab_check_free(spa, bp); 858 859 /* 860 * Frees that are for the currently-syncing txg, are not going to be 861 * deferred, and which will not need to do a read (i.e. not GANG or 862 * DEDUP), can be processed immediately. Otherwise, put them on the 863 * in-memory list for later processing. 
864 */ 865 if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || 866 txg != spa->spa_syncing_txg || 867 spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { 868 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); 869 } else { 870 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 871 BP_GET_PSIZE(bp), 0))); 872 } 873} 874 875zio_t * 876zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 877 uint64_t size, enum zio_flag flags) 878{ 879 zio_t *zio; 880 enum zio_stage stage = ZIO_FREE_PIPELINE; 881 882 ASSERT(!BP_IS_HOLE(bp)); 883 ASSERT(spa_syncing_txg(spa) == txg); 884 ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); 885 886 if (BP_IS_EMBEDDED(bp)) 887 return (zio_null(pio, spa, NULL, NULL, NULL, 0)); 888 889 metaslab_check_free(spa, bp); 890 arc_freed(spa, bp); 891 892 if (zfs_trim_enabled) 893 stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | 894 ZIO_STAGE_VDEV_IO_ASSESS; 895 /* 896 * GANG and DEDUP blocks can induce a read (for the gang block header, 897 * or the DDT), so issue them asynchronously so that this thread is 898 * not tied up. 899 */ 900 else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) 901 stage |= ZIO_STAGE_ISSUE_ASYNC; 902 903 flags |= ZIO_FLAG_DONT_QUEUE; 904 905 zio = zio_create(pio, spa, txg, bp, NULL, size, 906 NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, 907 NULL, 0, NULL, ZIO_STAGE_OPEN, stage); 908 909 return (zio); 910} 911 912zio_t * 913zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 914 zio_done_func_t *done, void *private, enum zio_flag flags) 915{ 916 zio_t *zio; 917 918 dprintf_bp(bp, "claiming in txg %llu", txg); 919 920 if (BP_IS_EMBEDDED(bp)) 921 return (zio_null(pio, spa, NULL, NULL, NULL, 0)); 922 923 /* 924 * A claim is an allocation of a specific block. Claims are needed 925 * to support immediate writes in the intent log. The issue is that 926 * immediate writes contain committed data, but in a txg that was 927 * *not* committed. Upon opening the pool after an unclean shutdown, 928 * the intent log claims all blocks that contain immediate write data 929 * so that the SPA knows they're in use. 930 * 931 * All claims *must* be resolved in the first txg -- before the SPA 932 * starts allocating blocks -- so that nothing is allocated twice. 933 * If txg == 0 we just verify that the block is claimable. 
934 */ 935 ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 936 ASSERT(txg == spa_first_txg(spa) || txg == 0); 937 ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ 938 939 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 940 done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 941 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 942 943 return (zio); 944} 945 946zio_t * 947zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, 948 uint64_t size, zio_done_func_t *done, void *private, 949 zio_priority_t priority, enum zio_flag flags) 950{ 951 zio_t *zio; 952 int c; 953 954 if (vd->vdev_children == 0) { 955 zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private, 956 ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL, 957 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 958 959 zio->io_cmd = cmd; 960 } else { 961 zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 962 963 for (c = 0; c < vd->vdev_children; c++) 964 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 965 offset, size, done, private, priority, flags)); 966 } 967 968 return (zio); 969} 970 971zio_t * 972zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 973 void *data, int checksum, zio_done_func_t *done, void *private, 974 zio_priority_t priority, enum zio_flag flags, boolean_t labels) 975{ 976 zio_t *zio; 977 978 ASSERT(vd->vdev_children == 0); 979 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 980 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 981 ASSERT3U(offset + size, <=, vd->vdev_psize); 982 983 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 984 ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, 985 NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 986 987 zio->io_prop.zp_checksum = checksum; 988 989 return (zio); 990} 991 992zio_t * 993zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 994 void *data, int checksum, zio_done_func_t *done, void *private, 995 zio_priority_t priority, enum zio_flag flags, boolean_t labels) 996{ 997 zio_t *zio; 998 999 ASSERT(vd->vdev_children == 0); 1000 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 1001 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 1002 ASSERT3U(offset + size, <=, vd->vdev_psize); 1003 1004 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 1005 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, 1006 NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 1007 1008 zio->io_prop.zp_checksum = checksum; 1009 1010 if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { 1011 /* 1012 * zec checksums are necessarily destructive -- they modify 1013 * the end of the write buffer to hold the verifier/checksum. 1014 * Therefore, we must make a local copy in case the data is 1015 * being written to multiple places in parallel. 1016 */ 1017 void *wbuf = zio_buf_alloc(size); 1018 bcopy(data, wbuf, size); 1019 zio_push_transform(zio, wbuf, size, size, NULL); 1020 } 1021 1022 return (zio); 1023} 1024 1025/* 1026 * Create a child I/O to do some work for us. 1027 */ 1028zio_t * 1029zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 1030 void *data, uint64_t size, int type, zio_priority_t priority, 1031 enum zio_flag flags, zio_done_func_t *done, void *private) 1032{ 1033 enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 1034 zio_t *zio; 1035 1036 ASSERT(vd->vdev_parent == 1037 (pio->io_vd ? 
pio->io_vd : pio->io_spa->spa_root_vdev)); 1038 1039 if (type == ZIO_TYPE_READ && bp != NULL) { 1040 /* 1041 * If we have the bp, then the child should perform the 1042 * checksum and the parent need not. This pushes error 1043 * detection as close to the leaves as possible and 1044 * eliminates redundant checksums in the interior nodes. 1045 */ 1046 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 1047 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 1048 } 1049 1050 /* Not all IO types require vdev io done stage e.g. free */ 1051 if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE)) 1052 pipeline &= ~ZIO_STAGE_VDEV_IO_DONE; 1053 1054 if (vd->vdev_children == 0) 1055 offset += VDEV_LABEL_START_SIZE; 1056 1057 flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; 1058 1059 /* 1060 * If we've decided to do a repair, the write is not speculative -- 1061 * even if the original read was. 1062 */ 1063 if (flags & ZIO_FLAG_IO_REPAIR) 1064 flags &= ~ZIO_FLAG_SPECULATIVE; 1065 1066 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, 1067 done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 1068 ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 1069 1070 zio->io_physdone = pio->io_physdone; 1071 if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) 1072 zio->io_logical->io_phys_children++; 1073 1074 return (zio); 1075} 1076 1077zio_t * 1078zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, 1079 int type, zio_priority_t priority, enum zio_flag flags, 1080 zio_done_func_t *done, void *private) 1081{ 1082 zio_t *zio; 1083 1084 ASSERT(vd->vdev_ops->vdev_op_leaf); 1085 1086 zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 1087 data, size, done, private, type, priority, 1088 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, 1089 vd, offset, NULL, 1090 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 1091 1092 return (zio); 1093} 1094 1095void 1096zio_flush(zio_t *zio, vdev_t *vd) 1097{ 1098 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, 1099 NULL, NULL, ZIO_PRIORITY_NOW, 1100 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 1101} 1102 1103zio_t * 1104zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) 1105{ 1106 1107 ASSERT(vd->vdev_ops->vdev_op_leaf); 1108 1109 return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL, 1110 ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE | 1111 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, 1112 vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE)); 1113} 1114 1115void 1116zio_shrink(zio_t *zio, uint64_t size) 1117{ 1118 ASSERT(zio->io_executor == NULL); 1119 ASSERT(zio->io_orig_size == zio->io_size); 1120 ASSERT(size <= zio->io_size); 1121 1122 /* 1123 * We don't shrink for raidz because of problems with the 1124 * reconstruction when reading back less than the block size. 1125 * Note, BP_IS_RAIDZ() assumes no compression. 
1126 */ 1127 ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 1128 if (!BP_IS_RAIDZ(zio->io_bp)) 1129 zio->io_orig_size = zio->io_size = size; 1130} 1131 1132/* 1133 * ========================================================================== 1134 * Prepare to read and write logical blocks 1135 * ========================================================================== 1136 */ 1137 1138static int 1139zio_read_bp_init(zio_t *zio) 1140{ 1141 blkptr_t *bp = zio->io_bp; 1142 1143 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 1144 zio->io_child_type == ZIO_CHILD_LOGICAL && 1145 !(zio->io_flags & ZIO_FLAG_RAW)) { 1146 uint64_t psize = 1147 BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); 1148 void *cbuf = zio_buf_alloc(psize); 1149 1150 zio_push_transform(zio, cbuf, psize, psize, zio_decompress); 1151 } 1152 1153 if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { 1154 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1155 decode_embedded_bp_compressed(bp, zio->io_data); 1156 } else { 1157 ASSERT(!BP_IS_EMBEDDED(bp)); 1158 } 1159 1160 if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) 1161 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1162 1163 if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 1164 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1165 1166 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 1167 zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 1168 1169 return (ZIO_PIPELINE_CONTINUE); 1170} 1171 1172static int 1173zio_write_bp_init(zio_t *zio) 1174{ 1175 spa_t *spa = zio->io_spa; 1176 zio_prop_t *zp = &zio->io_prop; 1177 enum zio_compress compress = zp->zp_compress; 1178 blkptr_t *bp = zio->io_bp; 1179 uint64_t lsize = zio->io_size; 1180 uint64_t psize = lsize; 1181 int pass = 1; 1182 1183 /* 1184 * If our children haven't all reached the ready stage, 1185 * wait for them and then repeat this pipeline stage. 1186 */ 1187 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 1188 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) 1189 return (ZIO_PIPELINE_STOP); 1190 1191 if (!IO_IS_ALLOCATING(zio)) 1192 return (ZIO_PIPELINE_CONTINUE); 1193 1194 if (zio->io_children_ready != NULL) { 1195 /* 1196 * Now that all our children are ready, run the callback 1197 * associated with this zio in case it wants to modify the 1198 * data to be written. 1199 */ 1200 ASSERT3U(zp->zp_level, >, 0); 1201 zio->io_children_ready(zio); 1202 } 1203 1204 ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 1205 1206 if (zio->io_bp_override) { 1207 ASSERT(bp->blk_birth != zio->io_txg); 1208 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 1209 1210 *bp = *zio->io_bp_override; 1211 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1212 1213 if (BP_IS_EMBEDDED(bp)) 1214 return (ZIO_PIPELINE_CONTINUE); 1215 1216 /* 1217 * If we've been overridden and nopwrite is set then 1218 * set the flag accordingly to indicate that a nopwrite 1219 * has already occurred. 
1220 */ 1221 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { 1222 ASSERT(!zp->zp_dedup); 1223 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1224 return (ZIO_PIPELINE_CONTINUE); 1225 } 1226 1227 ASSERT(!zp->zp_nopwrite); 1228 1229 if (BP_IS_HOLE(bp) || !zp->zp_dedup) 1230 return (ZIO_PIPELINE_CONTINUE); 1231 1232 ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & 1233 ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); 1234 1235 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 1236 BP_SET_DEDUP(bp, 1); 1237 zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; 1238 return (ZIO_PIPELINE_CONTINUE); 1239 } 1240 zio->io_bp_override = NULL; 1241 BP_ZERO(bp); 1242 } 1243 1244 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { 1245 /* 1246 * We're rewriting an existing block, which means we're 1247 * working on behalf of spa_sync(). For spa_sync() to 1248 * converge, it must eventually be the case that we don't 1249 * have to allocate new blocks. But compression changes 1250 * the blocksize, which forces a reallocate, and makes 1251 * convergence take longer. Therefore, after the first 1252 * few passes, stop compressing to ensure convergence. 1253 */ 1254 pass = spa_sync_pass(spa); 1255 1256 ASSERT(zio->io_txg == spa_syncing_txg(spa)); 1257 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1258 ASSERT(!BP_GET_DEDUP(bp)); 1259 1260 if (pass >= zfs_sync_pass_dont_compress) 1261 compress = ZIO_COMPRESS_OFF; 1262 1263 /* Make sure someone doesn't change their mind on overwrites */ 1264 ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), 1265 spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 1266 } 1267 1268 if (compress != ZIO_COMPRESS_OFF) { 1269 void *cbuf = zio_buf_alloc(lsize); 1270 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); 1271 if (psize == 0 || psize == lsize) { 1272 compress = ZIO_COMPRESS_OFF; 1273 zio_buf_free(cbuf, lsize); 1274 } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && 1275 zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && 1276 spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { 1277 encode_embedded_bp_compressed(bp, 1278 cbuf, compress, lsize, psize); 1279 BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); 1280 BP_SET_TYPE(bp, zio->io_prop.zp_type); 1281 BP_SET_LEVEL(bp, zio->io_prop.zp_level); 1282 zio_buf_free(cbuf, lsize); 1283 bp->blk_birth = zio->io_txg; 1284 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1285 ASSERT(spa_feature_is_active(spa, 1286 SPA_FEATURE_EMBEDDED_DATA)); 1287 return (ZIO_PIPELINE_CONTINUE); 1288 } else { 1289 /* 1290 * Round up compressed size up to the ashift 1291 * of the smallest-ashift device, and zero the tail. 1292 * This ensures that the compressed size of the BP 1293 * (and thus compressratio property) are correct, 1294 * in that we charge for the padding used to fill out 1295 * the last sector. 1296 */ 1297 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 1298 size_t rounded = (size_t)P2ROUNDUP(psize, 1299 1ULL << spa->spa_min_ashift); 1300 if (rounded >= lsize) { 1301 compress = ZIO_COMPRESS_OFF; 1302 zio_buf_free(cbuf, lsize); 1303 psize = lsize; 1304 } else { 1305 bzero((char *)cbuf + psize, rounded - psize); 1306 psize = rounded; 1307 zio_push_transform(zio, cbuf, 1308 psize, lsize, NULL); 1309 } 1310 } 1311 } 1312 1313 /* 1314 * The final pass of spa_sync() must be all rewrites, but the first 1315 * few passes offer a trade-off: allocating blocks defers convergence, 1316 * but newly allocated blocks are sequential, so they can be written 1317 * to disk faster. 
Therefore, we allow the first few passes of 1318 * spa_sync() to allocate new blocks, but force rewrites after that. 1319 * There should only be a handful of blocks after pass 1 in any case. 1320 */ 1321 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && 1322 BP_GET_PSIZE(bp) == psize && 1323 pass >= zfs_sync_pass_rewrite) { 1324 ASSERT(psize != 0); 1325 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 1326 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; 1327 zio->io_flags |= ZIO_FLAG_IO_REWRITE; 1328 } else { 1329 BP_ZERO(bp); 1330 zio->io_pipeline = ZIO_WRITE_PIPELINE; 1331 } 1332 1333 if (psize == 0) { 1334 if (zio->io_bp_orig.blk_birth != 0 && 1335 spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { 1336 BP_SET_LSIZE(bp, lsize); 1337 BP_SET_TYPE(bp, zp->zp_type); 1338 BP_SET_LEVEL(bp, zp->zp_level); 1339 BP_SET_BIRTH(bp, zio->io_txg, 0); 1340 } 1341 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1342 } else { 1343 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 1344 BP_SET_LSIZE(bp, lsize); 1345 BP_SET_TYPE(bp, zp->zp_type); 1346 BP_SET_LEVEL(bp, zp->zp_level); 1347 BP_SET_PSIZE(bp, psize); 1348 BP_SET_COMPRESS(bp, compress); 1349 BP_SET_CHECKSUM(bp, zp->zp_checksum); 1350 BP_SET_DEDUP(bp, zp->zp_dedup); 1351 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1352 if (zp->zp_dedup) { 1353 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1354 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1355 zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 1356 } 1357 if (zp->zp_nopwrite) { 1358 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1359 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1360 zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; 1361 } 1362 } 1363 1364 return (ZIO_PIPELINE_CONTINUE); 1365} 1366 1367static int 1368zio_free_bp_init(zio_t *zio) 1369{ 1370 blkptr_t *bp = zio->io_bp; 1371 1372 if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 1373 if (BP_GET_DEDUP(bp)) 1374 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 1375 } 1376 1377 return (ZIO_PIPELINE_CONTINUE); 1378} 1379 1380/* 1381 * ========================================================================== 1382 * Execute the I/O pipeline 1383 * ========================================================================== 1384 */ 1385 1386static void 1387zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) 1388{ 1389 spa_t *spa = zio->io_spa; 1390 zio_type_t t = zio->io_type; 1391 int flags = (cutinline ? TQ_FRONT : 0); 1392 1393 ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT); 1394 1395 /* 1396 * If we're a config writer or a probe, the normal issue and 1397 * interrupt threads may all be blocked waiting for the config lock. 1398 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 1399 */ 1400 if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 1401 t = ZIO_TYPE_NULL; 1402 1403 /* 1404 * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 1405 */ 1406 if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 1407 t = ZIO_TYPE_NULL; 1408 1409 /* 1410 * If this is a high priority I/O, then use the high priority taskq if 1411 * available. 1412 */ 1413 if (zio->io_priority == ZIO_PRIORITY_NOW && 1414 spa->spa_zio_taskq[t][q + 1].stqs_count != 0) 1415 q++; 1416 1417 ASSERT3U(q, <, ZIO_TASKQ_TYPES); 1418 1419 /* 1420 * NB: We are assuming that the zio can only be dispatched 1421 * to a single taskq at a time. It would be a grievous error 1422 * to dispatch the zio to another taskq at the same time. 
1423 */ 1424#if defined(illumos) || !defined(_KERNEL) 1425 ASSERT(zio->io_tqent.tqent_next == NULL); 1426#else 1427 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 1428#endif 1429 spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, 1430 flags, &zio->io_tqent); 1431} 1432 1433static boolean_t 1434zio_taskq_member(zio_t *zio, zio_taskq_type_t q) 1435{ 1436 kthread_t *executor = zio->io_executor; 1437 spa_t *spa = zio->io_spa; 1438 1439 for (zio_type_t t = 0; t < ZIO_TYPES; t++) { 1440 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1441 uint_t i; 1442 for (i = 0; i < tqs->stqs_count; i++) { 1443 if (taskq_member(tqs->stqs_taskq[i], executor)) 1444 return (B_TRUE); 1445 } 1446 } 1447 1448 return (B_FALSE); 1449} 1450 1451static int 1452zio_issue_async(zio_t *zio) 1453{ 1454 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1455 1456 return (ZIO_PIPELINE_STOP); 1457} 1458 1459void 1460zio_interrupt(zio_t *zio) 1461{ 1462 zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); 1463} 1464 1465void 1466zio_delay_interrupt(zio_t *zio) 1467{ 1468 /* 1469 * The timeout_generic() function isn't defined in userspace, so 1470 * rather than trying to implement the function, the zio delay 1471 * functionality has been disabled for userspace builds. 1472 */ 1473 1474#ifdef _KERNEL 1475 /* 1476 * If io_target_timestamp is zero, then no delay has been registered 1477 * for this IO, thus jump to the end of this function and "skip" the 1478 * delay; issuing it directly to the zio layer. 1479 */ 1480 if (zio->io_target_timestamp != 0) { 1481 hrtime_t now = gethrtime(); 1482 1483 if (now >= zio->io_target_timestamp) { 1484 /* 1485 * This IO has already taken longer than the target 1486 * delay to complete, so we don't want to delay it 1487 * any longer; we "miss" the delay and issue it 1488 * directly to the zio layer. This is likely due to 1489 * the target latency being set to a value less than 1490 * the underlying hardware can satisfy (e.g. delay 1491 * set to 1ms, but the disks take 10ms to complete an 1492 * IO request). 1493 */ 1494 1495 DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, 1496 hrtime_t, now); 1497 1498 zio_interrupt(zio); 1499 } else { 1500 hrtime_t diff = zio->io_target_timestamp - now; 1501 1502 DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, 1503 hrtime_t, now, hrtime_t, diff); 1504 1505 (void) timeout_generic(CALLOUT_NORMAL, 1506 (void (*)(void *))zio_interrupt, zio, diff, 1, 0); 1507 } 1508 1509 return; 1510 } 1511#endif 1512 1513 DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); 1514 zio_interrupt(zio); 1515} 1516 1517/* 1518 * Execute the I/O pipeline until one of the following occurs: 1519 * 1520 * (1) the I/O completes 1521 * (2) the pipeline stalls waiting for dependent child I/Os 1522 * (3) the I/O issues, so we're waiting for an I/O completion interrupt 1523 * (4) the I/O is delegated by vdev-level caching or aggregation 1524 * (5) the I/O is deferred due to vdev-level queueing 1525 * (6) the I/O is handed off to another thread. 1526 * 1527 * In all cases, the pipeline stops whenever there's no CPU work; it never 1528 * burns a thread in cv_wait(). 1529 * 1530 * There's no locking on io_stage because there's no legitimate way 1531 * for multiple threads to be attempting to process the same I/O. 
1532 */ 1533static zio_pipe_stage_t *zio_pipeline[]; 1534 1535void 1536zio_execute(zio_t *zio) 1537{ 1538 zio->io_executor = curthread; 1539 1540 while (zio->io_stage < ZIO_STAGE_DONE) { 1541 enum zio_stage pipeline = zio->io_pipeline; 1542 enum zio_stage stage = zio->io_stage; 1543 int rv; 1544 1545 ASSERT(!MUTEX_HELD(&zio->io_lock)); 1546 ASSERT(ISP2(stage)); 1547 ASSERT(zio->io_stall == NULL); 1548 1549 do { 1550 stage <<= 1; 1551 } while ((stage & pipeline) == 0); 1552 1553 ASSERT(stage <= ZIO_STAGE_DONE); 1554 1555 /* 1556 * If we are in interrupt context and this pipeline stage 1557 * will grab a config lock that is held across I/O, 1558 * or may wait for an I/O that needs an interrupt thread 1559 * to complete, issue async to avoid deadlock. 1560 * 1561 * For VDEV_IO_START, we cut in line so that the io will 1562 * be sent to disk promptly. 1563 */ 1564 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 1565 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 1566 boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 1567 zio_requeue_io_start_cut_in_line : B_FALSE; 1568 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 1569 return; 1570 } 1571 1572 zio->io_stage = stage; 1573 rv = zio_pipeline[highbit64(stage) - 1](zio); 1574 1575 if (rv == ZIO_PIPELINE_STOP) 1576 return; 1577 1578 ASSERT(rv == ZIO_PIPELINE_CONTINUE); 1579 } 1580} 1581 1582/* 1583 * ========================================================================== 1584 * Initiate I/O, either sync or async 1585 * ========================================================================== 1586 */ 1587int 1588zio_wait(zio_t *zio) 1589{ 1590 int error; 1591 1592 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 1593 ASSERT(zio->io_executor == NULL); 1594 1595 zio->io_waiter = curthread; 1596 1597 zio_execute(zio); 1598 1599 mutex_enter(&zio->io_lock); 1600 while (zio->io_executor != NULL) 1601 cv_wait(&zio->io_cv, &zio->io_lock); 1602 mutex_exit(&zio->io_lock); 1603 1604 error = zio->io_error; 1605 zio_destroy(zio); 1606 1607 return (error); 1608} 1609 1610void 1611zio_nowait(zio_t *zio) 1612{ 1613 ASSERT(zio->io_executor == NULL); 1614 1615 if (zio->io_child_type == ZIO_CHILD_LOGICAL && 1616 zio_unique_parent(zio) == NULL) { 1617 /* 1618 * This is a logical async I/O with no parent to wait for it. 1619 * We add it to the spa_async_root_zio "Godfather" I/O which 1620 * will ensure they complete prior to unloading the pool. 
1621 */ 1622 spa_t *spa = zio->io_spa; 1623 1624 zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); 1625 } 1626 1627 zio_execute(zio); 1628} 1629 1630/* 1631 * ========================================================================== 1632 * Reexecute or suspend/resume failed I/O 1633 * ========================================================================== 1634 */ 1635 1636static void 1637zio_reexecute(zio_t *pio) 1638{ 1639 zio_t *cio, *cio_next; 1640 1641 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 1642 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 1643 ASSERT(pio->io_gang_leader == NULL); 1644 ASSERT(pio->io_gang_tree == NULL); 1645 1646 pio->io_flags = pio->io_orig_flags; 1647 pio->io_stage = pio->io_orig_stage; 1648 pio->io_pipeline = pio->io_orig_pipeline; 1649 pio->io_reexecute = 0; 1650 pio->io_flags |= ZIO_FLAG_REEXECUTED; 1651 pio->io_error = 0; 1652 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1653 pio->io_state[w] = 0; 1654 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 1655 pio->io_child_error[c] = 0; 1656 1657 if (IO_IS_ALLOCATING(pio)) 1658 BP_ZERO(pio->io_bp); 1659 1660 /* 1661 * As we reexecute pio's children, new children could be created. 1662 * New children go to the head of pio's io_child_list, however, 1663 * so we will (correctly) not reexecute them. The key is that 1664 * the remainder of pio's io_child_list, from 'cio_next' onward, 1665 * cannot be affected by any side effects of reexecuting 'cio'. 1666 */ 1667 for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { 1668 cio_next = zio_walk_children(pio); 1669 mutex_enter(&pio->io_lock); 1670 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1671 pio->io_children[cio->io_child_type][w]++; 1672 mutex_exit(&pio->io_lock); 1673 zio_reexecute(cio); 1674 } 1675 1676 /* 1677 * Now that all children have been reexecuted, execute the parent. 1678 * We don't reexecute "The Godfather" I/O here as it's the 1679 * responsibility of the caller to wait on him. 1680 */ 1681 if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) 1682 zio_execute(pio); 1683} 1684 1685void 1686zio_suspend(spa_t *spa, zio_t *zio) 1687{ 1688 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 1689 fm_panic("Pool '%s' has encountered an uncorrectable I/O " 1690 "failure and the failure mode property for this pool " 1691 "is set to panic.", spa_name(spa)); 1692 1693 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 1694 1695 mutex_enter(&spa->spa_suspend_lock); 1696 1697 if (spa->spa_suspend_zio_root == NULL) 1698 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 1699 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 1700 ZIO_FLAG_GODFATHER); 1701 1702 spa->spa_suspended = B_TRUE; 1703 1704 if (zio != NULL) { 1705 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 1706 ASSERT(zio != spa->spa_suspend_zio_root); 1707 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1708 ASSERT(zio_unique_parent(zio) == NULL); 1709 ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1710 zio_add_child(spa->spa_suspend_zio_root, zio); 1711 } 1712 1713 mutex_exit(&spa->spa_suspend_lock); 1714} 1715 1716int 1717zio_resume(spa_t *spa) 1718{ 1719 zio_t *pio; 1720 1721 /* 1722 * Reexecute all previously suspended i/o. 
1723 */ 1724 mutex_enter(&spa->spa_suspend_lock); 1725 spa->spa_suspended = B_FALSE; 1726 cv_broadcast(&spa->spa_suspend_cv); 1727 pio = spa->spa_suspend_zio_root; 1728 spa->spa_suspend_zio_root = NULL; 1729 mutex_exit(&spa->spa_suspend_lock); 1730 1731 if (pio == NULL) 1732 return (0); 1733 1734 zio_reexecute(pio); 1735 return (zio_wait(pio)); 1736} 1737 1738void 1739zio_resume_wait(spa_t *spa) 1740{ 1741 mutex_enter(&spa->spa_suspend_lock); 1742 while (spa_suspended(spa)) 1743 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 1744 mutex_exit(&spa->spa_suspend_lock); 1745} 1746 1747/* 1748 * ========================================================================== 1749 * Gang blocks. 1750 * 1751 * A gang block is a collection of small blocks that looks to the DMU 1752 * like one large block. When zio_dva_allocate() cannot find a block 1753 * of the requested size, due to either severe fragmentation or the pool 1754 * being nearly full, it calls zio_write_gang_block() to construct the 1755 * block from smaller fragments. 1756 * 1757 * A gang block consists of a gang header (zio_gbh_phys_t) and up to 1758 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like 1759 * an indirect block: it's an array of block pointers. It consumes 1760 * only one sector and hence is allocatable regardless of fragmentation. 1761 * The gang header's bps point to its gang members, which hold the data. 1762 * 1763 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 1764 * as the verifier to ensure uniqueness of the SHA256 checksum. 1765 * Critically, the gang block bp's blk_cksum is the checksum of the data, 1766 * not the gang header. This ensures that data block signatures (needed for 1767 * deduplication) are independent of how the block is physically stored. 1768 * 1769 * Gang blocks can be nested: a gang member may itself be a gang block. 1770 * Thus every gang block is a tree in which root and all interior nodes are 1771 * gang headers, and the leaves are normal blocks that contain user data. 1772 * The root of the gang tree is called the gang leader. 1773 * 1774 * To perform any operation (read, rewrite, free, claim) on a gang block, 1775 * zio_gang_assemble() first assembles the gang tree (minus data leaves) 1776 * in the io_gang_tree field of the original logical i/o by recursively 1777 * reading the gang leader and all gang headers below it. This yields 1778 * an in-core tree containing the contents of every gang header and the 1779 * bps for every constituent of the gang block. 1780 * 1781 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 1782 * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 1783 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 1784 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 1785 * zio_read_gang() is a wrapper around zio_read() that omits reading gang 1786 * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 1787 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 1788 * of the gang header plus zio_checksum_compute() of the data to update the 1789 * gang header's blk_cksum as described above. 1790 * 1791 * The two-phase assemble/issue model solves the problem of partial failure -- 1792 * what if you'd freed part of a gang block but then couldn't read the 1793 * gang header for another part? 
Assembling the entire gang tree first 1794 * ensures that all the necessary gang header I/O has succeeded before 1795 * starting the actual work of free, claim, or write. Once the gang tree 1796 * is assembled, free and claim are in-memory operations that cannot fail. 1797 * 1798 * In the event that a gang write fails, zio_dva_unallocate() walks the 1799 * gang tree to immediately free (i.e. insert back into the space map) 1800 * everything we've allocated. This ensures that we don't get ENOSPC 1801 * errors during repeated suspend/resume cycles due to a flaky device. 1802 * 1803 * Gang rewrites only happen during sync-to-convergence. If we can't assemble 1804 * the gang tree, we won't modify the block, so we can safely defer the free 1805 * (knowing that the block is still intact). If we *can* assemble the gang 1806 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 1807 * each constituent bp and we can allocate a new block on the next sync pass. 1808 * 1809 * In all cases, the gang tree allows complete recovery from partial failure. 1810 * ========================================================================== 1811 */ 1812 1813static zio_t * 1814zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1815{ 1816 if (gn != NULL) 1817 return (pio); 1818 1819 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 1820 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1821 &pio->io_bookmark)); 1822} 1823 1824zio_t * 1825zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1826{ 1827 zio_t *zio; 1828 1829 if (gn != NULL) { 1830 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1831 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 1832 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1833 /* 1834 * As we rewrite each gang header, the pipeline will compute 1835 * a new gang block header checksum for it; but no one will 1836 * compute a new data checksum, so we do that here. The one 1837 * exception is the gang leader: the pipeline already computed 1838 * its data checksum because that stage precedes gang assembly. 1839 * (Presently, nothing actually uses interior data checksums; 1840 * this is just good hygiene.) 1841 */ 1842 if (gn != pio->io_gang_leader->io_gang_tree) { 1843 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 1844 data, BP_GET_PSIZE(bp)); 1845 } 1846 /* 1847 * If we are here to damage data for testing purposes, 1848 * leave the GBH alone so that we can detect the damage. 1849 */ 1850 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 1851 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 1852 } else { 1853 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1854 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 1855 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1856 } 1857 1858 return (zio); 1859} 1860 1861/* ARGSUSED */ 1862zio_t * 1863zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1864{ 1865 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 1866 BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), 1867 ZIO_GANG_CHILD_FLAGS(pio))); 1868} 1869 1870/* ARGSUSED */ 1871zio_t * 1872zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1873{ 1874 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 1875 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 1876} 1877 1878static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 1879 NULL, 1880 zio_read_gang, 1881 zio_rewrite_gang, 1882 zio_free_gang, 1883 zio_claim_gang, 1884 NULL 1885}; 1886 1887static void zio_gang_tree_assemble_done(zio_t *zio); 1888 1889static zio_gang_node_t * 1890zio_gang_node_alloc(zio_gang_node_t **gnpp) 1891{ 1892 zio_gang_node_t *gn; 1893 1894 ASSERT(*gnpp == NULL); 1895 1896 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 1897 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 1898 *gnpp = gn; 1899 1900 return (gn); 1901} 1902 1903static void 1904zio_gang_node_free(zio_gang_node_t **gnpp) 1905{ 1906 zio_gang_node_t *gn = *gnpp; 1907 1908 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1909 ASSERT(gn->gn_child[g] == NULL); 1910 1911 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 1912 kmem_free(gn, sizeof (*gn)); 1913 *gnpp = NULL; 1914} 1915 1916static void 1917zio_gang_tree_free(zio_gang_node_t **gnpp) 1918{ 1919 zio_gang_node_t *gn = *gnpp; 1920 1921 if (gn == NULL) 1922 return; 1923 1924 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1925 zio_gang_tree_free(&gn->gn_child[g]); 1926 1927 zio_gang_node_free(gnpp); 1928} 1929 1930static void 1931zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 1932{ 1933 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 1934 1935 ASSERT(gio->io_gang_leader == gio); 1936 ASSERT(BP_IS_GANG(bp)); 1937 1938 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 1939 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 1940 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 1941} 1942 1943static void 1944zio_gang_tree_assemble_done(zio_t *zio) 1945{ 1946 zio_t *gio = zio->io_gang_leader; 1947 zio_gang_node_t *gn = zio->io_private; 1948 blkptr_t *bp = zio->io_bp; 1949 1950 ASSERT(gio == zio_unique_parent(zio)); 1951 ASSERT(zio->io_child_count == 0); 1952 1953 if (zio->io_error) 1954 return; 1955 1956 if (BP_SHOULD_BYTESWAP(bp)) 1957 byteswap_uint64_array(zio->io_data, zio->io_size); 1958 1959 ASSERT(zio->io_data == gn->gn_gbh); 1960 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1961 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1962 1963 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1964 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1965 if (!BP_IS_GANG(gbp)) 1966 continue; 1967 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 1968 } 1969} 1970 1971static void 1972zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1973{ 1974 zio_t *gio = pio->io_gang_leader; 1975 zio_t *zio; 1976 1977 ASSERT(BP_IS_GANG(bp) == !!gn); 1978 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 1979 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 1980 1981 /* 1982 * If you're a gang header, your data is in gn->gn_gbh. 1983 * If you're a gang member, your data is in 'data' and gn == NULL. 
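 * Either way, the per-type issue function below maps this bp onto the
 * matching child i/o (read, rewrite, free or claim), and 'data' is
 * advanced past each member's psize as the walk descends the tree.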
1984 */ 1985 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1986 1987 if (gn != NULL) { 1988 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1989 1990 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1991 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1992 if (BP_IS_HOLE(gbp)) 1993 continue; 1994 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 1995 data = (char *)data + BP_GET_PSIZE(gbp); 1996 } 1997 } 1998 1999 if (gn == gio->io_gang_tree && gio->io_data != NULL) 2000 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 2001 2002 if (zio != pio) 2003 zio_nowait(zio); 2004} 2005 2006static int 2007zio_gang_assemble(zio_t *zio) 2008{ 2009 blkptr_t *bp = zio->io_bp; 2010 2011 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 2012 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2013 2014 zio->io_gang_leader = zio; 2015 2016 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 2017 2018 return (ZIO_PIPELINE_CONTINUE); 2019} 2020 2021static int 2022zio_gang_issue(zio_t *zio) 2023{ 2024 blkptr_t *bp = zio->io_bp; 2025 2026 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 2027 return (ZIO_PIPELINE_STOP); 2028 2029 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 2030 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2031 2032 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 2033 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 2034 else 2035 zio_gang_tree_free(&zio->io_gang_tree); 2036 2037 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2038 2039 return (ZIO_PIPELINE_CONTINUE); 2040} 2041 2042static void 2043zio_write_gang_member_ready(zio_t *zio) 2044{ 2045 zio_t *pio = zio_unique_parent(zio); 2046 zio_t *gio = zio->io_gang_leader; 2047 dva_t *cdva = zio->io_bp->blk_dva; 2048 dva_t *pdva = pio->io_bp->blk_dva; 2049 uint64_t asize; 2050 2051 if (BP_IS_HOLE(zio->io_bp)) 2052 return; 2053 2054 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 2055 2056 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 2057 ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 2058 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 2059 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 2060 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 2061 2062 mutex_enter(&pio->io_lock); 2063 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 2064 ASSERT(DVA_GET_GANG(&pdva[d])); 2065 asize = DVA_GET_ASIZE(&pdva[d]); 2066 asize += DVA_GET_ASIZE(&cdva[d]); 2067 DVA_SET_ASIZE(&pdva[d], asize); 2068 } 2069 mutex_exit(&pio->io_lock); 2070} 2071 2072static int 2073zio_write_gang_block(zio_t *pio) 2074{ 2075 spa_t *spa = pio->io_spa; 2076 blkptr_t *bp = pio->io_bp; 2077 zio_t *gio = pio->io_gang_leader; 2078 zio_t *zio; 2079 zio_gang_node_t *gn, **gnpp; 2080 zio_gbh_phys_t *gbh; 2081 uint64_t txg = pio->io_txg; 2082 uint64_t resid = pio->io_size; 2083 uint64_t lsize; 2084 int copies = gio->io_prop.zp_copies; 2085 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 2086 zio_prop_t zp; 2087 int error; 2088 2089 error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 2090 bp, gbh_copies, txg, pio == gio ? 
NULL : gio->io_bp, 2091 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 2092 if (error) { 2093 pio->io_error = error; 2094 return (ZIO_PIPELINE_CONTINUE); 2095 } 2096 2097 if (pio == gio) { 2098 gnpp = &gio->io_gang_tree; 2099 } else { 2100 gnpp = pio->io_private; 2101 ASSERT(pio->io_ready == zio_write_gang_member_ready); 2102 } 2103 2104 gn = zio_gang_node_alloc(gnpp); 2105 gbh = gn->gn_gbh; 2106 bzero(gbh, SPA_GANGBLOCKSIZE); 2107 2108 /* 2109 * Create the gang header. 2110 */ 2111 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 2112 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2113 2114 /* 2115 * Create and nowait the gang children. 2116 */ 2117 for (int g = 0; resid != 0; resid -= lsize, g++) { 2118 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 2119 SPA_MINBLOCKSIZE); 2120 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 2121 2122 zp.zp_checksum = gio->io_prop.zp_checksum; 2123 zp.zp_compress = ZIO_COMPRESS_OFF; 2124 zp.zp_type = DMU_OT_NONE; 2125 zp.zp_level = 0; 2126 zp.zp_copies = gio->io_prop.zp_copies; 2127 zp.zp_dedup = B_FALSE; 2128 zp.zp_dedup_verify = B_FALSE; 2129 zp.zp_nopwrite = B_FALSE; 2130 2131 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 2132 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 2133 zio_write_gang_member_ready, NULL, NULL, NULL, 2134 &gn->gn_child[g], pio->io_priority, 2135 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); 2136 } 2137 2138 /* 2139 * Set pio's pipeline to just wait for zio to finish. 2140 */ 2141 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2142 2143 zio_nowait(zio); 2144 2145 return (ZIO_PIPELINE_CONTINUE); 2146} 2147 2148/* 2149 * The zio_nop_write stage in the pipeline determines if allocating a 2150 * new bp is necessary. The nopwrite feature can handle writes in 2151 * either syncing or open context (i.e. zil writes) and as a result is 2152 * mutually exclusive with dedup. 2153 * 2154 * By leveraging a cryptographically secure checksum, such as SHA256, we 2155 * can compare the checksums of the new data and the old to determine if 2156 * allocating a new block is required. Note that our requirements for 2157 * cryptographic strength are fairly weak: there can't be any accidental 2158 * hash collisions, but we don't need to be secure against intentional 2159 * (malicious) collisions. To trigger a nopwrite, you have to be able 2160 * to write the file to begin with, and triggering an incorrect (hash 2161 * collision) nopwrite is no worse than simply writing to the file. 2162 * That said, there are no known attacks against the checksum algorithms 2163 * used for nopwrite, assuming that the salt and the checksums 2164 * themselves remain secret. 2165 */ 2166static int 2167zio_nop_write(zio_t *zio) 2168{ 2169 blkptr_t *bp = zio->io_bp; 2170 blkptr_t *bp_orig = &zio->io_bp_orig; 2171 zio_prop_t *zp = &zio->io_prop; 2172 2173 ASSERT(BP_GET_LEVEL(bp) == 0); 2174 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 2175 ASSERT(zp->zp_nopwrite); 2176 ASSERT(!zp->zp_dedup); 2177 ASSERT(zio->io_bp_override == NULL); 2178 ASSERT(IO_IS_ALLOCATING(zio)); 2179 2180 /* 2181 * Check to see if the original bp and the new bp have matching 2182 * characteristics (i.e. same checksum, compression algorithms, etc). 2183 * If they don't then just continue with the pipeline which will 2184 * allocate a new bp. 
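 *
 * For example, when a block is rewritten with identical contents and
 * both the old and the new bp use the same nopwrite-capable checksum
 * (say sha256), the same compression, the same dedup setting and the
 * same number of copies, the 256-bit blk_cksum comparison below is all
 * that is needed to decide that the existing on-disk block can be kept
 * and the write elided.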
2185 */ 2186 if (BP_IS_HOLE(bp_orig) || 2187 !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & 2188 ZCHECKSUM_FLAG_NOPWRITE) || 2189 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 2190 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 2191 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 2192 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 2193 return (ZIO_PIPELINE_CONTINUE); 2194 2195 /* 2196 * If the checksums match then reset the pipeline so that we 2197 * avoid allocating a new bp and issuing any I/O. 2198 */ 2199 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 2200 ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & 2201 ZCHECKSUM_FLAG_NOPWRITE); 2202 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 2203 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 2204 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 2205 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 2206 sizeof (uint64_t)) == 0); 2207 2208 *bp = *bp_orig; 2209 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2210 zio->io_flags |= ZIO_FLAG_NOPWRITE; 2211 } 2212 2213 return (ZIO_PIPELINE_CONTINUE); 2214} 2215 2216/* 2217 * ========================================================================== 2218 * Dedup 2219 * ========================================================================== 2220 */ 2221static void 2222zio_ddt_child_read_done(zio_t *zio) 2223{ 2224 blkptr_t *bp = zio->io_bp; 2225 ddt_entry_t *dde = zio->io_private; 2226 ddt_phys_t *ddp; 2227 zio_t *pio = zio_unique_parent(zio); 2228 2229 mutex_enter(&pio->io_lock); 2230 ddp = ddt_phys_select(dde, bp); 2231 if (zio->io_error == 0) 2232 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 2233 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 2234 dde->dde_repair_data = zio->io_data; 2235 else 2236 zio_buf_free(zio->io_data, zio->io_size); 2237 mutex_exit(&pio->io_lock); 2238} 2239 2240static int 2241zio_ddt_read_start(zio_t *zio) 2242{ 2243 blkptr_t *bp = zio->io_bp; 2244 2245 ASSERT(BP_GET_DEDUP(bp)); 2246 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2247 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2248 2249 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2250 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2251 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 2252 ddt_phys_t *ddp = dde->dde_phys; 2253 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 2254 blkptr_t blk; 2255 2256 ASSERT(zio->io_vsd == NULL); 2257 zio->io_vsd = dde; 2258 2259 if (ddp_self == NULL) 2260 return (ZIO_PIPELINE_CONTINUE); 2261 2262 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2263 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 2264 continue; 2265 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2266 &blk); 2267 zio_nowait(zio_read(zio, zio->io_spa, &blk, 2268 zio_buf_alloc(zio->io_size), zio->io_size, 2269 zio_ddt_child_read_done, dde, zio->io_priority, 2270 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 2271 &zio->io_bookmark)); 2272 } 2273 return (ZIO_PIPELINE_CONTINUE); 2274 } 2275 2276 zio_nowait(zio_read(zio, zio->io_spa, bp, 2277 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 2278 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2279 2280 return (ZIO_PIPELINE_CONTINUE); 2281} 2282 2283static int 2284zio_ddt_read_done(zio_t *zio) 2285{ 2286 blkptr_t *bp = zio->io_bp; 2287 2288 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 2289 return (ZIO_PIPELINE_STOP); 2290 2291 ASSERT(BP_GET_DEDUP(bp)); 2292 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2293 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2294 2295 if 
(zio->io_child_error[ZIO_CHILD_DDT]) { 2296 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2297 ddt_entry_t *dde = zio->io_vsd; 2298 if (ddt == NULL) { 2299 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2300 return (ZIO_PIPELINE_CONTINUE); 2301 } 2302 if (dde == NULL) { 2303 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2304 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2305 return (ZIO_PIPELINE_STOP); 2306 } 2307 if (dde->dde_repair_data != NULL) { 2308 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 2309 zio->io_child_error[ZIO_CHILD_DDT] = 0; 2310 } 2311 ddt_repair_done(ddt, dde); 2312 zio->io_vsd = NULL; 2313 } 2314 2315 ASSERT(zio->io_vsd == NULL); 2316 2317 return (ZIO_PIPELINE_CONTINUE); 2318} 2319 2320static boolean_t 2321zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2322{ 2323 spa_t *spa = zio->io_spa; 2324 2325 /* 2326 * Note: we compare the original data, not the transformed data, 2327 * because when zio->io_bp is an override bp, we will not have 2328 * pushed the I/O transforms. That's an important optimization 2329 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2330 */ 2331 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2332 zio_t *lio = dde->dde_lead_zio[p]; 2333 2334 if (lio != NULL) { 2335 return (lio->io_orig_size != zio->io_orig_size || 2336 bcmp(zio->io_orig_data, lio->io_orig_data, 2337 zio->io_orig_size) != 0); 2338 } 2339 } 2340 2341 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2342 ddt_phys_t *ddp = &dde->dde_phys[p]; 2343 2344 if (ddp->ddp_phys_birth != 0) { 2345 arc_buf_t *abuf = NULL; 2346 arc_flags_t aflags = ARC_FLAG_WAIT; 2347 blkptr_t blk = *zio->io_bp; 2348 int error; 2349 2350 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2351 2352 ddt_exit(ddt); 2353 2354 error = arc_read(NULL, spa, &blk, 2355 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2356 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2357 &aflags, &zio->io_bookmark); 2358 2359 if (error == 0) { 2360 if (arc_buf_size(abuf) != zio->io_orig_size || 2361 bcmp(abuf->b_data, zio->io_orig_data, 2362 zio->io_orig_size) != 0) 2363 error = SET_ERROR(EEXIST); 2364 arc_buf_destroy(abuf, &abuf); 2365 } 2366 2367 ddt_enter(ddt); 2368 return (error != 0); 2369 } 2370 } 2371 2372 return (B_FALSE); 2373} 2374 2375static void 2376zio_ddt_child_write_ready(zio_t *zio) 2377{ 2378 int p = zio->io_prop.zp_copies; 2379 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2380 ddt_entry_t *dde = zio->io_private; 2381 ddt_phys_t *ddp = &dde->dde_phys[p]; 2382 zio_t *pio; 2383 2384 if (zio->io_error) 2385 return; 2386 2387 ddt_enter(ddt); 2388 2389 ASSERT(dde->dde_lead_zio[p] == zio); 2390 2391 ddt_phys_fill(ddp, zio->io_bp); 2392 2393 while ((pio = zio_walk_parents(zio)) != NULL) 2394 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2395 2396 ddt_exit(ddt); 2397} 2398 2399static void 2400zio_ddt_child_write_done(zio_t *zio) 2401{ 2402 int p = zio->io_prop.zp_copies; 2403 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2404 ddt_entry_t *dde = zio->io_private; 2405 ddt_phys_t *ddp = &dde->dde_phys[p]; 2406 2407 ddt_enter(ddt); 2408 2409 ASSERT(ddp->ddp_refcnt == 0); 2410 ASSERT(dde->dde_lead_zio[p] == zio); 2411 dde->dde_lead_zio[p] = NULL; 2412 2413 if (zio->io_error == 0) { 2414 while (zio_walk_parents(zio) != NULL) 2415 ddt_phys_addref(ddp); 2416 } else { 2417 ddt_phys_clear(ddp); 2418 } 2419 2420 ddt_exit(ddt); 2421} 2422 2423static void 2424zio_ddt_ditto_write_done(zio_t *zio) 2425{ 2426 int p = DDT_PHYS_DITTO; 2427 zio_prop_t *zp = &zio->io_prop; 2428 
blkptr_t *bp = zio->io_bp; 2429 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2430 ddt_entry_t *dde = zio->io_private; 2431 ddt_phys_t *ddp = &dde->dde_phys[p]; 2432 ddt_key_t *ddk = &dde->dde_key; 2433 2434 ddt_enter(ddt); 2435 2436 ASSERT(ddp->ddp_refcnt == 0); 2437 ASSERT(dde->dde_lead_zio[p] == zio); 2438 dde->dde_lead_zio[p] = NULL; 2439 2440 if (zio->io_error == 0) { 2441 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2442 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2443 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2444 if (ddp->ddp_phys_birth != 0) 2445 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2446 ddt_phys_fill(ddp, bp); 2447 } 2448 2449 ddt_exit(ddt); 2450} 2451 2452static int 2453zio_ddt_write(zio_t *zio) 2454{ 2455 spa_t *spa = zio->io_spa; 2456 blkptr_t *bp = zio->io_bp; 2457 uint64_t txg = zio->io_txg; 2458 zio_prop_t *zp = &zio->io_prop; 2459 int p = zp->zp_copies; 2460 int ditto_copies; 2461 zio_t *cio = NULL; 2462 zio_t *dio = NULL; 2463 ddt_t *ddt = ddt_select(spa, bp); 2464 ddt_entry_t *dde; 2465 ddt_phys_t *ddp; 2466 2467 ASSERT(BP_GET_DEDUP(bp)); 2468 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2469 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2470 2471 ddt_enter(ddt); 2472 dde = ddt_lookup(ddt, bp, B_TRUE); 2473 ddp = &dde->dde_phys[p]; 2474 2475 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2476 /* 2477 * If we're using a weak checksum, upgrade to a strong checksum 2478 * and try again. If we're already using a strong checksum, 2479 * we can't resolve it, so just convert to an ordinary write. 2480 * (And automatically e-mail a paper to Nature?) 2481 */ 2482 if (!(zio_checksum_table[zp->zp_checksum].ci_flags & 2483 ZCHECKSUM_FLAG_DEDUP)) { 2484 zp->zp_checksum = spa_dedup_checksum(spa); 2485 zio_pop_transforms(zio); 2486 zio->io_stage = ZIO_STAGE_OPEN; 2487 BP_ZERO(bp); 2488 } else { 2489 zp->zp_dedup = B_FALSE; 2490 } 2491 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2492 ddt_exit(ddt); 2493 return (ZIO_PIPELINE_CONTINUE); 2494 } 2495 2496 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2497 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2498 2499 if (ditto_copies > ddt_ditto_copies_present(dde) && 2500 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2501 zio_prop_t czp = *zp; 2502 2503 czp.zp_copies = ditto_copies; 2504 2505 /* 2506 * If we arrived here with an override bp, we won't have run 2507 * the transform stack, so we won't have the data we need to 2508 * generate a child i/o. So, toss the override bp and restart. 2509 * This is safe, because using the override bp is just an 2510 * optimization; and it's rare, so the cost doesn't matter. 
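 * (The restarted write simply runs the ordinary write pipeline again
 * from ZIO_STAGE_OPEN, regenerating the transformed data before it
 * comes back through this dedup stage.)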
2511 */ 2512 if (zio->io_bp_override) { 2513 zio_pop_transforms(zio); 2514 zio->io_stage = ZIO_STAGE_OPEN; 2515 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2516 zio->io_bp_override = NULL; 2517 BP_ZERO(bp); 2518 ddt_exit(ddt); 2519 return (ZIO_PIPELINE_CONTINUE); 2520 } 2521 2522 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2523 zio->io_orig_size, &czp, NULL, NULL, 2524 NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, 2525 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2526 2527 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2528 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2529 } 2530 2531 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2532 if (ddp->ddp_phys_birth != 0) 2533 ddt_bp_fill(ddp, bp, txg); 2534 if (dde->dde_lead_zio[p] != NULL) 2535 zio_add_child(zio, dde->dde_lead_zio[p]); 2536 else 2537 ddt_phys_addref(ddp); 2538 } else if (zio->io_bp_override) { 2539 ASSERT(bp->blk_birth == txg); 2540 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2541 ddt_phys_fill(ddp, bp); 2542 ddt_phys_addref(ddp); 2543 } else { 2544 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2545 zio->io_orig_size, zp, 2546 zio_ddt_child_write_ready, NULL, NULL, 2547 zio_ddt_child_write_done, dde, zio->io_priority, 2548 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2549 2550 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2551 dde->dde_lead_zio[p] = cio; 2552 } 2553 2554 ddt_exit(ddt); 2555 2556 if (cio) 2557 zio_nowait(cio); 2558 if (dio) 2559 zio_nowait(dio); 2560 2561 return (ZIO_PIPELINE_CONTINUE); 2562} 2563 2564ddt_entry_t *freedde; /* for debugging */ 2565 2566static int 2567zio_ddt_free(zio_t *zio) 2568{ 2569 spa_t *spa = zio->io_spa; 2570 blkptr_t *bp = zio->io_bp; 2571 ddt_t *ddt = ddt_select(spa, bp); 2572 ddt_entry_t *dde; 2573 ddt_phys_t *ddp; 2574 2575 ASSERT(BP_GET_DEDUP(bp)); 2576 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2577 2578 ddt_enter(ddt); 2579 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2580 ddp = ddt_phys_select(dde, bp); 2581 ddt_phys_decref(ddp); 2582 ddt_exit(ddt); 2583 2584 return (ZIO_PIPELINE_CONTINUE); 2585} 2586 2587/* 2588 * ========================================================================== 2589 * Allocate and free blocks 2590 * ========================================================================== 2591 */ 2592static int 2593zio_dva_allocate(zio_t *zio) 2594{ 2595 spa_t *spa = zio->io_spa; 2596 metaslab_class_t *mc = spa_normal_class(spa); 2597 blkptr_t *bp = zio->io_bp; 2598 int error; 2599 int flags = 0; 2600 2601 if (zio->io_gang_leader == NULL) { 2602 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2603 zio->io_gang_leader = zio; 2604 } 2605 2606 ASSERT(BP_IS_HOLE(bp)); 2607 ASSERT0(BP_GET_NDVAS(bp)); 2608 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2609 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2610 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2611 2612 /* 2613 * The dump device does not support gang blocks so allocation on 2614 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2615 * the "fast" gang feature. 2616 */ 2617 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2618 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
2619 METASLAB_GANG_CHILD : 0; 2620 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2621 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2622 2623 if (error) { 2624 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2625 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2626 error); 2627 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2628 return (zio_write_gang_block(zio)); 2629 zio->io_error = error; 2630 } 2631 2632 return (ZIO_PIPELINE_CONTINUE); 2633} 2634 2635static int 2636zio_dva_free(zio_t *zio) 2637{ 2638 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2639 2640 return (ZIO_PIPELINE_CONTINUE); 2641} 2642 2643static int 2644zio_dva_claim(zio_t *zio) 2645{ 2646 int error; 2647 2648 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2649 if (error) 2650 zio->io_error = error; 2651 2652 return (ZIO_PIPELINE_CONTINUE); 2653} 2654 2655/* 2656 * Undo an allocation. This is used by zio_done() when an I/O fails 2657 * and we want to give back the block we just allocated. 2658 * This handles both normal blocks and gang blocks. 2659 */ 2660static void 2661zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2662{ 2663 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2664 ASSERT(zio->io_bp_override == NULL); 2665 2666 if (!BP_IS_HOLE(bp)) 2667 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2668 2669 if (gn != NULL) { 2670 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2671 zio_dva_unallocate(zio, gn->gn_child[g], 2672 &gn->gn_gbh->zg_blkptr[g]); 2673 } 2674 } 2675} 2676 2677/* 2678 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2679 */ 2680int 2681zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2682 uint64_t size, boolean_t use_slog) 2683{ 2684 int error = 1; 2685 2686 ASSERT(txg > spa_syncing_txg(spa)); 2687 2688 /* 2689 * ZIL blocks are always contiguous (i.e. not gang blocks) so we 2690 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2691 * when allocating them. 2692 */ 2693 if (use_slog) { 2694 error = metaslab_alloc(spa, spa_log_class(spa), size, 2695 new_bp, 1, txg, old_bp, 2696 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2697 } 2698 2699 if (error) { 2700 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2701 new_bp, 1, txg, old_bp, 2702 METASLAB_HINTBP_AVOID); 2703 } 2704 2705 if (error == 0) { 2706 BP_SET_LSIZE(new_bp, size); 2707 BP_SET_PSIZE(new_bp, size); 2708 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2709 BP_SET_CHECKSUM(new_bp, 2710 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2711 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2712 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2713 BP_SET_LEVEL(new_bp, 0); 2714 BP_SET_DEDUP(new_bp, 0); 2715 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2716 } 2717 2718 return (error); 2719} 2720 2721/* 2722 * Free an intent log block. 2723 */ 2724void 2725zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2726{ 2727 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2728 ASSERT(!BP_IS_GANG(bp)); 2729 2730 zio_free(spa, txg, bp); 2731} 2732 2733/* 2734 * ========================================================================== 2735 * Read, write and delete to physical devices 2736 * ========================================================================== 2737 */ 2738 2739 2740/* 2741 * Issue an I/O to the underlying vdev. Typically the issue pipeline 2742 * stops after this stage and will resume upon I/O completion. 
2743 * However, there are instances where the vdev layer may need to 2744 * continue the pipeline when an I/O was not issued. Since the I/O 2745 * that was sent to the vdev layer might be different than the one 2746 * currently active in the pipeline (see vdev_queue_io()), we explicitly 2747 * force the underlying vdev layers to call either zio_execute() or 2748 * zio_interrupt() to ensure that the pipeline continues with the correct I/O. 2749 */ 2750static int 2751zio_vdev_io_start(zio_t *zio) 2752{ 2753 vdev_t *vd = zio->io_vd; 2754 uint64_t align; 2755 spa_t *spa = zio->io_spa; 2756 int ret; 2757 2758 ASSERT(zio->io_error == 0); 2759 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2760 2761 if (vd == NULL) { 2762 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2763 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2764 2765 /* 2766 * The mirror_ops handle multiple DVAs in a single BP. 2767 */ 2768 vdev_mirror_ops.vdev_op_io_start(zio); 2769 return (ZIO_PIPELINE_STOP); 2770 } 2771 2772 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && 2773 zio->io_priority == ZIO_PRIORITY_NOW) { 2774 trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); 2775 return (ZIO_PIPELINE_CONTINUE); 2776 } 2777 2778 /* 2779 * We keep track of time-sensitive I/Os so that the scan thread 2780 * can quickly react to certain workloads. In particular, we care 2781 * about non-scrubbing, top-level reads and writes with the following 2782 * characteristics: 2783 * - synchronous writes of user data to non-slog devices 2784 * - any reads of user data 2785 * When these conditions are met, adjust the timestamp of spa_last_io 2786 * which allows the scan thread to adjust its workload accordingly. 2787 */ 2788 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2789 vd == vd->vdev_top && !vd->vdev_islog && 2790 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2791 zio->io_txg != spa_syncing_txg(spa)) { 2792 uint64_t old = spa->spa_last_io; 2793 uint64_t new = ddi_get_lbolt64(); 2794 if (old != new) 2795 (void) atomic_cas_64(&spa->spa_last_io, old, new); 2796 } 2797 2798 align = 1ULL << vd->vdev_top->vdev_ashift; 2799 2800 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 2801 P2PHASE(zio->io_size, align) != 0) { 2802 /* Transform logical writes to be a full physical block size. */ 2803 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2804 char *abuf = NULL; 2805 if (zio->io_type == ZIO_TYPE_READ || 2806 zio->io_type == ZIO_TYPE_WRITE) 2807 abuf = zio_buf_alloc(asize); 2808 ASSERT(vd == vd->vdev_top); 2809 if (zio->io_type == ZIO_TYPE_WRITE) { 2810 bcopy(zio->io_data, abuf, zio->io_size); 2811 bzero(abuf + zio->io_size, asize - zio->io_size); 2812 } 2813 zio_push_transform(zio, abuf, asize, abuf ? asize : 0, 2814 zio_subblock); 2815 } 2816 2817 /* 2818 * If this is not a physical io, make sure that it is properly aligned 2819 * before proceeding. 2820 */ 2821 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { 2822 ASSERT0(P2PHASE(zio->io_offset, align)); 2823 ASSERT0(P2PHASE(zio->io_size, align)); 2824 } else { 2825 /* 2826 * For the physical io we allow alignment 2827 * to a logical block size. 
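 * A raw label i/o, for example, only needs to be aligned to the
 * device's logical sector size (often 512 bytes) here, even when the
 * allocation ashift is 4K.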
2828 */ 2829 uint64_t log_align = 2830 1ULL << vd->vdev_top->vdev_logical_ashift; 2831 ASSERT0(P2PHASE(zio->io_offset, log_align)); 2832 ASSERT0(P2PHASE(zio->io_size, log_align)); 2833 } 2834 2835 VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); 2836 2837 /* 2838 * If this is a repair I/O, and there's no self-healing involved -- 2839 * that is, we're just resilvering what we expect to resilver -- 2840 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2841 * This prevents spurious resilvering with nested replication. 2842 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2843 * A is out of date, we'll read from C+D, then use the data to 2844 * resilver A+B -- but we don't actually want to resilver B, just A. 2845 * The top-level mirror has no way to know this, so instead we just 2846 * discard unnecessary repairs as we work our way down the vdev tree. 2847 * The same logic applies to any form of nested replication: 2848 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 2849 */ 2850 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2851 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2852 zio->io_txg != 0 && /* not a delegated i/o */ 2853 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2854 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2855 zio_vdev_io_bypass(zio); 2856 return (ZIO_PIPELINE_CONTINUE); 2857 } 2858 2859 if (vd->vdev_ops->vdev_op_leaf) { 2860 switch (zio->io_type) { 2861 case ZIO_TYPE_READ: 2862 if (vdev_cache_read(zio)) 2863 return (ZIO_PIPELINE_CONTINUE); 2864 /* FALLTHROUGH */ 2865 case ZIO_TYPE_WRITE: 2866 case ZIO_TYPE_FREE: 2867 if ((zio = vdev_queue_io(zio)) == NULL) 2868 return (ZIO_PIPELINE_STOP); 2869 2870 if (!vdev_accessible(vd, zio)) { 2871 zio->io_error = SET_ERROR(ENXIO); 2872 zio_interrupt(zio); 2873 return (ZIO_PIPELINE_STOP); 2874 } 2875 break; 2876 } 2877 /* 2878 * Note that we ignore repair writes for TRIM because they can 2879 * conflict with normal writes. This isn't an issue because, by 2880 * definition, we only repair blocks that aren't freed. 2881 */ 2882 if (zio->io_type == ZIO_TYPE_WRITE && 2883 !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2884 !trim_map_write_start(zio)) 2885 return (ZIO_PIPELINE_STOP); 2886 } 2887 2888 vd->vdev_ops->vdev_op_io_start(zio); 2889 return (ZIO_PIPELINE_STOP); 2890} 2891 2892static int 2893zio_vdev_io_done(zio_t *zio) 2894{ 2895 vdev_t *vd = zio->io_vd; 2896 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 2897 boolean_t unexpected_error = B_FALSE; 2898 2899 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2900 return (ZIO_PIPELINE_STOP); 2901 2902 ASSERT(zio->io_type == ZIO_TYPE_READ || 2903 zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); 2904 2905 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2906 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || 2907 zio->io_type == ZIO_TYPE_FREE)) { 2908 2909 if (zio->io_type == ZIO_TYPE_WRITE && 2910 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 2911 trim_map_write_done(zio); 2912 2913 vdev_queue_io_done(zio); 2914 2915 if (zio->io_type == ZIO_TYPE_WRITE) 2916 vdev_cache_write(zio); 2917 2918 if (zio_injection_enabled && zio->io_error == 0) 2919 zio->io_error = zio_handle_device_injection(vd, 2920 zio, EIO); 2921 2922 if (zio_injection_enabled && zio->io_error == 0) 2923 zio->io_error = zio_handle_label_injection(zio, EIO); 2924 2925 if (zio->io_error) { 2926 if (zio->io_error == ENOTSUP && 2927 zio->io_type == ZIO_TYPE_FREE) { 2928 /* Not all devices support TRIM. 
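 * An ENOTSUP coming back from a FREE is therefore expected,
 * and is deliberately not counted as an unexpected error.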
*/ 2929 } else if (!vdev_accessible(vd, zio)) { 2930 zio->io_error = SET_ERROR(ENXIO); 2931 } else { 2932 unexpected_error = B_TRUE; 2933 } 2934 } 2935 } 2936 2937 ops->vdev_op_io_done(zio); 2938 2939 if (unexpected_error) 2940 VERIFY(vdev_probe(vd, zio) == NULL); 2941 2942 return (ZIO_PIPELINE_CONTINUE); 2943} 2944 2945/* 2946 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2947 * disk, and use that to finish the checksum ereport later. 2948 */ 2949static void 2950zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2951 const void *good_buf) 2952{ 2953 /* no processing needed */ 2954 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2955} 2956 2957/*ARGSUSED*/ 2958void 2959zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2960{ 2961 void *buf = zio_buf_alloc(zio->io_size); 2962 2963 bcopy(zio->io_data, buf, zio->io_size); 2964 2965 zcr->zcr_cbinfo = zio->io_size; 2966 zcr->zcr_cbdata = buf; 2967 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2968 zcr->zcr_free = zio_buf_free; 2969} 2970 2971static int 2972zio_vdev_io_assess(zio_t *zio) 2973{ 2974 vdev_t *vd = zio->io_vd; 2975 2976 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2977 return (ZIO_PIPELINE_STOP); 2978 2979 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2980 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2981 2982 if (zio->io_vsd != NULL) { 2983 zio->io_vsd_ops->vsd_free(zio); 2984 zio->io_vsd = NULL; 2985 } 2986 2987 if (zio_injection_enabled && zio->io_error == 0) 2988 zio->io_error = zio_handle_fault_injection(zio, EIO); 2989 2990 if (zio->io_type == ZIO_TYPE_FREE && 2991 zio->io_priority != ZIO_PRIORITY_NOW) { 2992 switch (zio->io_error) { 2993 case 0: 2994 ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 2995 ZIO_TRIM_STAT_BUMP(success); 2996 break; 2997 case EOPNOTSUPP: 2998 ZIO_TRIM_STAT_BUMP(unsupported); 2999 break; 3000 default: 3001 ZIO_TRIM_STAT_BUMP(failed); 3002 break; 3003 } 3004 } 3005 3006 /* 3007 * If the I/O failed, determine whether we should attempt to retry it. 3008 * 3009 * On retry, we cut in line in the issue queue, since we don't want 3010 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 3011 */ 3012 if (zio->io_error && vd == NULL && 3013 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 3014 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 3015 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 3016 zio->io_error = 0; 3017 zio->io_flags |= ZIO_FLAG_IO_RETRY | 3018 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 3019 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 3020 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 3021 zio_requeue_io_start_cut_in_line); 3022 return (ZIO_PIPELINE_STOP); 3023 } 3024 3025 /* 3026 * If we got an error on a leaf device, convert it to ENXIO 3027 * if the device is not accessible at all. 3028 */ 3029 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 3030 !vdev_accessible(vd, zio)) 3031 zio->io_error = SET_ERROR(ENXIO); 3032 3033 /* 3034 * If we can't write to an interior vdev (mirror or RAID-Z), 3035 * set vdev_cant_write so that we stop trying to allocate from it. 
3036 */ 3037 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 3038 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 3039 vd->vdev_cant_write = B_TRUE; 3040 } 3041 3042 if (zio->io_error) 3043 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3044 3045 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 3046 zio->io_physdone != NULL) { 3047 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 3048 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 3049 zio->io_physdone(zio->io_logical); 3050 } 3051 3052 return (ZIO_PIPELINE_CONTINUE); 3053} 3054 3055void 3056zio_vdev_io_reissue(zio_t *zio) 3057{ 3058 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 3059 ASSERT(zio->io_error == 0); 3060 3061 zio->io_stage >>= 1; 3062} 3063 3064void 3065zio_vdev_io_redone(zio_t *zio) 3066{ 3067 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 3068 3069 zio->io_stage >>= 1; 3070} 3071 3072void 3073zio_vdev_io_bypass(zio_t *zio) 3074{ 3075 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 3076 ASSERT(zio->io_error == 0); 3077 3078 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 3079 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 3080} 3081 3082/* 3083 * ========================================================================== 3084 * Generate and verify checksums 3085 * ========================================================================== 3086 */ 3087static int 3088zio_checksum_generate(zio_t *zio) 3089{ 3090 blkptr_t *bp = zio->io_bp; 3091 enum zio_checksum checksum; 3092 3093 if (bp == NULL) { 3094 /* 3095 * This is zio_write_phys(). 3096 * We're either generating a label checksum, or none at all. 3097 */ 3098 checksum = zio->io_prop.zp_checksum; 3099 3100 if (checksum == ZIO_CHECKSUM_OFF) 3101 return (ZIO_PIPELINE_CONTINUE); 3102 3103 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 3104 } else { 3105 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 3106 ASSERT(!IO_IS_ALLOCATING(zio)); 3107 checksum = ZIO_CHECKSUM_GANG_HEADER; 3108 } else { 3109 checksum = BP_GET_CHECKSUM(bp); 3110 } 3111 } 3112 3113 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 3114 3115 return (ZIO_PIPELINE_CONTINUE); 3116} 3117 3118static int 3119zio_checksum_verify(zio_t *zio) 3120{ 3121 zio_bad_cksum_t info; 3122 blkptr_t *bp = zio->io_bp; 3123 int error; 3124 3125 ASSERT(zio->io_vd != NULL); 3126 3127 if (bp == NULL) { 3128 /* 3129 * This is zio_read_phys(). 3130 * We're either verifying a label checksum, or nothing at all. 3131 */ 3132 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 3133 return (ZIO_PIPELINE_CONTINUE); 3134 3135 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 3136 } 3137 3138 if ((error = zio_checksum_error(zio, &info)) != 0) { 3139 zio->io_error = error; 3140 if (error == ECKSUM && 3141 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3142 zfs_ereport_start_checksum(zio->io_spa, 3143 zio->io_vd, zio, zio->io_offset, 3144 zio->io_size, NULL, &info); 3145 } 3146 } 3147 3148 return (ZIO_PIPELINE_CONTINUE); 3149} 3150 3151/* 3152 * Called by RAID-Z to ensure we don't compute the checksum twice. 3153 */ 3154void 3155zio_checksum_verified(zio_t *zio) 3156{ 3157 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 3158} 3159 3160/* 3161 * ========================================================================== 3162 * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 3163 * An error of 0 indicates success. ENXIO indicates whole-device failure, 3164 * which may be transient (e.g. unplugged) or permament. 
ECKSUM and EIO 3165 * indicate errors that are specific to one I/O, and most likely permanent. 3166 * Any other error is presumed to be worse because we weren't expecting it. 3167 * ========================================================================== 3168 */ 3169int 3170zio_worst_error(int e1, int e2) 3171{ 3172 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 3173 int r1, r2; 3174 3175 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 3176 if (e1 == zio_error_rank[r1]) 3177 break; 3178 3179 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 3180 if (e2 == zio_error_rank[r2]) 3181 break; 3182 3183 return (r1 > r2 ? e1 : e2); 3184} 3185 3186/* 3187 * ========================================================================== 3188 * I/O completion 3189 * ========================================================================== 3190 */ 3191static int 3192zio_ready(zio_t *zio) 3193{ 3194 blkptr_t *bp = zio->io_bp; 3195 zio_t *pio, *pio_next; 3196 3197 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 3198 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 3199 return (ZIO_PIPELINE_STOP); 3200 3201 if (zio->io_ready) { 3202 ASSERT(IO_IS_ALLOCATING(zio)); 3203 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 3204 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 3205 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 3206 3207 zio->io_ready(zio); 3208 } 3209 3210 if (bp != NULL && bp != &zio->io_bp_copy) 3211 zio->io_bp_copy = *bp; 3212 3213 if (zio->io_error) 3214 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3215 3216 mutex_enter(&zio->io_lock); 3217 zio->io_state[ZIO_WAIT_READY] = 1; 3218 pio = zio_walk_parents(zio); 3219 mutex_exit(&zio->io_lock); 3220 3221 /* 3222 * As we notify zio's parents, new parents could be added. 3223 * New parents go to the head of zio's io_parent_list, however, 3224 * so we will (correctly) not notify them. The remainder of zio's 3225 * io_parent_list, from 'pio_next' onward, cannot change because 3226 * all parents must wait for us to be done before they can be done. 3227 */ 3228 for (; pio != NULL; pio = pio_next) { 3229 pio_next = zio_walk_parents(zio); 3230 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 3231 } 3232 3233 if (zio->io_flags & ZIO_FLAG_NODATA) { 3234 if (BP_IS_GANG(bp)) { 3235 zio->io_flags &= ~ZIO_FLAG_NODATA; 3236 } else { 3237 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 3238 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 3239 } 3240 } 3241 3242 if (zio_injection_enabled && 3243 zio->io_spa->spa_syncing_txg == zio->io_txg) 3244 zio_handle_ignored_writes(zio); 3245 3246 return (ZIO_PIPELINE_CONTINUE); 3247} 3248 3249static int 3250zio_done(zio_t *zio) 3251{ 3252 spa_t *spa = zio->io_spa; 3253 zio_t *lio = zio->io_logical; 3254 blkptr_t *bp = zio->io_bp; 3255 vdev_t *vd = zio->io_vd; 3256 uint64_t psize = zio->io_size; 3257 zio_t *pio, *pio_next; 3258 3259 /* 3260 * If our children haven't all completed, 3261 * wait for them and then repeat this pipeline stage. 
3262 */ 3263 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 3264 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 3265 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 3266 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 3267 return (ZIO_PIPELINE_STOP); 3268 3269 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 3270 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3271 ASSERT(zio->io_children[c][w] == 0); 3272 3273 if (bp != NULL && !BP_IS_EMBEDDED(bp)) { 3274 ASSERT(bp->blk_pad[0] == 0); 3275 ASSERT(bp->blk_pad[1] == 0); 3276 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 3277 (bp == zio_unique_parent(zio)->io_bp)); 3278 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 3279 zio->io_bp_override == NULL && 3280 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 3281 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 3282 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 3283 ASSERT(BP_COUNT_GANG(bp) == 0 || 3284 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 3285 } 3286 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 3287 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 3288 } 3289 3290 /* 3291 * If there were child vdev/gang/ddt errors, they apply to us now. 3292 */ 3293 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 3294 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 3295 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 3296 3297 /* 3298 * If the I/O on the transformed data was successful, generate any 3299 * checksum reports now while we still have the transformed data. 3300 */ 3301 if (zio->io_error == 0) { 3302 while (zio->io_cksum_report != NULL) { 3303 zio_cksum_report_t *zcr = zio->io_cksum_report; 3304 uint64_t align = zcr->zcr_align; 3305 uint64_t asize = P2ROUNDUP(psize, align); 3306 char *abuf = zio->io_data; 3307 3308 if (asize != psize) { 3309 abuf = zio_buf_alloc(asize); 3310 bcopy(zio->io_data, abuf, psize); 3311 bzero(abuf + psize, asize - psize); 3312 } 3313 3314 zio->io_cksum_report = zcr->zcr_next; 3315 zcr->zcr_next = NULL; 3316 zcr->zcr_finish(zcr, abuf); 3317 zfs_ereport_free_checksum(zcr); 3318 3319 if (asize != psize) 3320 zio_buf_free(abuf, asize); 3321 } 3322 } 3323 3324 zio_pop_transforms(zio); /* note: may set zio->io_error */ 3325 3326 vdev_stat_update(zio, psize); 3327 3328 if (zio->io_error) { 3329 /* 3330 * If this I/O is attached to a particular vdev, 3331 * generate an error message describing the I/O failure 3332 * at the block level. We ignore these errors if the 3333 * device is currently unavailable. 3334 */ 3335 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3336 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3337 3338 if ((zio->io_error == EIO || !(zio->io_flags & 3339 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3340 zio == lio) { 3341 /* 3342 * For logical I/O requests, tell the SPA to log the 3343 * error and generate a logical data ereport. 3344 */ 3345 spa_log_error(spa, zio); 3346 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3347 0, 0); 3348 } 3349 } 3350 3351 if (zio->io_error && zio == lio) { 3352 /* 3353 * Determine whether zio should be reexecuted. This will 3354 * propagate all the way to the root via zio_notify_parent(). 
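 * ZIO_REEXECUTE_NOW retries the failed i/o tree right away, while
 * ZIO_REEXECUTE_SUSPEND parks it on the pool's suspend root until
 * zio_resume() is called; ENOSPC on an allocating write, and ENXIO on
 * a read during normal operation when failmode is not 'continue', are
 * the suspend cases handled below.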
3355 */ 3356 ASSERT(vd == NULL && bp != NULL); 3357 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3358 3359 if (IO_IS_ALLOCATING(zio) && 3360 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3361 if (zio->io_error != ENOSPC) 3362 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3363 else 3364 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3365 } 3366 3367 if ((zio->io_type == ZIO_TYPE_READ || 3368 zio->io_type == ZIO_TYPE_FREE) && 3369 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3370 zio->io_error == ENXIO && 3371 spa_load_state(spa) == SPA_LOAD_NONE && 3372 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3373 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3374 3375 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3376 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3377 3378 /* 3379 * Here is a possibly good place to attempt to do 3380 * either combinatorial reconstruction or error correction 3381 * based on checksums. It also might be a good place 3382 * to send out preliminary ereports before we suspend 3383 * processing. 3384 */ 3385 } 3386 3387 /* 3388 * If there were logical child errors, they apply to us now. 3389 * We defer this until now to avoid conflating logical child 3390 * errors with errors that happened to the zio itself when 3391 * updating vdev stats and reporting FMA events above. 3392 */ 3393 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3394 3395 if ((zio->io_error || zio->io_reexecute) && 3396 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3397 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3398 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3399 3400 zio_gang_tree_free(&zio->io_gang_tree); 3401 3402 /* 3403 * Godfather I/Os should never suspend. 3404 */ 3405 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3406 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3407 zio->io_reexecute = 0; 3408 3409 if (zio->io_reexecute) { 3410 /* 3411 * This is a logical I/O that wants to reexecute. 3412 * 3413 * Reexecute is top-down. When an i/o fails, if it's not 3414 * the root, it simply notifies its parent and sticks around. 3415 * The parent, seeing that it still has children in zio_done(), 3416 * does the same. This percolates all the way up to the root. 3417 * The root i/o will reexecute or suspend the entire tree. 3418 * 3419 * This approach ensures that zio_reexecute() honors 3420 * all the original i/o dependency relationships, e.g. 3421 * parents not executing until children are ready. 3422 */ 3423 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3424 3425 zio->io_gang_leader = NULL; 3426 3427 mutex_enter(&zio->io_lock); 3428 zio->io_state[ZIO_WAIT_DONE] = 1; 3429 mutex_exit(&zio->io_lock); 3430 3431 /* 3432 * "The Godfather" I/O monitors its children but is 3433 * not a true parent to them. It will track them through 3434 * the pipeline but severs its ties whenever they get into 3435 * trouble (e.g. suspended). This allows "The Godfather" 3436 * I/O to return status without blocking. 3437 */ 3438 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3439 zio_link_t *zl = zio->io_walk_link; 3440 pio_next = zio_walk_parents(zio); 3441 3442 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3443 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3444 zio_remove_child(pio, zio, zl); 3445 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3446 } 3447 } 3448 3449 if ((pio = zio_unique_parent(zio)) != NULL) { 3450 /* 3451 * We're not a root i/o, so there's nothing to do 3452 * but notify our parent. 
Don't propagate errors 3453 * upward since we haven't permanently failed yet. 3454 */ 3455 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3456 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3457 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3458 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3459 /* 3460 * We'd fail again if we reexecuted now, so suspend 3461 * until conditions improve (e.g. device comes online). 3462 */ 3463 zio_suspend(spa, zio); 3464 } else { 3465 /* 3466 * Reexecution is potentially a huge amount of work. 3467 * Hand it off to the otherwise-unused claim taskq. 3468 */ 3469#if defined(illumos) || !defined(_KERNEL) 3470 ASSERT(zio->io_tqent.tqent_next == NULL); 3471#else 3472 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 3473#endif 3474 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3475 ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3476 0, &zio->io_tqent); 3477 } 3478 return (ZIO_PIPELINE_STOP); 3479 } 3480 3481 ASSERT(zio->io_child_count == 0); 3482 ASSERT(zio->io_reexecute == 0); 3483 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3484 3485 /* 3486 * Report any checksum errors, since the I/O is complete. 3487 */ 3488 while (zio->io_cksum_report != NULL) { 3489 zio_cksum_report_t *zcr = zio->io_cksum_report; 3490 zio->io_cksum_report = zcr->zcr_next; 3491 zcr->zcr_next = NULL; 3492 zcr->zcr_finish(zcr, NULL); 3493 zfs_ereport_free_checksum(zcr); 3494 } 3495 3496 /* 3497 * It is the responsibility of the done callback to ensure that this 3498 * particular zio is no longer discoverable for adoption, and as 3499 * such, cannot acquire any new parents. 3500 */ 3501 if (zio->io_done) 3502 zio->io_done(zio); 3503 3504 mutex_enter(&zio->io_lock); 3505 zio->io_state[ZIO_WAIT_DONE] = 1; 3506 mutex_exit(&zio->io_lock); 3507 3508 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3509 zio_link_t *zl = zio->io_walk_link; 3510 pio_next = zio_walk_parents(zio); 3511 zio_remove_child(pio, zio, zl); 3512 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3513 } 3514 3515 if (zio->io_waiter != NULL) { 3516 mutex_enter(&zio->io_lock); 3517 zio->io_executor = NULL; 3518 cv_broadcast(&zio->io_cv); 3519 mutex_exit(&zio->io_lock); 3520 } else { 3521 zio_destroy(zio); 3522 } 3523 3524 return (ZIO_PIPELINE_STOP); 3525} 3526 3527/* 3528 * ========================================================================== 3529 * I/O pipeline definition 3530 * ========================================================================== 3531 */ 3532static zio_pipe_stage_t *zio_pipeline[] = { 3533 NULL, 3534 zio_read_bp_init, 3535 zio_free_bp_init, 3536 zio_issue_async, 3537 zio_write_bp_init, 3538 zio_checksum_generate, 3539 zio_nop_write, 3540 zio_ddt_read_start, 3541 zio_ddt_read_done, 3542 zio_ddt_write, 3543 zio_ddt_free, 3544 zio_gang_assemble, 3545 zio_gang_issue, 3546 zio_dva_allocate, 3547 zio_dva_free, 3548 zio_dva_claim, 3549 zio_ready, 3550 zio_vdev_io_start, 3551 zio_vdev_io_done, 3552 zio_vdev_io_assess, 3553 zio_checksum_verify, 3554 zio_done 3555}; 3556 3557 3558 3559 3560/* 3561 * Compare two zbookmark_phys_t's to see which we would reach first in a 3562 * pre-order traversal of the object tree. 3563 * 3564 * This is simple in every case aside from the meta-dnode object. For all other 3565 * objects, we traverse them in order (object 1 before object 2, and so on). 3566 * However, all of these objects are traversed while traversing object 0, since 3567 * the data it points to is the list of objects. 
Thus, we need to convert to a 3568 * canonical representation so we can compare meta-dnode bookmarks to 3569 * non-meta-dnode bookmarks. 3570 * 3571 * We do this by calculating "equivalents" for each field of the zbookmark. 3572 * zbookmarks outside of the meta-dnode use their own object and level, and 3573 * calculate the level 0 equivalent (the first L0 blkid that is contained in the 3574 * blocks this bookmark refers to) by multiplying their blkid by their span 3575 * (the number of L0 blocks contained within one block at their level). 3576 * zbookmarks inside the meta-dnode calculate their object equivalent 3577 * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use 3578 * level + 1<<31 (any value larger than a level could ever be) for their level. 3579 * This causes them to always compare before a bookmark in their object 3580 * equivalent, compare appropriately to bookmarks in other objects, and to 3581 * compare appropriately to other bookmarks in the meta-dnode. 3582 */ 3583int 3584zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, 3585 const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) 3586{ 3587 /* 3588 * These variables represent the "equivalent" values for the zbookmark, 3589 * after converting zbookmarks inside the meta dnode to their 3590 * normal-object equivalents. 3591 */ 3592 uint64_t zb1obj, zb2obj; 3593 uint64_t zb1L0, zb2L0; 3594 uint64_t zb1level, zb2level; 3595 3596 if (zb1->zb_object == zb2->zb_object && 3597 zb1->zb_level == zb2->zb_level && 3598 zb1->zb_blkid == zb2->zb_blkid) 3599 return (0); 3600 3601 /* 3602 * BP_SPANB calculates the span in blocks. 3603 */ 3604 zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); 3605 zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); 3606 3607 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3608 zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); 3609 zb1L0 = 0; 3610 zb1level = zb1->zb_level + COMPARE_META_LEVEL; 3611 } else { 3612 zb1obj = zb1->zb_object; 3613 zb1level = zb1->zb_level; 3614 } 3615 3616 if (zb2->zb_object == DMU_META_DNODE_OBJECT) { 3617 zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); 3618 zb2L0 = 0; 3619 zb2level = zb2->zb_level + COMPARE_META_LEVEL; 3620 } else { 3621 zb2obj = zb2->zb_object; 3622 zb2level = zb2->zb_level; 3623 } 3624 3625 /* Now that we have a canonical representation, do the comparison. */ 3626 if (zb1obj != zb2obj) 3627 return (zb1obj < zb2obj ? -1 : 1); 3628 else if (zb1L0 != zb2L0) 3629 return (zb1L0 < zb2L0 ? -1 : 1); 3630 else if (zb1level != zb2level) 3631 return (zb1level > zb2level ? -1 : 1); 3632 /* 3633 * This can (theoretically) happen if the bookmarks have the same object 3634 * and level, but different blkids, if the block sizes are not the same. 3635 * There is presently no way to change the indirect block sizes 3636 */ 3637 return (0); 3638} 3639 3640/* 3641 * This function checks the following: given that last_block is the place that 3642 * our traversal stopped last time, does that guarantee that we've visited 3643 * every node under subtree_root? Therefore, we can't just use the raw output 3644 * of zbookmark_compare. We have to pass in a modified version of 3645 * subtree_root; by incrementing the block id, and then checking whether 3646 * last_block is before or equal to that, we can tell whether or not having 3647 * visited last_block implies that all of subtree_root's children have been 3648 * visited. 
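 *
 * Concretely: with subtree_root = <object 5, level 2, blkid 3>, we ask
 * whether <object 5, level 2, blkid 4> would be reached at or before
 * last_block in the traversal order defined above; if so, everything
 * underneath blkid 3 sorts strictly earlier and must already have been
 * visited.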
3649 */ 3650 boolean_t 3651 zbookmark_subtree_completed(const dnode_phys_t *dnp, 3652 const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) 3653 { 3654 zbookmark_phys_t mod_zb = *subtree_root; 3655 mod_zb.zb_blkid++; 3656 ASSERT(last_block->zb_level == 0); 3657 3658 /* The objset_phys_t isn't before anything. */ 3659 if (dnp == NULL) 3660 return (B_FALSE); 3661 3662 /* 3663 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the 3664 * data block size in sectors, because that variable is only used if 3665 * the bookmark refers to a block in the meta-dnode. Since we don't 3666 * know without examining it what object it refers to, and there's no 3667 * harm in passing in this value in other cases, we always pass it in. 3668 * 3669 * We pass in 0 for the indirect block size shift because zb2 must be 3670 * level 0. The indirect block size is only used to calculate the span 3671 * of the bookmark, but since the bookmark must be level 0, the span is 3672 * always 1, so the math works out. 3673 * 3674 * If you make changes to how the zbookmark_compare code works, be sure 3675 * that this code still works afterwards. 3676 */ 3677 return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 3678 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, 3679 last_block) <= 0); 3680 } 3681
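
/*
 * The following stand-alone, user-space sketch is illustrative only and
 * is kept out of any build by a guard macro that is never defined; the
 * hard-coded figures (128-byte block pointers, 128K indirect blocks,
 * 16K meta-dnode blocks holding 32 dnodes) are assumptions chosen for
 * the example, not values taken from this file.  It walks through the
 * canonicalization described above for zbookmark_compare().
 */
#ifdef ZBOOKMARK_COMPARE_SKETCH
#include <stdint.h>
#include <stdio.h>

/*
 * Span, in level-0 blocks, of one block at 'level', assuming 128-byte
 * block pointers (shift 7); mirrors the span used by zbookmark_compare().
 */
static uint64_t
sketch_spanb(int indblkshift, int level)
{
	return (1ULL << (level * (indblkshift - 7)));
}

int
main(void)
{
	/*
	 * A level-1 bookmark at blkid 3 with 128K (2^17 byte) indirect
	 * blocks covers level-0 blkids 3072..4095, so its level-0
	 * equivalent is 3 * 1024 = 3072.
	 */
	uint64_t l0equiv = 3 * sketch_spanb(17, 1);
	printf("L0 equivalent of <level 1, blkid 3>: %ju\n",
	    (uintmax_t)l0equiv);

	/*
	 * A level-0 meta-dnode bookmark at blkid 2, with 16K dnode
	 * blocks (32 dnodes of 512 bytes each), describes dnodes
	 * 64..95.  Its canonical form is therefore object 64, L0 0,
	 * and level 0 + 2^31, so it sorts ahead of every bookmark
	 * inside objects 64 and up -- the order in which the
	 * traversal really visits them.
	 */
	uint64_t objequiv = 2 * 32;
	uint64_t levelequiv = 0 + (1ULL << 31);
	printf("meta-dnode <level 0, blkid 2>: object %ju, level %ju\n",
	    (uintmax_t)objequiv, (uintmax_t)levelequiv);

	return (0);
}
#endif	/* ZBOOKMARK_COMPARE_SKETCH */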