zio.c revision 263397
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013 by Delphix. All rights reserved. 24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 25 */ 26 27#include <sys/zfs_context.h> 28#include <sys/fm/fs/zfs.h> 29#include <sys/spa.h> 30#include <sys/txg.h> 31#include <sys/spa_impl.h> 32#include <sys/vdev_impl.h> 33#include <sys/zio_impl.h> 34#include <sys/zio_compress.h> 35#include <sys/zio_checksum.h> 36#include <sys/dmu_objset.h> 37#include <sys/arc.h> 38#include <sys/ddt.h> 39#include <sys/trim_map.h> 40#include <sys/zfeature.h> 41 42SYSCTL_DECL(_vfs_zfs); 43SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); 44#if defined(__amd64__) 45static int zio_use_uma = 1; 46#else 47static int zio_use_uma = 0; 48#endif 49TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma); 50SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, 51 "Use uma(9) for ZIO allocations"); 52static int zio_exclude_metadata = 0; 53TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata); 54SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, 55 "Exclude metadata buffers from dumps as well"); 56 57zio_trim_stats_t zio_trim_stats = { 58 { "bytes", KSTAT_DATA_UINT64, 59 "Number of bytes successfully TRIMmed" }, 60 { "success", KSTAT_DATA_UINT64, 61 "Number of successful TRIM requests" }, 62 { "unsupported", KSTAT_DATA_UINT64, 63 "Number of TRIM requests that failed because TRIM is not supported" }, 64 { "failed", KSTAT_DATA_UINT64, 65 "Number of TRIM requests that failed for reasons other than not supported" }, 66}; 67 68static kstat_t *zio_trim_ksp; 69 70/* 71 * ========================================================================== 72 * I/O type descriptions 73 * ========================================================================== 74 */ 75const char *zio_type_name[ZIO_TYPES] = { 76 "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", 77 "zio_ioctl" 78}; 79 80/* 81 * ========================================================================== 82 * I/O kmem caches 83 * ========================================================================== 84 */ 85kmem_cache_t *zio_cache; 86kmem_cache_t *zio_link_cache; 87kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 88kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 89 90#ifdef _KERNEL 91extern vmem_t *zio_alloc_arena; 92#endif 93extern int zfs_mg_alloc_failures; 94 95/* 96 * The following actions directly effect the spa's sync-to-convergence logic. 97 * The values below define the sync pass when we start performing the action. 98 * Care should be taken when changing these values as they directly impact 99 * spa_sync() performance. Tuning these values may introduce subtle performance 100 * pathologies and should only be done in the context of performance analysis. 101 * These tunables will eventually be removed and replaced with #defines once 102 * enough analysis has been done to determine optimal values. 103 * 104 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that 105 * regular blocks are not deferred. 106 */ 107int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ 108TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free); 109SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, 110 &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); 111int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ 112TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress); 113SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, 114 &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); 115int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ 116TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite); 117SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, 118 &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); 119 120/* 121 * An allocating zio is one that either currently has the DVA allocate 122 * stage set or will have it later in its lifetime. 123 */ 124#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 125 126boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; 127 128#ifdef ZFS_DEBUG 129int zio_buf_debug_limit = 16384; 130#else 131int zio_buf_debug_limit = 0; 132#endif 133 134void 135zio_init(void) 136{ 137 size_t c; 138 zio_cache = kmem_cache_create("zio_cache", 139 sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 140 zio_link_cache = kmem_cache_create("zio_link_cache", 141 sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 142 if (!zio_use_uma) 143 goto out; 144 145 /* 146 * For small buffers, we want a cache for each multiple of 147 * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 148 * for each quarter-power of 2. For large buffers, we want 149 * a cache for each multiple of PAGESIZE. 150 */ 151 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 152 size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 153 size_t p2 = size; 154 size_t align = 0; 155 size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; 156 157 while (p2 & (p2 - 1)) 158 p2 &= p2 - 1; 159 160#ifdef illumos 161#ifndef _KERNEL 162 /* 163 * If we are using watchpoints, put each buffer on its own page, 164 * to eliminate the performance overhead of trapping to the 165 * kernel when modifying a non-watched buffer that shares the 166 * page with a watched buffer. 167 */ 168 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) 169 continue; 170#endif 171#endif /* illumos */ 172 if (size <= 4 * SPA_MINBLOCKSIZE) { 173 align = SPA_MINBLOCKSIZE; 174 } else if (IS_P2ALIGNED(size, PAGESIZE)) { 175 align = PAGESIZE; 176 } else if (IS_P2ALIGNED(size, p2 >> 2)) { 177 align = p2 >> 2; 178 } 179 180 if (align != 0) { 181 char name[36]; 182 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 183 zio_buf_cache[c] = kmem_cache_create(name, size, 184 align, NULL, NULL, NULL, NULL, NULL, cflags); 185 186 /* 187 * Since zio_data bufs do not appear in crash dumps, we 188 * pass KMC_NOTOUCH so that no allocator metadata is 189 * stored with the buffers. 190 */ 191 (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 192 zio_data_buf_cache[c] = kmem_cache_create(name, size, 193 align, NULL, NULL, NULL, NULL, NULL, 194 cflags | KMC_NOTOUCH | KMC_NODEBUG); 195 } 196 } 197 198 while (--c != 0) { 199 ASSERT(zio_buf_cache[c] != NULL); 200 if (zio_buf_cache[c - 1] == NULL) 201 zio_buf_cache[c - 1] = zio_buf_cache[c]; 202 203 ASSERT(zio_data_buf_cache[c] != NULL); 204 if (zio_data_buf_cache[c - 1] == NULL) 205 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 206 } 207out: 208 209 /* 210 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs 211 * to fail 3 times per txg or 8 failures, whichever is greater. 212 */ 213 if (zfs_mg_alloc_failures == 0) 214 zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8); 215 else if (zfs_mg_alloc_failures < 8) 216 zfs_mg_alloc_failures = 8; 217 218 zio_inject_init(); 219 220 zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", 221 KSTAT_TYPE_NAMED, 222 sizeof(zio_trim_stats) / sizeof(kstat_named_t), 223 KSTAT_FLAG_VIRTUAL); 224 225 if (zio_trim_ksp != NULL) { 226 zio_trim_ksp->ks_data = &zio_trim_stats; 227 kstat_install(zio_trim_ksp); 228 } 229} 230 231void 232zio_fini(void) 233{ 234 size_t c; 235 kmem_cache_t *last_cache = NULL; 236 kmem_cache_t *last_data_cache = NULL; 237 238 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 239 if (zio_buf_cache[c] != last_cache) { 240 last_cache = zio_buf_cache[c]; 241 kmem_cache_destroy(zio_buf_cache[c]); 242 } 243 zio_buf_cache[c] = NULL; 244 245 if (zio_data_buf_cache[c] != last_data_cache) { 246 last_data_cache = zio_data_buf_cache[c]; 247 kmem_cache_destroy(zio_data_buf_cache[c]); 248 } 249 zio_data_buf_cache[c] = NULL; 250 } 251 252 kmem_cache_destroy(zio_link_cache); 253 kmem_cache_destroy(zio_cache); 254 255 zio_inject_fini(); 256 257 if (zio_trim_ksp != NULL) { 258 kstat_delete(zio_trim_ksp); 259 zio_trim_ksp = NULL; 260 } 261} 262 263/* 264 * ========================================================================== 265 * Allocate and free I/O buffers 266 * ========================================================================== 267 */ 268 269/* 270 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 271 * crashdump if the kernel panics, so use it judiciously. Obviously, it's 272 * useful to inspect ZFS metadata, but if possible, we should avoid keeping 273 * excess / transient data in-core during a crashdump. 274 */ 275void * 276zio_buf_alloc(size_t size) 277{ 278 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 279 int flags = zio_exclude_metadata ? KM_NODEBUG : 0; 280 281 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 282 283 if (zio_use_uma) 284 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 285 else 286 return (kmem_alloc(size, KM_SLEEP|flags)); 287} 288 289/* 290 * Use zio_data_buf_alloc to allocate data. The data will not appear in a 291 * crashdump if the kernel panics. This exists so that we will limit the amount 292 * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 293 * of kernel heap dumped to disk when the kernel panics) 294 */ 295void * 296zio_data_buf_alloc(size_t size) 297{ 298 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 299 300 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 301 302 if (zio_use_uma) 303 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 304 else 305 return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); 306} 307 308void 309zio_buf_free(void *buf, size_t size) 310{ 311 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 312 313 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 314 315 if (zio_use_uma) 316 kmem_cache_free(zio_buf_cache[c], buf); 317 else 318 kmem_free(buf, size); 319} 320 321void 322zio_data_buf_free(void *buf, size_t size) 323{ 324 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 325 326 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 327 328 if (zio_use_uma) 329 kmem_cache_free(zio_data_buf_cache[c], buf); 330 else 331 kmem_free(buf, size); 332} 333 334/* 335 * ========================================================================== 336 * Push and pop I/O transform buffers 337 * ========================================================================== 338 */ 339static void 340zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 341 zio_transform_func_t *transform) 342{ 343 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 344 345 zt->zt_orig_data = zio->io_data; 346 zt->zt_orig_size = zio->io_size; 347 zt->zt_bufsize = bufsize; 348 zt->zt_transform = transform; 349 350 zt->zt_next = zio->io_transform_stack; 351 zio->io_transform_stack = zt; 352 353 zio->io_data = data; 354 zio->io_size = size; 355} 356 357static void 358zio_pop_transforms(zio_t *zio) 359{ 360 zio_transform_t *zt; 361 362 while ((zt = zio->io_transform_stack) != NULL) { 363 if (zt->zt_transform != NULL) 364 zt->zt_transform(zio, 365 zt->zt_orig_data, zt->zt_orig_size); 366 367 if (zt->zt_bufsize != 0) 368 zio_buf_free(zio->io_data, zt->zt_bufsize); 369 370 zio->io_data = zt->zt_orig_data; 371 zio->io_size = zt->zt_orig_size; 372 zio->io_transform_stack = zt->zt_next; 373 374 kmem_free(zt, sizeof (zio_transform_t)); 375 } 376} 377 378/* 379 * ========================================================================== 380 * I/O transform callbacks for subblocks and decompression 381 * ========================================================================== 382 */ 383static void 384zio_subblock(zio_t *zio, void *data, uint64_t size) 385{ 386 ASSERT(zio->io_size > size); 387 388 if (zio->io_type == ZIO_TYPE_READ) 389 bcopy(zio->io_data, data, size); 390} 391 392static void 393zio_decompress(zio_t *zio, void *data, uint64_t size) 394{ 395 if (zio->io_error == 0 && 396 zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 397 zio->io_data, data, zio->io_size, size) != 0) 398 zio->io_error = SET_ERROR(EIO); 399} 400 401/* 402 * ========================================================================== 403 * I/O parent/child relationships and pipeline interlocks 404 * ========================================================================== 405 */ 406/* 407 * NOTE - Callers to zio_walk_parents() and zio_walk_children must 408 * continue calling these functions until they return NULL. 409 * Otherwise, the next caller will pick up the list walk in 410 * some indeterminate state. (Otherwise every caller would 411 * have to pass in a cookie to keep the state represented by 412 * io_walk_link, which gets annoying.) 413 */ 414zio_t * 415zio_walk_parents(zio_t *cio) 416{ 417 zio_link_t *zl = cio->io_walk_link; 418 list_t *pl = &cio->io_parent_list; 419 420 zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); 421 cio->io_walk_link = zl; 422 423 if (zl == NULL) 424 return (NULL); 425 426 ASSERT(zl->zl_child == cio); 427 return (zl->zl_parent); 428} 429 430zio_t * 431zio_walk_children(zio_t *pio) 432{ 433 zio_link_t *zl = pio->io_walk_link; 434 list_t *cl = &pio->io_child_list; 435 436 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); 437 pio->io_walk_link = zl; 438 439 if (zl == NULL) 440 return (NULL); 441 442 ASSERT(zl->zl_parent == pio); 443 return (zl->zl_child); 444} 445 446zio_t * 447zio_unique_parent(zio_t *cio) 448{ 449 zio_t *pio = zio_walk_parents(cio); 450 451 VERIFY(zio_walk_parents(cio) == NULL); 452 return (pio); 453} 454 455void 456zio_add_child(zio_t *pio, zio_t *cio) 457{ 458 zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 459 460 /* 461 * Logical I/Os can have logical, gang, or vdev children. 462 * Gang I/Os can have gang or vdev children. 463 * Vdev I/Os can only have vdev children. 464 * The following ASSERT captures all of these constraints. 465 */ 466 ASSERT(cio->io_child_type <= pio->io_child_type); 467 468 zl->zl_parent = pio; 469 zl->zl_child = cio; 470 471 mutex_enter(&cio->io_lock); 472 mutex_enter(&pio->io_lock); 473 474 ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 475 476 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 477 pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 478 479 list_insert_head(&pio->io_child_list, zl); 480 list_insert_head(&cio->io_parent_list, zl); 481 482 pio->io_child_count++; 483 cio->io_parent_count++; 484 485 mutex_exit(&pio->io_lock); 486 mutex_exit(&cio->io_lock); 487} 488 489static void 490zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) 491{ 492 ASSERT(zl->zl_parent == pio); 493 ASSERT(zl->zl_child == cio); 494 495 mutex_enter(&cio->io_lock); 496 mutex_enter(&pio->io_lock); 497 498 list_remove(&pio->io_child_list, zl); 499 list_remove(&cio->io_parent_list, zl); 500 501 pio->io_child_count--; 502 cio->io_parent_count--; 503 504 mutex_exit(&pio->io_lock); 505 mutex_exit(&cio->io_lock); 506 507 kmem_cache_free(zio_link_cache, zl); 508} 509 510static boolean_t 511zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) 512{ 513 uint64_t *countp = &zio->io_children[child][wait]; 514 boolean_t waiting = B_FALSE; 515 516 mutex_enter(&zio->io_lock); 517 ASSERT(zio->io_stall == NULL); 518 if (*countp != 0) { 519 zio->io_stage >>= 1; 520 zio->io_stall = countp; 521 waiting = B_TRUE; 522 } 523 mutex_exit(&zio->io_lock); 524 525 return (waiting); 526} 527 528static void 529zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 530{ 531 uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 532 int *errorp = &pio->io_child_error[zio->io_child_type]; 533 534 mutex_enter(&pio->io_lock); 535 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 536 *errorp = zio_worst_error(*errorp, zio->io_error); 537 pio->io_reexecute |= zio->io_reexecute; 538 ASSERT3U(*countp, >, 0); 539 540 (*countp)--; 541 542 if (*countp == 0 && pio->io_stall == countp) { 543 pio->io_stall = NULL; 544 mutex_exit(&pio->io_lock); 545 zio_execute(pio); 546 } else { 547 mutex_exit(&pio->io_lock); 548 } 549} 550 551static void 552zio_inherit_child_errors(zio_t *zio, enum zio_child c) 553{ 554 if (zio->io_child_error[c] != 0 && zio->io_error == 0) 555 zio->io_error = zio->io_child_error[c]; 556} 557 558/* 559 * ========================================================================== 560 * Create the various types of I/O (read, write, free, etc) 561 * ========================================================================== 562 */ 563static zio_t * 564zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 565 void *data, uint64_t size, zio_done_func_t *done, void *private, 566 zio_type_t type, zio_priority_t priority, enum zio_flag flags, 567 vdev_t *vd, uint64_t offset, const zbookmark_t *zb, 568 enum zio_stage stage, enum zio_stage pipeline) 569{ 570 zio_t *zio; 571 572 ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE); 573 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 574 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 575 576 ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 577 ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 578 ASSERT(vd || stage == ZIO_STAGE_OPEN); 579 580 zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 581 bzero(zio, sizeof (zio_t)); 582 583 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 584 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 585 586 list_create(&zio->io_parent_list, sizeof (zio_link_t), 587 offsetof(zio_link_t, zl_parent_node)); 588 list_create(&zio->io_child_list, sizeof (zio_link_t), 589 offsetof(zio_link_t, zl_child_node)); 590 591 if (vd != NULL) 592 zio->io_child_type = ZIO_CHILD_VDEV; 593 else if (flags & ZIO_FLAG_GANG_CHILD) 594 zio->io_child_type = ZIO_CHILD_GANG; 595 else if (flags & ZIO_FLAG_DDT_CHILD) 596 zio->io_child_type = ZIO_CHILD_DDT; 597 else 598 zio->io_child_type = ZIO_CHILD_LOGICAL; 599 600 if (bp != NULL) { 601 zio->io_bp = (blkptr_t *)bp; 602 zio->io_bp_copy = *bp; 603 zio->io_bp_orig = *bp; 604 if (type != ZIO_TYPE_WRITE || 605 zio->io_child_type == ZIO_CHILD_DDT) 606 zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 607 if (zio->io_child_type == ZIO_CHILD_LOGICAL) 608 zio->io_logical = zio; 609 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 610 pipeline |= ZIO_GANG_STAGES; 611 } 612 613 zio->io_spa = spa; 614 zio->io_txg = txg; 615 zio->io_done = done; 616 zio->io_private = private; 617 zio->io_type = type; 618 zio->io_priority = priority; 619 zio->io_vd = vd; 620 zio->io_offset = offset; 621 zio->io_orig_data = zio->io_data = data; 622 zio->io_orig_size = zio->io_size = size; 623 zio->io_orig_flags = zio->io_flags = flags; 624 zio->io_orig_stage = zio->io_stage = stage; 625 zio->io_orig_pipeline = zio->io_pipeline = pipeline; 626 627 zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 628 zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 629 630 if (zb != NULL) 631 zio->io_bookmark = *zb; 632 633 if (pio != NULL) { 634 if (zio->io_logical == NULL) 635 zio->io_logical = pio->io_logical; 636 if (zio->io_child_type == ZIO_CHILD_GANG) 637 zio->io_gang_leader = pio->io_gang_leader; 638 zio_add_child(pio, zio); 639 } 640 641 return (zio); 642} 643 644static void 645zio_destroy(zio_t *zio) 646{ 647 list_destroy(&zio->io_parent_list); 648 list_destroy(&zio->io_child_list); 649 mutex_destroy(&zio->io_lock); 650 cv_destroy(&zio->io_cv); 651 kmem_cache_free(zio_cache, zio); 652} 653 654zio_t * 655zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, 656 void *private, enum zio_flag flags) 657{ 658 zio_t *zio; 659 660 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 661 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 662 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 663 664 return (zio); 665} 666 667zio_t * 668zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) 669{ 670 return (zio_null(NULL, spa, NULL, done, private, flags)); 671} 672 673zio_t * 674zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 675 void *data, uint64_t size, zio_done_func_t *done, void *private, 676 zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb) 677{ 678 zio_t *zio; 679 680 zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 681 data, size, done, private, 682 ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 683 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 684 ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 685 686 return (zio); 687} 688 689zio_t * 690zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 691 void *data, uint64_t size, const zio_prop_t *zp, 692 zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, 693 void *private, 694 zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb) 695{ 696 zio_t *zio; 697 698 ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 699 zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 700 zp->zp_compress >= ZIO_COMPRESS_OFF && 701 zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 702 DMU_OT_IS_VALID(zp->zp_type) && 703 zp->zp_level < 32 && 704 zp->zp_copies > 0 && 705 zp->zp_copies <= spa_max_replication(spa)); 706 707 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 708 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 709 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 710 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 711 712 zio->io_ready = ready; 713 zio->io_physdone = physdone; 714 zio->io_prop = *zp; 715 716 return (zio); 717} 718 719zio_t * 720zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, 721 uint64_t size, zio_done_func_t *done, void *private, 722 zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb) 723{ 724 zio_t *zio; 725 726 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 727 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 728 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 729 730 return (zio); 731} 732 733void 734zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) 735{ 736 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 737 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 738 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 739 ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 740 741 /* 742 * We must reset the io_prop to match the values that existed 743 * when the bp was first written by dmu_sync() keeping in mind 744 * that nopwrite and dedup are mutually exclusive. 745 */ 746 zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; 747 zio->io_prop.zp_nopwrite = nopwrite; 748 zio->io_prop.zp_copies = copies; 749 zio->io_bp_override = bp; 750} 751 752void 753zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) 754{ 755 metaslab_check_free(spa, bp); 756 757 /* 758 * Frees that are for the currently-syncing txg, are not going to be 759 * deferred, and which will not need to do a read (i.e. not GANG or 760 * DEDUP), can be processed immediately. Otherwise, put them on the 761 * in-memory list for later processing. 762 */ 763 if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || 764 txg != spa->spa_syncing_txg || 765 spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { 766 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); 767 } else { 768 VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 769 BP_GET_PSIZE(bp), 0))); 770 } 771} 772 773zio_t * 774zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 775 uint64_t size, enum zio_flag flags) 776{ 777 zio_t *zio; 778 enum zio_stage stage = ZIO_FREE_PIPELINE; 779 780 dprintf_bp(bp, "freeing in txg %llu, pass %u", 781 (longlong_t)txg, spa->spa_sync_pass); 782 783 ASSERT(!BP_IS_HOLE(bp)); 784 ASSERT(spa_syncing_txg(spa) == txg); 785 ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); 786 787 metaslab_check_free(spa, bp); 788 arc_freed(spa, bp); 789 790 if (zfs_trim_enabled) 791 stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | 792 ZIO_STAGE_VDEV_IO_ASSESS; 793 /* 794 * GANG and DEDUP blocks can induce a read (for the gang block header, 795 * or the DDT), so issue them asynchronously so that this thread is 796 * not tied up. 797 */ 798 else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) 799 stage |= ZIO_STAGE_ISSUE_ASYNC; 800 801 zio = zio_create(pio, spa, txg, bp, NULL, size, 802 NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, 803 NULL, 0, NULL, ZIO_STAGE_OPEN, stage); 804 805 return (zio); 806} 807 808zio_t * 809zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 810 zio_done_func_t *done, void *private, enum zio_flag flags) 811{ 812 zio_t *zio; 813 814 /* 815 * A claim is an allocation of a specific block. Claims are needed 816 * to support immediate writes in the intent log. The issue is that 817 * immediate writes contain committed data, but in a txg that was 818 * *not* committed. Upon opening the pool after an unclean shutdown, 819 * the intent log claims all blocks that contain immediate write data 820 * so that the SPA knows they're in use. 821 * 822 * All claims *must* be resolved in the first txg -- before the SPA 823 * starts allocating blocks -- so that nothing is allocated twice. 824 * If txg == 0 we just verify that the block is claimable. 825 */ 826 ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 827 ASSERT(txg == spa_first_txg(spa) || txg == 0); 828 ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ 829 830 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 831 done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 832 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 833 834 return (zio); 835} 836 837zio_t * 838zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, 839 uint64_t size, zio_done_func_t *done, void *private, 840 enum zio_flag flags) 841{ 842 zio_t *zio; 843 int c; 844 845 if (vd->vdev_children == 0) { 846 zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private, 847 ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, offset, NULL, 848 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 849 850 zio->io_cmd = cmd; 851 } else { 852 zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 853 854 for (c = 0; c < vd->vdev_children; c++) 855 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 856 offset, size, done, private, flags)); 857 } 858 859 return (zio); 860} 861 862zio_t * 863zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 864 void *data, int checksum, zio_done_func_t *done, void *private, 865 zio_priority_t priority, enum zio_flag flags, boolean_t labels) 866{ 867 zio_t *zio; 868 869 ASSERT(vd->vdev_children == 0); 870 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 871 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 872 ASSERT3U(offset + size, <=, vd->vdev_psize); 873 874 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 875 ZIO_TYPE_READ, priority, flags, vd, offset, NULL, 876 ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 877 878 zio->io_prop.zp_checksum = checksum; 879 880 return (zio); 881} 882 883zio_t * 884zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 885 void *data, int checksum, zio_done_func_t *done, void *private, 886 zio_priority_t priority, enum zio_flag flags, boolean_t labels) 887{ 888 zio_t *zio; 889 890 ASSERT(vd->vdev_children == 0); 891 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 892 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 893 ASSERT3U(offset + size, <=, vd->vdev_psize); 894 895 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 896 ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, 897 ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 898 899 zio->io_prop.zp_checksum = checksum; 900 901 if (zio_checksum_table[checksum].ci_eck) { 902 /* 903 * zec checksums are necessarily destructive -- they modify 904 * the end of the write buffer to hold the verifier/checksum. 905 * Therefore, we must make a local copy in case the data is 906 * being written to multiple places in parallel. 907 */ 908 void *wbuf = zio_buf_alloc(size); 909 bcopy(data, wbuf, size); 910 zio_push_transform(zio, wbuf, size, size, NULL); 911 } 912 913 return (zio); 914} 915 916/* 917 * Create a child I/O to do some work for us. 918 */ 919zio_t * 920zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 921 void *data, uint64_t size, int type, zio_priority_t priority, 922 enum zio_flag flags, zio_done_func_t *done, void *private) 923{ 924 enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 925 zio_t *zio; 926 927 ASSERT(vd->vdev_parent == 928 (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); 929 930 if (type == ZIO_TYPE_READ && bp != NULL) { 931 /* 932 * If we have the bp, then the child should perform the 933 * checksum and the parent need not. This pushes error 934 * detection as close to the leaves as possible and 935 * eliminates redundant checksums in the interior nodes. 936 */ 937 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 938 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 939 } 940 941 if (vd->vdev_children == 0) 942 offset += VDEV_LABEL_START_SIZE; 943 944 flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; 945 946 /* 947 * If we've decided to do a repair, the write is not speculative -- 948 * even if the original read was. 949 */ 950 if (flags & ZIO_FLAG_IO_REPAIR) 951 flags &= ~ZIO_FLAG_SPECULATIVE; 952 953 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, 954 done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 955 ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 956 957 zio->io_physdone = pio->io_physdone; 958 if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) 959 zio->io_logical->io_phys_children++; 960 961 return (zio); 962} 963 964zio_t * 965zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, 966 int type, zio_priority_t priority, enum zio_flag flags, 967 zio_done_func_t *done, void *private) 968{ 969 zio_t *zio; 970 971 ASSERT(vd->vdev_ops->vdev_op_leaf); 972 973 zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 974 data, size, done, private, type, priority, 975 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, 976 vd, offset, NULL, 977 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 978 979 return (zio); 980} 981 982void 983zio_flush(zio_t *zio, vdev_t *vd) 984{ 985 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, 986 NULL, NULL, 987 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 988} 989 990zio_t * 991zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) 992{ 993 994 ASSERT(vd->vdev_ops->vdev_op_leaf); 995 996 return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size, 997 NULL, NULL, 998 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); 999} 1000 1001void 1002zio_shrink(zio_t *zio, uint64_t size) 1003{ 1004 ASSERT(zio->io_executor == NULL); 1005 ASSERT(zio->io_orig_size == zio->io_size); 1006 ASSERT(size <= zio->io_size); 1007 1008 /* 1009 * We don't shrink for raidz because of problems with the 1010 * reconstruction when reading back less than the block size. 1011 * Note, BP_IS_RAIDZ() assumes no compression. 1012 */ 1013 ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 1014 if (!BP_IS_RAIDZ(zio->io_bp)) 1015 zio->io_orig_size = zio->io_size = size; 1016} 1017 1018/* 1019 * ========================================================================== 1020 * Prepare to read and write logical blocks 1021 * ========================================================================== 1022 */ 1023 1024static int 1025zio_read_bp_init(zio_t *zio) 1026{ 1027 blkptr_t *bp = zio->io_bp; 1028 1029 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 1030 zio->io_child_type == ZIO_CHILD_LOGICAL && 1031 !(zio->io_flags & ZIO_FLAG_RAW)) { 1032 uint64_t psize = BP_GET_PSIZE(bp); 1033 void *cbuf = zio_buf_alloc(psize); 1034 1035 zio_push_transform(zio, cbuf, psize, psize, zio_decompress); 1036 } 1037 1038 if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) 1039 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1040 1041 if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 1042 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1043 1044 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 1045 zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 1046 1047 return (ZIO_PIPELINE_CONTINUE); 1048} 1049 1050static int 1051zio_write_bp_init(zio_t *zio) 1052{ 1053 spa_t *spa = zio->io_spa; 1054 zio_prop_t *zp = &zio->io_prop; 1055 enum zio_compress compress = zp->zp_compress; 1056 blkptr_t *bp = zio->io_bp; 1057 uint64_t lsize = zio->io_size; 1058 uint64_t psize = lsize; 1059 int pass = 1; 1060 1061 /* 1062 * If our children haven't all reached the ready stage, 1063 * wait for them and then repeat this pipeline stage. 1064 */ 1065 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 1066 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) 1067 return (ZIO_PIPELINE_STOP); 1068 1069 if (!IO_IS_ALLOCATING(zio)) 1070 return (ZIO_PIPELINE_CONTINUE); 1071 1072 ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 1073 1074 if (zio->io_bp_override) { 1075 ASSERT(bp->blk_birth != zio->io_txg); 1076 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 1077 1078 *bp = *zio->io_bp_override; 1079 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1080 1081 /* 1082 * If we've been overridden and nopwrite is set then 1083 * set the flag accordingly to indicate that a nopwrite 1084 * has already occurred. 1085 */ 1086 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { 1087 ASSERT(!zp->zp_dedup); 1088 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1089 return (ZIO_PIPELINE_CONTINUE); 1090 } 1091 1092 ASSERT(!zp->zp_nopwrite); 1093 1094 if (BP_IS_HOLE(bp) || !zp->zp_dedup) 1095 return (ZIO_PIPELINE_CONTINUE); 1096 1097 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || 1098 zp->zp_dedup_verify); 1099 1100 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 1101 BP_SET_DEDUP(bp, 1); 1102 zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; 1103 return (ZIO_PIPELINE_CONTINUE); 1104 } 1105 zio->io_bp_override = NULL; 1106 BP_ZERO(bp); 1107 } 1108 1109 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { 1110 /* 1111 * We're rewriting an existing block, which means we're 1112 * working on behalf of spa_sync(). For spa_sync() to 1113 * converge, it must eventually be the case that we don't 1114 * have to allocate new blocks. But compression changes 1115 * the blocksize, which forces a reallocate, and makes 1116 * convergence take longer. Therefore, after the first 1117 * few passes, stop compressing to ensure convergence. 1118 */ 1119 pass = spa_sync_pass(spa); 1120 1121 ASSERT(zio->io_txg == spa_syncing_txg(spa)); 1122 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1123 ASSERT(!BP_GET_DEDUP(bp)); 1124 1125 if (pass >= zfs_sync_pass_dont_compress) 1126 compress = ZIO_COMPRESS_OFF; 1127 1128 /* Make sure someone doesn't change their mind on overwrites */ 1129 ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp), 1130 spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 1131 } 1132 1133 if (compress != ZIO_COMPRESS_OFF) { 1134 metaslab_class_t *mc = spa_normal_class(spa); 1135 void *cbuf = zio_buf_alloc(lsize); 1136 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize, 1137 (size_t)metaslab_class_get_minblocksize(mc)); 1138 if (psize == 0 || psize == lsize) { 1139 compress = ZIO_COMPRESS_OFF; 1140 zio_buf_free(cbuf, lsize); 1141 } else { 1142 ASSERT(psize < lsize); 1143 zio_push_transform(zio, cbuf, psize, lsize, NULL); 1144 } 1145 } 1146 1147 /* 1148 * The final pass of spa_sync() must be all rewrites, but the first 1149 * few passes offer a trade-off: allocating blocks defers convergence, 1150 * but newly allocated blocks are sequential, so they can be written 1151 * to disk faster. Therefore, we allow the first few passes of 1152 * spa_sync() to allocate new blocks, but force rewrites after that. 1153 * There should only be a handful of blocks after pass 1 in any case. 1154 */ 1155 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && 1156 BP_GET_PSIZE(bp) == psize && 1157 pass >= zfs_sync_pass_rewrite) { 1158 ASSERT(psize != 0); 1159 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 1160 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; 1161 zio->io_flags |= ZIO_FLAG_IO_REWRITE; 1162 } else { 1163 BP_ZERO(bp); 1164 zio->io_pipeline = ZIO_WRITE_PIPELINE; 1165 } 1166 1167 if (psize == 0) { 1168 if (zio->io_bp_orig.blk_birth != 0 && 1169 spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { 1170 BP_SET_LSIZE(bp, lsize); 1171 BP_SET_TYPE(bp, zp->zp_type); 1172 BP_SET_LEVEL(bp, zp->zp_level); 1173 BP_SET_BIRTH(bp, zio->io_txg, 0); 1174 } 1175 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1176 } else { 1177 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 1178 BP_SET_LSIZE(bp, lsize); 1179 BP_SET_TYPE(bp, zp->zp_type); 1180 BP_SET_LEVEL(bp, zp->zp_level); 1181 BP_SET_PSIZE(bp, psize); 1182 BP_SET_COMPRESS(bp, compress); 1183 BP_SET_CHECKSUM(bp, zp->zp_checksum); 1184 BP_SET_DEDUP(bp, zp->zp_dedup); 1185 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1186 if (zp->zp_dedup) { 1187 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1188 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1189 zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 1190 } 1191 if (zp->zp_nopwrite) { 1192 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1193 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1194 zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; 1195 } 1196 } 1197 1198 return (ZIO_PIPELINE_CONTINUE); 1199} 1200 1201static int 1202zio_free_bp_init(zio_t *zio) 1203{ 1204 blkptr_t *bp = zio->io_bp; 1205 1206 if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 1207 if (BP_GET_DEDUP(bp)) 1208 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 1209 } 1210 1211 return (ZIO_PIPELINE_CONTINUE); 1212} 1213 1214/* 1215 * ========================================================================== 1216 * Execute the I/O pipeline 1217 * ========================================================================== 1218 */ 1219 1220static void 1221zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) 1222{ 1223 spa_t *spa = zio->io_spa; 1224 zio_type_t t = zio->io_type; 1225 int flags = (cutinline ? TQ_FRONT : 0); 1226 1227 ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT); 1228 1229 /* 1230 * If we're a config writer or a probe, the normal issue and 1231 * interrupt threads may all be blocked waiting for the config lock. 1232 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 1233 */ 1234 if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 1235 t = ZIO_TYPE_NULL; 1236 1237 /* 1238 * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 1239 */ 1240 if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 1241 t = ZIO_TYPE_NULL; 1242 1243 /* 1244 * If this is a high priority I/O, then use the high priority taskq if 1245 * available. 1246 */ 1247 if (zio->io_priority == ZIO_PRIORITY_NOW && 1248 spa->spa_zio_taskq[t][q + 1].stqs_count != 0) 1249 q++; 1250 1251 ASSERT3U(q, <, ZIO_TASKQ_TYPES); 1252 1253 /* 1254 * NB: We are assuming that the zio can only be dispatched 1255 * to a single taskq at a time. It would be a grievous error 1256 * to dispatch the zio to another taskq at the same time. 1257 */ 1258#if defined(illumos) || !defined(_KERNEL) 1259 ASSERT(zio->io_tqent.tqent_next == NULL); 1260#else 1261 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 1262#endif 1263 spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, 1264 flags, &zio->io_tqent); 1265} 1266 1267static boolean_t 1268zio_taskq_member(zio_t *zio, zio_taskq_type_t q) 1269{ 1270 kthread_t *executor = zio->io_executor; 1271 spa_t *spa = zio->io_spa; 1272 1273 for (zio_type_t t = 0; t < ZIO_TYPES; t++) { 1274 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1275 uint_t i; 1276 for (i = 0; i < tqs->stqs_count; i++) { 1277 if (taskq_member(tqs->stqs_taskq[i], executor)) 1278 return (B_TRUE); 1279 } 1280 } 1281 1282 return (B_FALSE); 1283} 1284 1285static int 1286zio_issue_async(zio_t *zio) 1287{ 1288 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1289 1290 return (ZIO_PIPELINE_STOP); 1291} 1292 1293void 1294zio_interrupt(zio_t *zio) 1295{ 1296 zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); 1297} 1298 1299/* 1300 * Execute the I/O pipeline until one of the following occurs: 1301 * 1302 * (1) the I/O completes 1303 * (2) the pipeline stalls waiting for dependent child I/Os 1304 * (3) the I/O issues, so we're waiting for an I/O completion interrupt 1305 * (4) the I/O is delegated by vdev-level caching or aggregation 1306 * (5) the I/O is deferred due to vdev-level queueing 1307 * (6) the I/O is handed off to another thread. 1308 * 1309 * In all cases, the pipeline stops whenever there's no CPU work; it never 1310 * burns a thread in cv_wait(). 1311 * 1312 * There's no locking on io_stage because there's no legitimate way 1313 * for multiple threads to be attempting to process the same I/O. 1314 */ 1315static zio_pipe_stage_t *zio_pipeline[]; 1316 1317void 1318zio_execute(zio_t *zio) 1319{ 1320 zio->io_executor = curthread; 1321 1322 while (zio->io_stage < ZIO_STAGE_DONE) { 1323 enum zio_stage pipeline = zio->io_pipeline; 1324 enum zio_stage stage = zio->io_stage; 1325 int rv; 1326 1327 ASSERT(!MUTEX_HELD(&zio->io_lock)); 1328 ASSERT(ISP2(stage)); 1329 ASSERT(zio->io_stall == NULL); 1330 1331 do { 1332 stage <<= 1; 1333 } while ((stage & pipeline) == 0); 1334 1335 ASSERT(stage <= ZIO_STAGE_DONE); 1336 1337 /* 1338 * If we are in interrupt context and this pipeline stage 1339 * will grab a config lock that is held across I/O, 1340 * or may wait for an I/O that needs an interrupt thread 1341 * to complete, issue async to avoid deadlock. 1342 * 1343 * For VDEV_IO_START, we cut in line so that the io will 1344 * be sent to disk promptly. 1345 */ 1346 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 1347 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 1348 boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 1349 zio_requeue_io_start_cut_in_line : B_FALSE; 1350 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 1351 return; 1352 } 1353 1354 zio->io_stage = stage; 1355 rv = zio_pipeline[highbit(stage) - 1](zio); 1356 1357 if (rv == ZIO_PIPELINE_STOP) 1358 return; 1359 1360 ASSERT(rv == ZIO_PIPELINE_CONTINUE); 1361 } 1362} 1363 1364/* 1365 * ========================================================================== 1366 * Initiate I/O, either sync or async 1367 * ========================================================================== 1368 */ 1369int 1370zio_wait(zio_t *zio) 1371{ 1372 int error; 1373 1374 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 1375 ASSERT(zio->io_executor == NULL); 1376 1377 zio->io_waiter = curthread; 1378 1379 zio_execute(zio); 1380 1381 mutex_enter(&zio->io_lock); 1382 while (zio->io_executor != NULL) 1383 cv_wait(&zio->io_cv, &zio->io_lock); 1384 mutex_exit(&zio->io_lock); 1385 1386 error = zio->io_error; 1387 zio_destroy(zio); 1388 1389 return (error); 1390} 1391 1392void 1393zio_nowait(zio_t *zio) 1394{ 1395 ASSERT(zio->io_executor == NULL); 1396 1397 if (zio->io_child_type == ZIO_CHILD_LOGICAL && 1398 zio_unique_parent(zio) == NULL) { 1399 /* 1400 * This is a logical async I/O with no parent to wait for it. 1401 * We add it to the spa_async_root_zio "Godfather" I/O which 1402 * will ensure they complete prior to unloading the pool. 1403 */ 1404 spa_t *spa = zio->io_spa; 1405 1406 zio_add_child(spa->spa_async_zio_root, zio); 1407 } 1408 1409 zio_execute(zio); 1410} 1411 1412/* 1413 * ========================================================================== 1414 * Reexecute or suspend/resume failed I/O 1415 * ========================================================================== 1416 */ 1417 1418static void 1419zio_reexecute(zio_t *pio) 1420{ 1421 zio_t *cio, *cio_next; 1422 1423 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 1424 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 1425 ASSERT(pio->io_gang_leader == NULL); 1426 ASSERT(pio->io_gang_tree == NULL); 1427 1428 pio->io_flags = pio->io_orig_flags; 1429 pio->io_stage = pio->io_orig_stage; 1430 pio->io_pipeline = pio->io_orig_pipeline; 1431 pio->io_reexecute = 0; 1432 pio->io_flags |= ZIO_FLAG_REEXECUTED; 1433 pio->io_error = 0; 1434 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1435 pio->io_state[w] = 0; 1436 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 1437 pio->io_child_error[c] = 0; 1438 1439 if (IO_IS_ALLOCATING(pio)) 1440 BP_ZERO(pio->io_bp); 1441 1442 /* 1443 * As we reexecute pio's children, new children could be created. 1444 * New children go to the head of pio's io_child_list, however, 1445 * so we will (correctly) not reexecute them. The key is that 1446 * the remainder of pio's io_child_list, from 'cio_next' onward, 1447 * cannot be affected by any side effects of reexecuting 'cio'. 1448 */ 1449 for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { 1450 cio_next = zio_walk_children(pio); 1451 mutex_enter(&pio->io_lock); 1452 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1453 pio->io_children[cio->io_child_type][w]++; 1454 mutex_exit(&pio->io_lock); 1455 zio_reexecute(cio); 1456 } 1457 1458 /* 1459 * Now that all children have been reexecuted, execute the parent. 1460 * We don't reexecute "The Godfather" I/O here as it's the 1461 * responsibility of the caller to wait on him. 1462 */ 1463 if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) 1464 zio_execute(pio); 1465} 1466 1467void 1468zio_suspend(spa_t *spa, zio_t *zio) 1469{ 1470 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 1471 fm_panic("Pool '%s' has encountered an uncorrectable I/O " 1472 "failure and the failure mode property for this pool " 1473 "is set to panic.", spa_name(spa)); 1474 1475 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 1476 1477 mutex_enter(&spa->spa_suspend_lock); 1478 1479 if (spa->spa_suspend_zio_root == NULL) 1480 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 1481 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 1482 ZIO_FLAG_GODFATHER); 1483 1484 spa->spa_suspended = B_TRUE; 1485 1486 if (zio != NULL) { 1487 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 1488 ASSERT(zio != spa->spa_suspend_zio_root); 1489 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1490 ASSERT(zio_unique_parent(zio) == NULL); 1491 ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1492 zio_add_child(spa->spa_suspend_zio_root, zio); 1493 } 1494 1495 mutex_exit(&spa->spa_suspend_lock); 1496} 1497 1498int 1499zio_resume(spa_t *spa) 1500{ 1501 zio_t *pio; 1502 1503 /* 1504 * Reexecute all previously suspended i/o. 1505 */ 1506 mutex_enter(&spa->spa_suspend_lock); 1507 spa->spa_suspended = B_FALSE; 1508 cv_broadcast(&spa->spa_suspend_cv); 1509 pio = spa->spa_suspend_zio_root; 1510 spa->spa_suspend_zio_root = NULL; 1511 mutex_exit(&spa->spa_suspend_lock); 1512 1513 if (pio == NULL) 1514 return (0); 1515 1516 zio_reexecute(pio); 1517 return (zio_wait(pio)); 1518} 1519 1520void 1521zio_resume_wait(spa_t *spa) 1522{ 1523 mutex_enter(&spa->spa_suspend_lock); 1524 while (spa_suspended(spa)) 1525 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 1526 mutex_exit(&spa->spa_suspend_lock); 1527} 1528 1529/* 1530 * ========================================================================== 1531 * Gang blocks. 1532 * 1533 * A gang block is a collection of small blocks that looks to the DMU 1534 * like one large block. When zio_dva_allocate() cannot find a block 1535 * of the requested size, due to either severe fragmentation or the pool 1536 * being nearly full, it calls zio_write_gang_block() to construct the 1537 * block from smaller fragments. 1538 * 1539 * A gang block consists of a gang header (zio_gbh_phys_t) and up to 1540 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like 1541 * an indirect block: it's an array of block pointers. It consumes 1542 * only one sector and hence is allocatable regardless of fragmentation. 1543 * The gang header's bps point to its gang members, which hold the data. 1544 * 1545 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 1546 * as the verifier to ensure uniqueness of the SHA256 checksum. 1547 * Critically, the gang block bp's blk_cksum is the checksum of the data, 1548 * not the gang header. This ensures that data block signatures (needed for 1549 * deduplication) are independent of how the block is physically stored. 1550 * 1551 * Gang blocks can be nested: a gang member may itself be a gang block. 1552 * Thus every gang block is a tree in which root and all interior nodes are 1553 * gang headers, and the leaves are normal blocks that contain user data. 1554 * The root of the gang tree is called the gang leader. 1555 * 1556 * To perform any operation (read, rewrite, free, claim) on a gang block, 1557 * zio_gang_assemble() first assembles the gang tree (minus data leaves) 1558 * in the io_gang_tree field of the original logical i/o by recursively 1559 * reading the gang leader and all gang headers below it. This yields 1560 * an in-core tree containing the contents of every gang header and the 1561 * bps for every constituent of the gang block. 1562 * 1563 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 1564 * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 1565 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 1566 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 1567 * zio_read_gang() is a wrapper around zio_read() that omits reading gang 1568 * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 1569 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 1570 * of the gang header plus zio_checksum_compute() of the data to update the 1571 * gang header's blk_cksum as described above. 1572 * 1573 * The two-phase assemble/issue model solves the problem of partial failure -- 1574 * what if you'd freed part of a gang block but then couldn't read the 1575 * gang header for another part? Assembling the entire gang tree first 1576 * ensures that all the necessary gang header I/O has succeeded before 1577 * starting the actual work of free, claim, or write. Once the gang tree 1578 * is assembled, free and claim are in-memory operations that cannot fail. 1579 * 1580 * In the event that a gang write fails, zio_dva_unallocate() walks the 1581 * gang tree to immediately free (i.e. insert back into the space map) 1582 * everything we've allocated. This ensures that we don't get ENOSPC 1583 * errors during repeated suspend/resume cycles due to a flaky device. 1584 * 1585 * Gang rewrites only happen during sync-to-convergence. If we can't assemble 1586 * the gang tree, we won't modify the block, so we can safely defer the free 1587 * (knowing that the block is still intact). If we *can* assemble the gang 1588 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 1589 * each constituent bp and we can allocate a new block on the next sync pass. 1590 * 1591 * In all cases, the gang tree allows complete recovery from partial failure. 1592 * ========================================================================== 1593 */ 1594 1595static zio_t * 1596zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1597{ 1598 if (gn != NULL) 1599 return (pio); 1600 1601 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 1602 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1603 &pio->io_bookmark)); 1604} 1605 1606zio_t * 1607zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1608{ 1609 zio_t *zio; 1610 1611 if (gn != NULL) { 1612 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1613 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 1614 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1615 /* 1616 * As we rewrite each gang header, the pipeline will compute 1617 * a new gang block header checksum for it; but no one will 1618 * compute a new data checksum, so we do that here. The one 1619 * exception is the gang leader: the pipeline already computed 1620 * its data checksum because that stage precedes gang assembly. 1621 * (Presently, nothing actually uses interior data checksums; 1622 * this is just good hygiene.) 1623 */ 1624 if (gn != pio->io_gang_leader->io_gang_tree) { 1625 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 1626 data, BP_GET_PSIZE(bp)); 1627 } 1628 /* 1629 * If we are here to damage data for testing purposes, 1630 * leave the GBH alone so that we can detect the damage. 1631 */ 1632 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 1633 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 1634 } else { 1635 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1636 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 1637 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1638 } 1639 1640 return (zio); 1641} 1642 1643/* ARGSUSED */ 1644zio_t * 1645zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1646{ 1647 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 1648 BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), 1649 ZIO_GANG_CHILD_FLAGS(pio))); 1650} 1651 1652/* ARGSUSED */ 1653zio_t * 1654zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1655{ 1656 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 1657 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 1658} 1659 1660static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 1661 NULL, 1662 zio_read_gang, 1663 zio_rewrite_gang, 1664 zio_free_gang, 1665 zio_claim_gang, 1666 NULL 1667}; 1668 1669static void zio_gang_tree_assemble_done(zio_t *zio); 1670 1671static zio_gang_node_t * 1672zio_gang_node_alloc(zio_gang_node_t **gnpp) 1673{ 1674 zio_gang_node_t *gn; 1675 1676 ASSERT(*gnpp == NULL); 1677 1678 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 1679 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 1680 *gnpp = gn; 1681 1682 return (gn); 1683} 1684 1685static void 1686zio_gang_node_free(zio_gang_node_t **gnpp) 1687{ 1688 zio_gang_node_t *gn = *gnpp; 1689 1690 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1691 ASSERT(gn->gn_child[g] == NULL); 1692 1693 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 1694 kmem_free(gn, sizeof (*gn)); 1695 *gnpp = NULL; 1696} 1697 1698static void 1699zio_gang_tree_free(zio_gang_node_t **gnpp) 1700{ 1701 zio_gang_node_t *gn = *gnpp; 1702 1703 if (gn == NULL) 1704 return; 1705 1706 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1707 zio_gang_tree_free(&gn->gn_child[g]); 1708 1709 zio_gang_node_free(gnpp); 1710} 1711 1712static void 1713zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 1714{ 1715 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 1716 1717 ASSERT(gio->io_gang_leader == gio); 1718 ASSERT(BP_IS_GANG(bp)); 1719 1720 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 1721 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 1722 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 1723} 1724 1725static void 1726zio_gang_tree_assemble_done(zio_t *zio) 1727{ 1728 zio_t *gio = zio->io_gang_leader; 1729 zio_gang_node_t *gn = zio->io_private; 1730 blkptr_t *bp = zio->io_bp; 1731 1732 ASSERT(gio == zio_unique_parent(zio)); 1733 ASSERT(zio->io_child_count == 0); 1734 1735 if (zio->io_error) 1736 return; 1737 1738 if (BP_SHOULD_BYTESWAP(bp)) 1739 byteswap_uint64_array(zio->io_data, zio->io_size); 1740 1741 ASSERT(zio->io_data == gn->gn_gbh); 1742 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1743 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1744 1745 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1746 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1747 if (!BP_IS_GANG(gbp)) 1748 continue; 1749 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 1750 } 1751} 1752 1753static void 1754zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1755{ 1756 zio_t *gio = pio->io_gang_leader; 1757 zio_t *zio; 1758 1759 ASSERT(BP_IS_GANG(bp) == !!gn); 1760 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 1761 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 1762 1763 /* 1764 * If you're a gang header, your data is in gn->gn_gbh. 1765 * If you're a gang member, your data is in 'data' and gn == NULL. 1766 */ 1767 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1768 1769 if (gn != NULL) { 1770 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1771 1772 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1773 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1774 if (BP_IS_HOLE(gbp)) 1775 continue; 1776 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 1777 data = (char *)data + BP_GET_PSIZE(gbp); 1778 } 1779 } 1780 1781 if (gn == gio->io_gang_tree && gio->io_data != NULL) 1782 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 1783 1784 if (zio != pio) 1785 zio_nowait(zio); 1786} 1787 1788static int 1789zio_gang_assemble(zio_t *zio) 1790{ 1791 blkptr_t *bp = zio->io_bp; 1792 1793 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 1794 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1795 1796 zio->io_gang_leader = zio; 1797 1798 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1799 1800 return (ZIO_PIPELINE_CONTINUE); 1801} 1802 1803static int 1804zio_gang_issue(zio_t *zio) 1805{ 1806 blkptr_t *bp = zio->io_bp; 1807 1808 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 1809 return (ZIO_PIPELINE_STOP); 1810 1811 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 1812 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1813 1814 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 1815 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 1816 else 1817 zio_gang_tree_free(&zio->io_gang_tree); 1818 1819 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1820 1821 return (ZIO_PIPELINE_CONTINUE); 1822} 1823 1824static void 1825zio_write_gang_member_ready(zio_t *zio) 1826{ 1827 zio_t *pio = zio_unique_parent(zio); 1828 zio_t *gio = zio->io_gang_leader; 1829 dva_t *cdva = zio->io_bp->blk_dva; 1830 dva_t *pdva = pio->io_bp->blk_dva; 1831 uint64_t asize; 1832 1833 if (BP_IS_HOLE(zio->io_bp)) 1834 return; 1835 1836 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1837 1838 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 1839 ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 1840 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 1841 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 1842 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 1843 1844 mutex_enter(&pio->io_lock); 1845 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 1846 ASSERT(DVA_GET_GANG(&pdva[d])); 1847 asize = DVA_GET_ASIZE(&pdva[d]); 1848 asize += DVA_GET_ASIZE(&cdva[d]); 1849 DVA_SET_ASIZE(&pdva[d], asize); 1850 } 1851 mutex_exit(&pio->io_lock); 1852} 1853 1854static int 1855zio_write_gang_block(zio_t *pio) 1856{ 1857 spa_t *spa = pio->io_spa; 1858 blkptr_t *bp = pio->io_bp; 1859 zio_t *gio = pio->io_gang_leader; 1860 zio_t *zio; 1861 zio_gang_node_t *gn, **gnpp; 1862 zio_gbh_phys_t *gbh; 1863 uint64_t txg = pio->io_txg; 1864 uint64_t resid = pio->io_size; 1865 uint64_t lsize; 1866 int copies = gio->io_prop.zp_copies; 1867 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 1868 zio_prop_t zp; 1869 int error; 1870 1871 error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 1872 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 1873 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 1874 if (error) { 1875 pio->io_error = error; 1876 return (ZIO_PIPELINE_CONTINUE); 1877 } 1878 1879 if (pio == gio) { 1880 gnpp = &gio->io_gang_tree; 1881 } else { 1882 gnpp = pio->io_private; 1883 ASSERT(pio->io_ready == zio_write_gang_member_ready); 1884 } 1885 1886 gn = zio_gang_node_alloc(gnpp); 1887 gbh = gn->gn_gbh; 1888 bzero(gbh, SPA_GANGBLOCKSIZE); 1889 1890 /* 1891 * Create the gang header. 1892 */ 1893 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 1894 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1895 1896 /* 1897 * Create and nowait the gang children. 1898 */ 1899 for (int g = 0; resid != 0; resid -= lsize, g++) { 1900 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 1901 SPA_MINBLOCKSIZE); 1902 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 1903 1904 zp.zp_checksum = gio->io_prop.zp_checksum; 1905 zp.zp_compress = ZIO_COMPRESS_OFF; 1906 zp.zp_type = DMU_OT_NONE; 1907 zp.zp_level = 0; 1908 zp.zp_copies = gio->io_prop.zp_copies; 1909 zp.zp_dedup = B_FALSE; 1910 zp.zp_dedup_verify = B_FALSE; 1911 zp.zp_nopwrite = B_FALSE; 1912 1913 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 1914 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 1915 zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g], 1916 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1917 &pio->io_bookmark)); 1918 } 1919 1920 /* 1921 * Set pio's pipeline to just wait for zio to finish. 1922 */ 1923 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1924 1925 zio_nowait(zio); 1926 1927 return (ZIO_PIPELINE_CONTINUE); 1928} 1929 1930/* 1931 * The zio_nop_write stage in the pipeline determines if allocating 1932 * a new bp is necessary. By leveraging a cryptographically secure checksum, 1933 * such as SHA256, we can compare the checksums of the new data and the old 1934 * to determine if allocating a new block is required. The nopwrite 1935 * feature can handle writes in either syncing or open context (i.e. zil 1936 * writes) and as a result is mutually exclusive with dedup. 1937 */ 1938static int 1939zio_nop_write(zio_t *zio) 1940{ 1941 blkptr_t *bp = zio->io_bp; 1942 blkptr_t *bp_orig = &zio->io_bp_orig; 1943 zio_prop_t *zp = &zio->io_prop; 1944 1945 ASSERT(BP_GET_LEVEL(bp) == 0); 1946 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1947 ASSERT(zp->zp_nopwrite); 1948 ASSERT(!zp->zp_dedup); 1949 ASSERT(zio->io_bp_override == NULL); 1950 ASSERT(IO_IS_ALLOCATING(zio)); 1951 1952 /* 1953 * Check to see if the original bp and the new bp have matching 1954 * characteristics (i.e. same checksum, compression algorithms, etc). 1955 * If they don't then just continue with the pipeline which will 1956 * allocate a new bp. 1957 */ 1958 if (BP_IS_HOLE(bp_orig) || 1959 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || 1960 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 1961 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 1962 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 1963 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 1964 return (ZIO_PIPELINE_CONTINUE); 1965 1966 /* 1967 * If the checksums match then reset the pipeline so that we 1968 * avoid allocating a new bp and issuing any I/O. 1969 */ 1970 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 1971 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); 1972 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 1973 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 1974 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 1975 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 1976 sizeof (uint64_t)) == 0); 1977 1978 *bp = *bp_orig; 1979 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1980 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1981 } 1982 1983 return (ZIO_PIPELINE_CONTINUE); 1984} 1985 1986/* 1987 * ========================================================================== 1988 * Dedup 1989 * ========================================================================== 1990 */ 1991static void 1992zio_ddt_child_read_done(zio_t *zio) 1993{ 1994 blkptr_t *bp = zio->io_bp; 1995 ddt_entry_t *dde = zio->io_private; 1996 ddt_phys_t *ddp; 1997 zio_t *pio = zio_unique_parent(zio); 1998 1999 mutex_enter(&pio->io_lock); 2000 ddp = ddt_phys_select(dde, bp); 2001 if (zio->io_error == 0) 2002 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 2003 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 2004 dde->dde_repair_data = zio->io_data; 2005 else 2006 zio_buf_free(zio->io_data, zio->io_size); 2007 mutex_exit(&pio->io_lock); 2008} 2009 2010static int 2011zio_ddt_read_start(zio_t *zio) 2012{ 2013 blkptr_t *bp = zio->io_bp; 2014 2015 ASSERT(BP_GET_DEDUP(bp)); 2016 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2017 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2018 2019 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2020 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2021 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 2022 ddt_phys_t *ddp = dde->dde_phys; 2023 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 2024 blkptr_t blk; 2025 2026 ASSERT(zio->io_vsd == NULL); 2027 zio->io_vsd = dde; 2028 2029 if (ddp_self == NULL) 2030 return (ZIO_PIPELINE_CONTINUE); 2031 2032 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2033 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 2034 continue; 2035 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2036 &blk); 2037 zio_nowait(zio_read(zio, zio->io_spa, &blk, 2038 zio_buf_alloc(zio->io_size), zio->io_size, 2039 zio_ddt_child_read_done, dde, zio->io_priority, 2040 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 2041 &zio->io_bookmark)); 2042 } 2043 return (ZIO_PIPELINE_CONTINUE); 2044 } 2045 2046 zio_nowait(zio_read(zio, zio->io_spa, bp, 2047 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 2048 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2049 2050 return (ZIO_PIPELINE_CONTINUE); 2051} 2052 2053static int 2054zio_ddt_read_done(zio_t *zio) 2055{ 2056 blkptr_t *bp = zio->io_bp; 2057 2058 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 2059 return (ZIO_PIPELINE_STOP); 2060 2061 ASSERT(BP_GET_DEDUP(bp)); 2062 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2063 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2064 2065 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2066 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2067 ddt_entry_t *dde = zio->io_vsd; 2068 if (ddt == NULL) { 2069 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2070 return (ZIO_PIPELINE_CONTINUE); 2071 } 2072 if (dde == NULL) { 2073 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2074 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2075 return (ZIO_PIPELINE_STOP); 2076 } 2077 if (dde->dde_repair_data != NULL) { 2078 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 2079 zio->io_child_error[ZIO_CHILD_DDT] = 0; 2080 } 2081 ddt_repair_done(ddt, dde); 2082 zio->io_vsd = NULL; 2083 } 2084 2085 ASSERT(zio->io_vsd == NULL); 2086 2087 return (ZIO_PIPELINE_CONTINUE); 2088} 2089 2090static boolean_t 2091zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2092{ 2093 spa_t *spa = zio->io_spa; 2094 2095 /* 2096 * Note: we compare the original data, not the transformed data, 2097 * because when zio->io_bp is an override bp, we will not have 2098 * pushed the I/O transforms. That's an important optimization 2099 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2100 */ 2101 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2102 zio_t *lio = dde->dde_lead_zio[p]; 2103 2104 if (lio != NULL) { 2105 return (lio->io_orig_size != zio->io_orig_size || 2106 bcmp(zio->io_orig_data, lio->io_orig_data, 2107 zio->io_orig_size) != 0); 2108 } 2109 } 2110 2111 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2112 ddt_phys_t *ddp = &dde->dde_phys[p]; 2113 2114 if (ddp->ddp_phys_birth != 0) { 2115 arc_buf_t *abuf = NULL; 2116 uint32_t aflags = ARC_WAIT; 2117 blkptr_t blk = *zio->io_bp; 2118 int error; 2119 2120 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2121 2122 ddt_exit(ddt); 2123 2124 error = arc_read(NULL, spa, &blk, 2125 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2126 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2127 &aflags, &zio->io_bookmark); 2128 2129 if (error == 0) { 2130 if (arc_buf_size(abuf) != zio->io_orig_size || 2131 bcmp(abuf->b_data, zio->io_orig_data, 2132 zio->io_orig_size) != 0) 2133 error = SET_ERROR(EEXIST); 2134 VERIFY(arc_buf_remove_ref(abuf, &abuf)); 2135 } 2136 2137 ddt_enter(ddt); 2138 return (error != 0); 2139 } 2140 } 2141 2142 return (B_FALSE); 2143} 2144 2145static void 2146zio_ddt_child_write_ready(zio_t *zio) 2147{ 2148 int p = zio->io_prop.zp_copies; 2149 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2150 ddt_entry_t *dde = zio->io_private; 2151 ddt_phys_t *ddp = &dde->dde_phys[p]; 2152 zio_t *pio; 2153 2154 if (zio->io_error) 2155 return; 2156 2157 ddt_enter(ddt); 2158 2159 ASSERT(dde->dde_lead_zio[p] == zio); 2160 2161 ddt_phys_fill(ddp, zio->io_bp); 2162 2163 while ((pio = zio_walk_parents(zio)) != NULL) 2164 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2165 2166 ddt_exit(ddt); 2167} 2168 2169static void 2170zio_ddt_child_write_done(zio_t *zio) 2171{ 2172 int p = zio->io_prop.zp_copies; 2173 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2174 ddt_entry_t *dde = zio->io_private; 2175 ddt_phys_t *ddp = &dde->dde_phys[p]; 2176 2177 ddt_enter(ddt); 2178 2179 ASSERT(ddp->ddp_refcnt == 0); 2180 ASSERT(dde->dde_lead_zio[p] == zio); 2181 dde->dde_lead_zio[p] = NULL; 2182 2183 if (zio->io_error == 0) { 2184 while (zio_walk_parents(zio) != NULL) 2185 ddt_phys_addref(ddp); 2186 } else { 2187 ddt_phys_clear(ddp); 2188 } 2189 2190 ddt_exit(ddt); 2191} 2192 2193static void 2194zio_ddt_ditto_write_done(zio_t *zio) 2195{ 2196 int p = DDT_PHYS_DITTO; 2197 zio_prop_t *zp = &zio->io_prop; 2198 blkptr_t *bp = zio->io_bp; 2199 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2200 ddt_entry_t *dde = zio->io_private; 2201 ddt_phys_t *ddp = &dde->dde_phys[p]; 2202 ddt_key_t *ddk = &dde->dde_key; 2203 2204 ddt_enter(ddt); 2205 2206 ASSERT(ddp->ddp_refcnt == 0); 2207 ASSERT(dde->dde_lead_zio[p] == zio); 2208 dde->dde_lead_zio[p] = NULL; 2209 2210 if (zio->io_error == 0) { 2211 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2212 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2213 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2214 if (ddp->ddp_phys_birth != 0) 2215 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2216 ddt_phys_fill(ddp, bp); 2217 } 2218 2219 ddt_exit(ddt); 2220} 2221 2222static int 2223zio_ddt_write(zio_t *zio) 2224{ 2225 spa_t *spa = zio->io_spa; 2226 blkptr_t *bp = zio->io_bp; 2227 uint64_t txg = zio->io_txg; 2228 zio_prop_t *zp = &zio->io_prop; 2229 int p = zp->zp_copies; 2230 int ditto_copies; 2231 zio_t *cio = NULL; 2232 zio_t *dio = NULL; 2233 ddt_t *ddt = ddt_select(spa, bp); 2234 ddt_entry_t *dde; 2235 ddt_phys_t *ddp; 2236 2237 ASSERT(BP_GET_DEDUP(bp)); 2238 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2239 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2240 2241 ddt_enter(ddt); 2242 dde = ddt_lookup(ddt, bp, B_TRUE); 2243 ddp = &dde->dde_phys[p]; 2244 2245 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2246 /* 2247 * If we're using a weak checksum, upgrade to a strong checksum 2248 * and try again. If we're already using a strong checksum, 2249 * we can't resolve it, so just convert to an ordinary write. 2250 * (And automatically e-mail a paper to Nature?) 2251 */ 2252 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 2253 zp->zp_checksum = spa_dedup_checksum(spa); 2254 zio_pop_transforms(zio); 2255 zio->io_stage = ZIO_STAGE_OPEN; 2256 BP_ZERO(bp); 2257 } else { 2258 zp->zp_dedup = B_FALSE; 2259 } 2260 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2261 ddt_exit(ddt); 2262 return (ZIO_PIPELINE_CONTINUE); 2263 } 2264 2265 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2266 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2267 2268 if (ditto_copies > ddt_ditto_copies_present(dde) && 2269 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2270 zio_prop_t czp = *zp; 2271 2272 czp.zp_copies = ditto_copies; 2273 2274 /* 2275 * If we arrived here with an override bp, we won't have run 2276 * the transform stack, so we won't have the data we need to 2277 * generate a child i/o. So, toss the override bp and restart. 2278 * This is safe, because using the override bp is just an 2279 * optimization; and it's rare, so the cost doesn't matter. 2280 */ 2281 if (zio->io_bp_override) { 2282 zio_pop_transforms(zio); 2283 zio->io_stage = ZIO_STAGE_OPEN; 2284 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2285 zio->io_bp_override = NULL; 2286 BP_ZERO(bp); 2287 ddt_exit(ddt); 2288 return (ZIO_PIPELINE_CONTINUE); 2289 } 2290 2291 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2292 zio->io_orig_size, &czp, NULL, NULL, 2293 zio_ddt_ditto_write_done, dde, zio->io_priority, 2294 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2295 2296 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2297 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2298 } 2299 2300 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2301 if (ddp->ddp_phys_birth != 0) 2302 ddt_bp_fill(ddp, bp, txg); 2303 if (dde->dde_lead_zio[p] != NULL) 2304 zio_add_child(zio, dde->dde_lead_zio[p]); 2305 else 2306 ddt_phys_addref(ddp); 2307 } else if (zio->io_bp_override) { 2308 ASSERT(bp->blk_birth == txg); 2309 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2310 ddt_phys_fill(ddp, bp); 2311 ddt_phys_addref(ddp); 2312 } else { 2313 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2314 zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, 2315 zio_ddt_child_write_done, dde, zio->io_priority, 2316 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2317 2318 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2319 dde->dde_lead_zio[p] = cio; 2320 } 2321 2322 ddt_exit(ddt); 2323 2324 if (cio) 2325 zio_nowait(cio); 2326 if (dio) 2327 zio_nowait(dio); 2328 2329 return (ZIO_PIPELINE_CONTINUE); 2330} 2331 2332ddt_entry_t *freedde; /* for debugging */ 2333 2334static int 2335zio_ddt_free(zio_t *zio) 2336{ 2337 spa_t *spa = zio->io_spa; 2338 blkptr_t *bp = zio->io_bp; 2339 ddt_t *ddt = ddt_select(spa, bp); 2340 ddt_entry_t *dde; 2341 ddt_phys_t *ddp; 2342 2343 ASSERT(BP_GET_DEDUP(bp)); 2344 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2345 2346 ddt_enter(ddt); 2347 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2348 ddp = ddt_phys_select(dde, bp); 2349 ddt_phys_decref(ddp); 2350 ddt_exit(ddt); 2351 2352 return (ZIO_PIPELINE_CONTINUE); 2353} 2354 2355/* 2356 * ========================================================================== 2357 * Allocate and free blocks 2358 * ========================================================================== 2359 */ 2360static int 2361zio_dva_allocate(zio_t *zio) 2362{ 2363 spa_t *spa = zio->io_spa; 2364 metaslab_class_t *mc = spa_normal_class(spa); 2365 blkptr_t *bp = zio->io_bp; 2366 int error; 2367 int flags = 0; 2368 2369 if (zio->io_gang_leader == NULL) { 2370 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2371 zio->io_gang_leader = zio; 2372 } 2373 2374 ASSERT(BP_IS_HOLE(bp)); 2375 ASSERT0(BP_GET_NDVAS(bp)); 2376 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2377 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2378 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2379 2380 /* 2381 * The dump device does not support gang blocks so allocation on 2382 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2383 * the "fast" gang feature. 2384 */ 2385 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2386 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 2387 METASLAB_GANG_CHILD : 0; 2388 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2389 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2390 2391 if (error) { 2392 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2393 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2394 error); 2395 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2396 return (zio_write_gang_block(zio)); 2397 zio->io_error = error; 2398 } 2399 2400 return (ZIO_PIPELINE_CONTINUE); 2401} 2402 2403static int 2404zio_dva_free(zio_t *zio) 2405{ 2406 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2407 2408 return (ZIO_PIPELINE_CONTINUE); 2409} 2410 2411static int 2412zio_dva_claim(zio_t *zio) 2413{ 2414 int error; 2415 2416 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2417 if (error) 2418 zio->io_error = error; 2419 2420 return (ZIO_PIPELINE_CONTINUE); 2421} 2422 2423/* 2424 * Undo an allocation. This is used by zio_done() when an I/O fails 2425 * and we want to give back the block we just allocated. 2426 * This handles both normal blocks and gang blocks. 2427 */ 2428static void 2429zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2430{ 2431 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2432 ASSERT(zio->io_bp_override == NULL); 2433 2434 if (!BP_IS_HOLE(bp)) 2435 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2436 2437 if (gn != NULL) { 2438 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2439 zio_dva_unallocate(zio, gn->gn_child[g], 2440 &gn->gn_gbh->zg_blkptr[g]); 2441 } 2442 } 2443} 2444 2445/* 2446 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2447 */ 2448int 2449zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2450 uint64_t size, boolean_t use_slog) 2451{ 2452 int error = 1; 2453 2454 ASSERT(txg > spa_syncing_txg(spa)); 2455 2456 /* 2457 * ZIL blocks are always contiguous (i.e. not gang blocks) so we 2458 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2459 * when allocating them. 2460 */ 2461 if (use_slog) { 2462 error = metaslab_alloc(spa, spa_log_class(spa), size, 2463 new_bp, 1, txg, old_bp, 2464 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2465 } 2466 2467 if (error) { 2468 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2469 new_bp, 1, txg, old_bp, 2470 METASLAB_HINTBP_AVOID); 2471 } 2472 2473 if (error == 0) { 2474 BP_SET_LSIZE(new_bp, size); 2475 BP_SET_PSIZE(new_bp, size); 2476 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2477 BP_SET_CHECKSUM(new_bp, 2478 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2479 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2480 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2481 BP_SET_LEVEL(new_bp, 0); 2482 BP_SET_DEDUP(new_bp, 0); 2483 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2484 } 2485 2486 return (error); 2487} 2488 2489/* 2490 * Free an intent log block. 2491 */ 2492void 2493zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2494{ 2495 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2496 ASSERT(!BP_IS_GANG(bp)); 2497 2498 zio_free(spa, txg, bp); 2499} 2500 2501/* 2502 * ========================================================================== 2503 * Read, write and delete to physical devices 2504 * ========================================================================== 2505 */ 2506static int 2507zio_vdev_io_start(zio_t *zio) 2508{ 2509 vdev_t *vd = zio->io_vd; 2510 uint64_t align; 2511 spa_t *spa = zio->io_spa; 2512 2513 ASSERT(zio->io_error == 0); 2514 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2515 2516 if (vd == NULL) { 2517 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2518 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2519 2520 /* 2521 * The mirror_ops handle multiple DVAs in a single BP. 2522 */ 2523 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2524 } 2525 2526 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) { 2527 trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); 2528 return (ZIO_PIPELINE_CONTINUE); 2529 } 2530 2531 /* 2532 * We keep track of time-sensitive I/Os so that the scan thread 2533 * can quickly react to certain workloads. In particular, we care 2534 * about non-scrubbing, top-level reads and writes with the following 2535 * characteristics: 2536 * - synchronous writes of user data to non-slog devices 2537 * - any reads of user data 2538 * When these conditions are met, adjust the timestamp of spa_last_io 2539 * which allows the scan thread to adjust its workload accordingly. 2540 */ 2541 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2542 vd == vd->vdev_top && !vd->vdev_islog && 2543 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2544 zio->io_txg != spa_syncing_txg(spa)) { 2545 uint64_t old = spa->spa_last_io; 2546 uint64_t new = ddi_get_lbolt64(); 2547 if (old != new) 2548 (void) atomic_cas_64(&spa->spa_last_io, old, new); 2549 } 2550 2551 align = 1ULL << vd->vdev_top->vdev_ashift; 2552 2553 if (P2PHASE(zio->io_size, align) != 0) { 2554 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2555 char *abuf = NULL; 2556 if (zio->io_type == ZIO_TYPE_READ || 2557 zio->io_type == ZIO_TYPE_WRITE) 2558 abuf = zio_buf_alloc(asize); 2559 ASSERT(vd == vd->vdev_top); 2560 if (zio->io_type == ZIO_TYPE_WRITE) { 2561 bcopy(zio->io_data, abuf, zio->io_size); 2562 bzero(abuf + zio->io_size, asize - zio->io_size); 2563 } 2564 zio_push_transform(zio, abuf, asize, abuf ? asize : 0, 2565 zio_subblock); 2566 } 2567 2568 ASSERT(P2PHASE(zio->io_offset, align) == 0); 2569 ASSERT(P2PHASE(zio->io_size, align) == 0); 2570 VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); 2571 2572 /* 2573 * If this is a repair I/O, and there's no self-healing involved -- 2574 * that is, we're just resilvering what we expect to resilver -- 2575 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2576 * This prevents spurious resilvering with nested replication. 2577 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2578 * A is out of date, we'll read from C+D, then use the data to 2579 * resilver A+B -- but we don't actually want to resilver B, just A. 2580 * The top-level mirror has no way to know this, so instead we just 2581 * discard unnecessary repairs as we work our way down the vdev tree. 2582 * The same logic applies to any form of nested replication: 2583 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 2584 */ 2585 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2586 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2587 zio->io_txg != 0 && /* not a delegated i/o */ 2588 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2589 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2590 zio_vdev_io_bypass(zio); 2591 return (ZIO_PIPELINE_CONTINUE); 2592 } 2593 2594 if (vd->vdev_ops->vdev_op_leaf && 2595 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2596 2597 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) 2598 return (ZIO_PIPELINE_CONTINUE); 2599 2600 if ((zio = vdev_queue_io(zio)) == NULL) 2601 return (ZIO_PIPELINE_STOP); 2602 2603 if (!vdev_accessible(vd, zio)) { 2604 zio->io_error = SET_ERROR(ENXIO); 2605 zio_interrupt(zio); 2606 return (ZIO_PIPELINE_STOP); 2607 } 2608 } 2609 2610 /* 2611 * Note that we ignore repair writes for TRIM because they can conflict 2612 * with normal writes. This isn't an issue because, by definition, we 2613 * only repair blocks that aren't freed. 2614 */ 2615 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE && 2616 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2617 if (!trim_map_write_start(zio)) 2618 return (ZIO_PIPELINE_STOP); 2619 } 2620 2621 return (vd->vdev_ops->vdev_op_io_start(zio)); 2622} 2623 2624static int 2625zio_vdev_io_done(zio_t *zio) 2626{ 2627 vdev_t *vd = zio->io_vd; 2628 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 2629 boolean_t unexpected_error = B_FALSE; 2630 2631 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2632 return (ZIO_PIPELINE_STOP); 2633 2634 ASSERT(zio->io_type == ZIO_TYPE_READ || 2635 zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); 2636 2637 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2638 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2639 2640 if (zio->io_type == ZIO_TYPE_WRITE && 2641 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 2642 trim_map_write_done(zio); 2643 2644 vdev_queue_io_done(zio); 2645 2646 if (zio->io_type == ZIO_TYPE_WRITE) 2647 vdev_cache_write(zio); 2648 2649 if (zio_injection_enabled && zio->io_error == 0) 2650 zio->io_error = zio_handle_device_injection(vd, 2651 zio, EIO); 2652 2653 if (zio_injection_enabled && zio->io_error == 0) 2654 zio->io_error = zio_handle_label_injection(zio, EIO); 2655 2656 if (zio->io_error) { 2657 if (!vdev_accessible(vd, zio)) { 2658 zio->io_error = SET_ERROR(ENXIO); 2659 } else { 2660 unexpected_error = B_TRUE; 2661 } 2662 } 2663 } 2664 2665 ops->vdev_op_io_done(zio); 2666 2667 if (unexpected_error) 2668 VERIFY(vdev_probe(vd, zio) == NULL); 2669 2670 return (ZIO_PIPELINE_CONTINUE); 2671} 2672 2673/* 2674 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2675 * disk, and use that to finish the checksum ereport later. 2676 */ 2677static void 2678zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2679 const void *good_buf) 2680{ 2681 /* no processing needed */ 2682 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2683} 2684 2685/*ARGSUSED*/ 2686void 2687zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2688{ 2689 void *buf = zio_buf_alloc(zio->io_size); 2690 2691 bcopy(zio->io_data, buf, zio->io_size); 2692 2693 zcr->zcr_cbinfo = zio->io_size; 2694 zcr->zcr_cbdata = buf; 2695 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2696 zcr->zcr_free = zio_buf_free; 2697} 2698 2699static int 2700zio_vdev_io_assess(zio_t *zio) 2701{ 2702 vdev_t *vd = zio->io_vd; 2703 2704 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2705 return (ZIO_PIPELINE_STOP); 2706 2707 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2708 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2709 2710 if (zio->io_vsd != NULL) { 2711 zio->io_vsd_ops->vsd_free(zio); 2712 zio->io_vsd = NULL; 2713 } 2714 2715 if (zio_injection_enabled && zio->io_error == 0) 2716 zio->io_error = zio_handle_fault_injection(zio, EIO); 2717 2718 if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM) 2719 switch (zio->io_error) { 2720 case 0: 2721 ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 2722 ZIO_TRIM_STAT_BUMP(success); 2723 break; 2724 case EOPNOTSUPP: 2725 ZIO_TRIM_STAT_BUMP(unsupported); 2726 break; 2727 default: 2728 ZIO_TRIM_STAT_BUMP(failed); 2729 break; 2730 } 2731 2732 /* 2733 * If the I/O failed, determine whether we should attempt to retry it. 2734 * 2735 * On retry, we cut in line in the issue queue, since we don't want 2736 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 2737 */ 2738 if (zio->io_error && vd == NULL && 2739 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2740 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2741 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2742 zio->io_error = 0; 2743 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2744 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2745 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2746 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2747 zio_requeue_io_start_cut_in_line); 2748 return (ZIO_PIPELINE_STOP); 2749 } 2750 2751 /* 2752 * If we got an error on a leaf device, convert it to ENXIO 2753 * if the device is not accessible at all. 2754 */ 2755 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2756 !vdev_accessible(vd, zio)) 2757 zio->io_error = SET_ERROR(ENXIO); 2758 2759 /* 2760 * If we can't write to an interior vdev (mirror or RAID-Z), 2761 * set vdev_cant_write so that we stop trying to allocate from it. 2762 */ 2763 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2764 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 2765 vd->vdev_cant_write = B_TRUE; 2766 } 2767 2768 if (zio->io_error) 2769 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2770 2771 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2772 zio->io_physdone != NULL) { 2773 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 2774 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 2775 zio->io_physdone(zio->io_logical); 2776 } 2777 2778 return (ZIO_PIPELINE_CONTINUE); 2779} 2780 2781void 2782zio_vdev_io_reissue(zio_t *zio) 2783{ 2784 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2785 ASSERT(zio->io_error == 0); 2786 2787 zio->io_stage >>= 1; 2788} 2789 2790void 2791zio_vdev_io_redone(zio_t *zio) 2792{ 2793 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2794 2795 zio->io_stage >>= 1; 2796} 2797 2798void 2799zio_vdev_io_bypass(zio_t *zio) 2800{ 2801 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2802 ASSERT(zio->io_error == 0); 2803 2804 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2805 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2806} 2807 2808/* 2809 * ========================================================================== 2810 * Generate and verify checksums 2811 * ========================================================================== 2812 */ 2813static int 2814zio_checksum_generate(zio_t *zio) 2815{ 2816 blkptr_t *bp = zio->io_bp; 2817 enum zio_checksum checksum; 2818 2819 if (bp == NULL) { 2820 /* 2821 * This is zio_write_phys(). 2822 * We're either generating a label checksum, or none at all. 2823 */ 2824 checksum = zio->io_prop.zp_checksum; 2825 2826 if (checksum == ZIO_CHECKSUM_OFF) 2827 return (ZIO_PIPELINE_CONTINUE); 2828 2829 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 2830 } else { 2831 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 2832 ASSERT(!IO_IS_ALLOCATING(zio)); 2833 checksum = ZIO_CHECKSUM_GANG_HEADER; 2834 } else { 2835 checksum = BP_GET_CHECKSUM(bp); 2836 } 2837 } 2838 2839 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2840 2841 return (ZIO_PIPELINE_CONTINUE); 2842} 2843 2844static int 2845zio_checksum_verify(zio_t *zio) 2846{ 2847 zio_bad_cksum_t info; 2848 blkptr_t *bp = zio->io_bp; 2849 int error; 2850 2851 ASSERT(zio->io_vd != NULL); 2852 2853 if (bp == NULL) { 2854 /* 2855 * This is zio_read_phys(). 2856 * We're either verifying a label checksum, or nothing at all. 2857 */ 2858 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 2859 return (ZIO_PIPELINE_CONTINUE); 2860 2861 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 2862 } 2863 2864 if ((error = zio_checksum_error(zio, &info)) != 0) { 2865 zio->io_error = error; 2866 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2867 zfs_ereport_start_checksum(zio->io_spa, 2868 zio->io_vd, zio, zio->io_offset, 2869 zio->io_size, NULL, &info); 2870 } 2871 } 2872 2873 return (ZIO_PIPELINE_CONTINUE); 2874} 2875 2876/* 2877 * Called by RAID-Z to ensure we don't compute the checksum twice. 2878 */ 2879void 2880zio_checksum_verified(zio_t *zio) 2881{ 2882 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2883} 2884 2885/* 2886 * ========================================================================== 2887 * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 2888 * An error of 0 indictes success. ENXIO indicates whole-device failure, 2889 * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO 2890 * indicate errors that are specific to one I/O, and most likely permanent. 2891 * Any other error is presumed to be worse because we weren't expecting it. 2892 * ========================================================================== 2893 */ 2894int 2895zio_worst_error(int e1, int e2) 2896{ 2897 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 2898 int r1, r2; 2899 2900 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 2901 if (e1 == zio_error_rank[r1]) 2902 break; 2903 2904 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 2905 if (e2 == zio_error_rank[r2]) 2906 break; 2907 2908 return (r1 > r2 ? e1 : e2); 2909} 2910 2911/* 2912 * ========================================================================== 2913 * I/O completion 2914 * ========================================================================== 2915 */ 2916static int 2917zio_ready(zio_t *zio) 2918{ 2919 blkptr_t *bp = zio->io_bp; 2920 zio_t *pio, *pio_next; 2921 2922 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 2923 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 2924 return (ZIO_PIPELINE_STOP); 2925 2926 if (zio->io_ready) { 2927 ASSERT(IO_IS_ALLOCATING(zio)); 2928 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 2929 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 2930 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 2931 2932 zio->io_ready(zio); 2933 } 2934 2935 if (bp != NULL && bp != &zio->io_bp_copy) 2936 zio->io_bp_copy = *bp; 2937 2938 if (zio->io_error) 2939 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2940 2941 mutex_enter(&zio->io_lock); 2942 zio->io_state[ZIO_WAIT_READY] = 1; 2943 pio = zio_walk_parents(zio); 2944 mutex_exit(&zio->io_lock); 2945 2946 /* 2947 * As we notify zio's parents, new parents could be added. 2948 * New parents go to the head of zio's io_parent_list, however, 2949 * so we will (correctly) not notify them. The remainder of zio's 2950 * io_parent_list, from 'pio_next' onward, cannot change because 2951 * all parents must wait for us to be done before they can be done. 2952 */ 2953 for (; pio != NULL; pio = pio_next) { 2954 pio_next = zio_walk_parents(zio); 2955 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 2956 } 2957 2958 if (zio->io_flags & ZIO_FLAG_NODATA) { 2959 if (BP_IS_GANG(bp)) { 2960 zio->io_flags &= ~ZIO_FLAG_NODATA; 2961 } else { 2962 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 2963 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2964 } 2965 } 2966 2967 if (zio_injection_enabled && 2968 zio->io_spa->spa_syncing_txg == zio->io_txg) 2969 zio_handle_ignored_writes(zio); 2970 2971 return (ZIO_PIPELINE_CONTINUE); 2972} 2973 2974static int 2975zio_done(zio_t *zio) 2976{ 2977 spa_t *spa = zio->io_spa; 2978 zio_t *lio = zio->io_logical; 2979 blkptr_t *bp = zio->io_bp; 2980 vdev_t *vd = zio->io_vd; 2981 uint64_t psize = zio->io_size; 2982 zio_t *pio, *pio_next; 2983 2984 /* 2985 * If our children haven't all completed, 2986 * wait for them and then repeat this pipeline stage. 2987 */ 2988 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 2989 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 2990 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 2991 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 2992 return (ZIO_PIPELINE_STOP); 2993 2994 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 2995 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 2996 ASSERT(zio->io_children[c][w] == 0); 2997 2998 if (bp != NULL) { 2999 ASSERT(bp->blk_pad[0] == 0); 3000 ASSERT(bp->blk_pad[1] == 0); 3001 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 3002 (bp == zio_unique_parent(zio)->io_bp)); 3003 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 3004 zio->io_bp_override == NULL && 3005 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 3006 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 3007 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 3008 ASSERT(BP_COUNT_GANG(bp) == 0 || 3009 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 3010 } 3011 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 3012 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 3013 } 3014 3015 /* 3016 * If there were child vdev/gang/ddt errors, they apply to us now. 3017 */ 3018 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 3019 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 3020 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 3021 3022 /* 3023 * If the I/O on the transformed data was successful, generate any 3024 * checksum reports now while we still have the transformed data. 3025 */ 3026 if (zio->io_error == 0) { 3027 while (zio->io_cksum_report != NULL) { 3028 zio_cksum_report_t *zcr = zio->io_cksum_report; 3029 uint64_t align = zcr->zcr_align; 3030 uint64_t asize = P2ROUNDUP(psize, align); 3031 char *abuf = zio->io_data; 3032 3033 if (asize != psize) { 3034 abuf = zio_buf_alloc(asize); 3035 bcopy(zio->io_data, abuf, psize); 3036 bzero(abuf + psize, asize - psize); 3037 } 3038 3039 zio->io_cksum_report = zcr->zcr_next; 3040 zcr->zcr_next = NULL; 3041 zcr->zcr_finish(zcr, abuf); 3042 zfs_ereport_free_checksum(zcr); 3043 3044 if (asize != psize) 3045 zio_buf_free(abuf, asize); 3046 } 3047 } 3048 3049 zio_pop_transforms(zio); /* note: may set zio->io_error */ 3050 3051 vdev_stat_update(zio, psize); 3052 3053 if (zio->io_error) { 3054 /* 3055 * If this I/O is attached to a particular vdev, 3056 * generate an error message describing the I/O failure 3057 * at the block level. We ignore these errors if the 3058 * device is currently unavailable. 3059 */ 3060 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3061 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3062 3063 if ((zio->io_error == EIO || !(zio->io_flags & 3064 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3065 zio == lio) { 3066 /* 3067 * For logical I/O requests, tell the SPA to log the 3068 * error and generate a logical data ereport. 3069 */ 3070 spa_log_error(spa, zio); 3071 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3072 0, 0); 3073 } 3074 } 3075 3076 if (zio->io_error && zio == lio) { 3077 /* 3078 * Determine whether zio should be reexecuted. This will 3079 * propagate all the way to the root via zio_notify_parent(). 3080 */ 3081 ASSERT(vd == NULL && bp != NULL); 3082 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3083 3084 if (IO_IS_ALLOCATING(zio) && 3085 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3086 if (zio->io_error != ENOSPC) 3087 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3088 else 3089 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3090 } 3091 3092 if ((zio->io_type == ZIO_TYPE_READ || 3093 zio->io_type == ZIO_TYPE_FREE) && 3094 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3095 zio->io_error == ENXIO && 3096 spa_load_state(spa) == SPA_LOAD_NONE && 3097 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3098 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3099 3100 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3101 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3102 3103 /* 3104 * Here is a possibly good place to attempt to do 3105 * either combinatorial reconstruction or error correction 3106 * based on checksums. It also might be a good place 3107 * to send out preliminary ereports before we suspend 3108 * processing. 3109 */ 3110 } 3111 3112 /* 3113 * If there were logical child errors, they apply to us now. 3114 * We defer this until now to avoid conflating logical child 3115 * errors with errors that happened to the zio itself when 3116 * updating vdev stats and reporting FMA events above. 3117 */ 3118 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3119 3120 if ((zio->io_error || zio->io_reexecute) && 3121 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3122 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3123 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3124 3125 zio_gang_tree_free(&zio->io_gang_tree); 3126 3127 /* 3128 * Godfather I/Os should never suspend. 3129 */ 3130 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3131 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3132 zio->io_reexecute = 0; 3133 3134 if (zio->io_reexecute) { 3135 /* 3136 * This is a logical I/O that wants to reexecute. 3137 * 3138 * Reexecute is top-down. When an i/o fails, if it's not 3139 * the root, it simply notifies its parent and sticks around. 3140 * The parent, seeing that it still has children in zio_done(), 3141 * does the same. This percolates all the way up to the root. 3142 * The root i/o will reexecute or suspend the entire tree. 3143 * 3144 * This approach ensures that zio_reexecute() honors 3145 * all the original i/o dependency relationships, e.g. 3146 * parents not executing until children are ready. 3147 */ 3148 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3149 3150 zio->io_gang_leader = NULL; 3151 3152 mutex_enter(&zio->io_lock); 3153 zio->io_state[ZIO_WAIT_DONE] = 1; 3154 mutex_exit(&zio->io_lock); 3155 3156 /* 3157 * "The Godfather" I/O monitors its children but is 3158 * not a true parent to them. It will track them through 3159 * the pipeline but severs its ties whenever they get into 3160 * trouble (e.g. suspended). This allows "The Godfather" 3161 * I/O to return status without blocking. 3162 */ 3163 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3164 zio_link_t *zl = zio->io_walk_link; 3165 pio_next = zio_walk_parents(zio); 3166 3167 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3168 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3169 zio_remove_child(pio, zio, zl); 3170 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3171 } 3172 } 3173 3174 if ((pio = zio_unique_parent(zio)) != NULL) { 3175 /* 3176 * We're not a root i/o, so there's nothing to do 3177 * but notify our parent. Don't propagate errors 3178 * upward since we haven't permanently failed yet. 3179 */ 3180 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3181 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3182 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3183 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3184 /* 3185 * We'd fail again if we reexecuted now, so suspend 3186 * until conditions improve (e.g. device comes online). 3187 */ 3188 zio_suspend(spa, zio); 3189 } else { 3190 /* 3191 * Reexecution is potentially a huge amount of work. 3192 * Hand it off to the otherwise-unused claim taskq. 3193 */ 3194#if defined(illumos) || !defined(_KERNEL) 3195 ASSERT(zio->io_tqent.tqent_next == NULL); 3196#else 3197 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 3198#endif 3199 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3200 ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3201 0, &zio->io_tqent); 3202 } 3203 return (ZIO_PIPELINE_STOP); 3204 } 3205 3206 ASSERT(zio->io_child_count == 0); 3207 ASSERT(zio->io_reexecute == 0); 3208 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3209 3210 /* 3211 * Report any checksum errors, since the I/O is complete. 3212 */ 3213 while (zio->io_cksum_report != NULL) { 3214 zio_cksum_report_t *zcr = zio->io_cksum_report; 3215 zio->io_cksum_report = zcr->zcr_next; 3216 zcr->zcr_next = NULL; 3217 zcr->zcr_finish(zcr, NULL); 3218 zfs_ereport_free_checksum(zcr); 3219 } 3220 3221 /* 3222 * It is the responsibility of the done callback to ensure that this 3223 * particular zio is no longer discoverable for adoption, and as 3224 * such, cannot acquire any new parents. 3225 */ 3226 if (zio->io_done) 3227 zio->io_done(zio); 3228 3229 mutex_enter(&zio->io_lock); 3230 zio->io_state[ZIO_WAIT_DONE] = 1; 3231 mutex_exit(&zio->io_lock); 3232 3233 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3234 zio_link_t *zl = zio->io_walk_link; 3235 pio_next = zio_walk_parents(zio); 3236 zio_remove_child(pio, zio, zl); 3237 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3238 } 3239 3240 if (zio->io_waiter != NULL) { 3241 mutex_enter(&zio->io_lock); 3242 zio->io_executor = NULL; 3243 cv_broadcast(&zio->io_cv); 3244 mutex_exit(&zio->io_lock); 3245 } else { 3246 zio_destroy(zio); 3247 } 3248 3249 return (ZIO_PIPELINE_STOP); 3250} 3251 3252/* 3253 * ========================================================================== 3254 * I/O pipeline definition 3255 * ========================================================================== 3256 */ 3257static zio_pipe_stage_t *zio_pipeline[] = { 3258 NULL, 3259 zio_read_bp_init, 3260 zio_free_bp_init, 3261 zio_issue_async, 3262 zio_write_bp_init, 3263 zio_checksum_generate, 3264 zio_nop_write, 3265 zio_ddt_read_start, 3266 zio_ddt_read_done, 3267 zio_ddt_write, 3268 zio_ddt_free, 3269 zio_gang_assemble, 3270 zio_gang_issue, 3271 zio_dva_allocate, 3272 zio_dva_free, 3273 zio_dva_claim, 3274 zio_ready, 3275 zio_vdev_io_start, 3276 zio_vdev_io_done, 3277 zio_vdev_io_assess, 3278 zio_checksum_verify, 3279 zio_done 3280}; 3281 3282/* dnp is the dnode for zb1->zb_object */ 3283boolean_t 3284zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, 3285 const zbookmark_t *zb2) 3286{ 3287 uint64_t zb1nextL0, zb2thisobj; 3288 3289 ASSERT(zb1->zb_objset == zb2->zb_objset); 3290 ASSERT(zb2->zb_level == 0); 3291 3292 /* 3293 * A bookmark in the deadlist is considered to be after 3294 * everything else. 3295 */ 3296 if (zb2->zb_object == DMU_DEADLIST_OBJECT) 3297 return (B_TRUE); 3298 3299 /* The objset_phys_t isn't before anything. */ 3300 if (dnp == NULL) 3301 return (B_FALSE); 3302 3303 zb1nextL0 = (zb1->zb_blkid + 1) << 3304 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 3305 3306 zb2thisobj = zb2->zb_object ? zb2->zb_object : 3307 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 3308 3309 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3310 uint64_t nextobj = zb1nextL0 * 3311 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 3312 return (nextobj <= zb2thisobj); 3313 } 3314 3315 if (zb1->zb_object < zb2thisobj) 3316 return (B_TRUE); 3317 if (zb1->zb_object > zb2thisobj) 3318 return (B_FALSE); 3319 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 3320 return (B_FALSE); 3321 return (zb1nextL0 <= zb2->zb_blkid); 3322} 3323