zio.c revision 265740
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>
#include <sys/zfeature.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
#if defined(__amd64__)
static int zio_use_uma = 1;
#else
static int zio_use_uma = 0;
#endif
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
    "Use uma(9) for ZIO allocations");
static int zio_exclude_metadata = 0;
TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN,
    &zio_exclude_metadata, 0,
    "Exclude metadata buffers from dumps as well");

zio_trim_stats_t zio_trim_stats = {
	{ "bytes",		KSTAT_DATA_UINT64,
	    "Number of bytes successfully TRIMmed" },
	{ "success",		KSTAT_DATA_UINT64,
	    "Number of successful TRIM requests" },
	{ "unsupported",	KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed because TRIM is not supported" },
	{ "failed",		KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed for reasons other than not supported" },
};

static kstat_t *zio_trim_ksp;

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
extern int zfs_mg_alloc_failures;
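/*
 * Illustrative note: both cache arrays are indexed by the buffer size in
 * SPA_MINBLOCKSIZE units, minus one, mirroring the
 * (size - 1) >> SPA_MINBLOCKSHIFT computation in zio_buf_alloc() below;
 * e.g. with the usual 512-byte SPA_MINBLOCKSIZE, a 4K buffer maps to
 * slot 7.
 */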
/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

void
zio_init(void)
{
	size_t c;
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (!zio_use_uma)
		goto out;

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;
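		/*
		 * Illustrative: clearing the lowest set bit until a single
		 * bit remains rounds p2 down to a power of two; e.g. a 10K
		 * size yields p2 == 8K, which selects the quarter-power
		 * alignment of 2K in the chain below.
		 */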
#ifdef illumos
#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
#endif /* illumos */
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
			align = PAGESIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL,
			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}
out:

	/*
	 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
	 * to fail 3 times per txg or 8 failures, whichever is greater.
	 */
	if (zfs_mg_alloc_failures == 0)
		zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
	else if (zfs_mg_alloc_failures < 8)
		zfs_mg_alloc_failures = 8;

	zio_inject_init();

	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof(zio_trim_stats) / sizeof(kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zio_trim_ksp != NULL) {
		zio_trim_ksp->ks_data = &zio_trim_stats;
		kstat_install(zio_trim_ksp);
	}
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();

	if (zio_trim_ksp != NULL) {
		kstat_delete(zio_trim_ksp);
		zio_trim_ksp = NULL;
	}
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP|flags));
}
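/*
 * Illustrative usage: the size passed to the free routine must match the
 * size passed at allocation, since it selects the backing cache:
 *
 *	void *buf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
 *	...
 *	zio_buf_free(buf, SPA_MAXBLOCKSIZE);
 */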
/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump.  (Thus reducing
 * the amount of kernel heap dumped to disk when the kernel panics.)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_data_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
	zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}
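/*
 * Example of the transform stack at work: zio_read_bp_init() below pushes
 * a psize-sized buffer with zio_decompress as the callback for compressed
 * bps, so zio_pop_transforms() decompresses into the caller's original
 * buffer on the way out of the pipeline.
 */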
/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 *        continue calling these functions until they return NULL.
 *        Otherwise, the next caller will pick up the list walk in
 *        some indeterminate state.  (Otherwise every caller would
 *        have to pass in a cookie to keep the state represented by
 *        io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);

	(*countp)--;

	if (*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}
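/*
 * Taken together, zio_wait_for_children() and zio_notify_parent() form
 * the pipeline interlock: a parent parks itself (io_stall, with io_stage
 * backed up one stage) on a nonzero child count, and the last child to
 * decrement that count re-dispatches the parent via zio_execute().
 */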
static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}
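/*
 * zio_null() is the building block for zio_root(): an op-less zio used
 * purely as an interlock for its children.  A typical (illustrative)
 * pattern:
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	zio_nowait(zio_read(rio, spa, bp, ...));	(for each bp)
 *	error = zio_wait(rio);
 */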
zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
    void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_physdone = physdone;
	zio->io_prop = *zp;

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync() keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}
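/*
 * Illustrative caller: when dmu_sync() has already written a block in
 * open context on behalf of the ZIL, the syncing write is pointed at
 * that bp via zio_write_override(), and zio_write_bp_init() then treats
 * it as a rewrite, a dedup hit, or a nopwrite as appropriate.
 */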
void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
		    BP_GET_PSIZE(bp), 0)));
	}
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    uint64_t size, enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	dprintf_bp(bp, "freeing in txg %llu, pass %u",
	    (longlong_t)txg, spa->spa_sync_pass);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	if (zfs_trim_enabled)
		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
		    ZIO_STAGE_VDEV_IO_ASSESS;
	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	zio = zio_create(pio, spa, txg, bp, NULL, size,
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}
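/*
 * Note that when zfs_trim_enabled is set, zio_free() above defers every
 * free to the bplist, and the extra stages added here route the free
 * through the vdev I/O path so that the trim map can observe it.
 */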
zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
    uint64_t size, zio_done_func_t *done, void *private,
    enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
		    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, offset, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    offset, size, done, private, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_eck) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}
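/*
 * The labels argument to the two routines above confines the I/O, when
 * set, to the reserved label regions at either end of the device; the
 * ASSERTs enforce that boundary.
 */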
/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, zio_priority_t priority,
    enum zio_flag flags, zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	zio->io_physdone = pio->io_physdone;
	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
		zio->io_logical->io_phys_children++;

	return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, zio_priority_t priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
	    NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

zio_t *
zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
{

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	return (zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size,
	    NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}
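/*
 * Illustrative usage: the FreeBSD trim map commits accumulated ranges
 * against leaf vdevs with
 *
 *	zio_nowait(zio_trim(zio, spa, vd, offset, size));
 *
 * and the results are accounted in zio_trim_stats above.
 */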
void
zio_shrink(zio_t *zio, uint64_t size)
{
	ASSERT(zio->io_executor == NULL);
	ASSERT(zio->io_orig_size == zio->io_size);
	ASSERT(size <= zio->io_size);

	/*
	 * We don't shrink for raidz because of problems with the
	 * reconstruction when reading back less than the block size.
	 * Note, BP_IS_RAIDZ() assumes no compression.
	 */
	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	if (!BP_IS_RAIDZ(zio->io_bp))
		zio->io_orig_size = zio->io_size = size;
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t **ziop)
{
	zio_t *zio = *ziop;
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_write_bp_init(zio_t **ziop)
{
	zio_t *zio = *ziop;
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}
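	/*
	 * Illustrative summary: an override bp either satisfied a nopwrite,
	 * upgraded into a dedup write, or was discarded above; in the last
	 * case we fall through and allocate as usual.
	 */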
	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		metaslab_class_t *mc = spa_normal_class(spa);
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize,
		    (size_t)metaslab_class_get_minblocksize(mc));
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else {
			ASSERT(psize < lsize);
			zio_push_transform(zio, cbuf, psize, lsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
	    BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		if (zio->io_bp_orig.blk_birth != 0 &&
		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, zp->zp_type);
			BP_SET_LEVEL(bp, zp->zp_level);
			BP_SET_BIRTH(bp, zio->io_txg, 0);
		}
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_free_bp_init(zio_t **ziop)
{
	zio_t *zio = *ziop;
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}
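/*
 * Each stage below returns ZIO_PIPELINE_CONTINUE to advance to the next
 * stage set in io_pipeline, or ZIO_PIPELINE_STOP once the zio has been
 * handed off (taskq dispatch, a child wait, or vdev-level queueing).
 */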
/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = (cutinline ? TQ_FRONT : 0);

	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq if
	 * available.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);

	/*
	 * NB: We are assuming that the zio can only be dispatched
	 * to a single taskq at a time.  It would be a grievous error
	 * to dispatch the zio to another taskq at the same time.
	 */
#if defined(illumos) || !defined(_KERNEL)
	ASSERT(zio->io_tqent.tqent_next == NULL);
#else
	ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
#endif
	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
	    flags, &zio->io_tqent);
}

static boolean_t
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
		uint_t i;
		for (i = 0; i < tqs->stqs_count; i++) {
			if (taskq_member(tqs->stqs_taskq[i], executor))
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}

static int
zio_issue_async(zio_t **ziop)
{
	zio_t *zio = *ziop;

	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}
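/*
 * Dispatch example: a ZIO_PRIORITY_NOW write lands on the high-priority
 * variant of the issue taskq when one is configured (the q++ above), so
 * urgent I/O is not queued behind bulk writes.
 */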
/*
 * Execute the I/O pipeline until one of the following occurs:
 *
 *	(1) the I/O completes
 *	(2) the pipeline stalls waiting for dependent child I/Os
 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
 *	(4) the I/O is delegated by vdev-level caching or aggregation
 *	(5) the I/O is deferred due to vdev-level queueing
 *	(6) the I/O is handed off to another thread.
 *
 * In all cases, the pipeline stops whenever there's no CPU work; it never
 * burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[highbit64(stage) - 1](&zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}
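/*
 * Note that zio_wait() consumes the zio, e.g. (illustrative):
 *
 *	error = zio_wait(zio_read(NULL, spa, bp, buf, size, NULL, NULL,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
 *
 * Once it returns, the zio has been destroyed and must not be touched.
 */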
void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_zio_root "Godfather" I/O which
		 * will ensure it completes prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root, zio);
	}

	zio_execute(zio);
}

/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_flags |= ZIO_FLAG_REEXECUTED;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}
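/*
 * Illustrative: with failmode=wait, a pool remains suspended here until
 * an administrator runs "zpool clear", which reaches zio_resume() below.
 */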
int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}
/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */

static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	if (gn != NULL)
		return (pio);

	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
	    &pio->io_bookmark));
}

zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here.  The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
		 * (Presently, nothing actually uses interior data checksums;
		 * this is just good hygiene.)
		 */
		if (gn != pio->io_gang_leader->io_gang_tree) {
			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
			    data, BP_GET_PSIZE(bp));
		}
		/*
		 * If we are here to damage data for testing purposes,
		 * leave the GBH alone so that we can detect the damage.
		 */
		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
	} else {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
	}

	return (zio);
}
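/*
 * A sketch of the dispatch that follows: zio_gang_tree_issue() selects
 * one of these callbacks by io_type, so e.g. a ZIO_TYPE_FREE gang leader
 * walks its tree calling zio_free_gang() on every bp, header and leaf
 * alike.
 */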
/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
	    ZIO_GANG_CHILD_FLAGS(pio)));
}

/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}

static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}
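/*
 * Example walk: for a gang bp with one nested gang member, the assemble
 * pass above reads two headers; the issue pass below then visits the
 * root header first and each of its bps in order, recursing into the
 * nested header as it reaches it.
 */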
1771 */ 1772 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1773 1774 if (gn != NULL) { 1775 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1776 1777 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1778 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1779 if (BP_IS_HOLE(gbp)) 1780 continue; 1781 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 1782 data = (char *)data + BP_GET_PSIZE(gbp); 1783 } 1784 } 1785 1786 if (gn == gio->io_gang_tree && gio->io_data != NULL) 1787 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 1788 1789 if (zio != pio) 1790 zio_nowait(zio); 1791} 1792 1793static int 1794zio_gang_assemble(zio_t **ziop) 1795{ 1796 zio_t *zio = *ziop; 1797 blkptr_t *bp = zio->io_bp; 1798 1799 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 1800 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1801 1802 zio->io_gang_leader = zio; 1803 1804 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1805 1806 return (ZIO_PIPELINE_CONTINUE); 1807} 1808 1809static int 1810zio_gang_issue(zio_t **ziop) 1811{ 1812 zio_t *zio = *ziop; 1813 blkptr_t *bp = zio->io_bp; 1814 1815 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 1816 return (ZIO_PIPELINE_STOP); 1817 1818 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 1819 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1820 1821 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 1822 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 1823 else 1824 zio_gang_tree_free(&zio->io_gang_tree); 1825 1826 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1827 1828 return (ZIO_PIPELINE_CONTINUE); 1829} 1830 1831static void 1832zio_write_gang_member_ready(zio_t *zio) 1833{ 1834 zio_t *pio = zio_unique_parent(zio); 1835 zio_t *gio = zio->io_gang_leader; 1836 dva_t *cdva = zio->io_bp->blk_dva; 1837 dva_t *pdva = pio->io_bp->blk_dva; 1838 uint64_t asize; 1839 1840 if (BP_IS_HOLE(zio->io_bp)) 1841 return; 1842 1843 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1844 1845 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 1846 ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 1847 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 1848 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 1849 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 1850 1851 mutex_enter(&pio->io_lock); 1852 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 1853 ASSERT(DVA_GET_GANG(&pdva[d])); 1854 asize = DVA_GET_ASIZE(&pdva[d]); 1855 asize += DVA_GET_ASIZE(&cdva[d]); 1856 DVA_SET_ASIZE(&pdva[d], asize); 1857 } 1858 mutex_exit(&pio->io_lock); 1859} 1860 1861static int 1862zio_write_gang_block(zio_t *pio) 1863{ 1864 spa_t *spa = pio->io_spa; 1865 blkptr_t *bp = pio->io_bp; 1866 zio_t *gio = pio->io_gang_leader; 1867 zio_t *zio; 1868 zio_gang_node_t *gn, **gnpp; 1869 zio_gbh_phys_t *gbh; 1870 uint64_t txg = pio->io_txg; 1871 uint64_t resid = pio->io_size; 1872 uint64_t lsize; 1873 int copies = gio->io_prop.zp_copies; 1874 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 1875 zio_prop_t zp; 1876 int error; 1877 1878 error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 1879 bp, gbh_copies, txg, pio == gio ? 
NULL : gio->io_bp, 1880 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 1881 if (error) { 1882 pio->io_error = error; 1883 return (ZIO_PIPELINE_CONTINUE); 1884 } 1885 1886 if (pio == gio) { 1887 gnpp = &gio->io_gang_tree; 1888 } else { 1889 gnpp = pio->io_private; 1890 ASSERT(pio->io_ready == zio_write_gang_member_ready); 1891 } 1892 1893 gn = zio_gang_node_alloc(gnpp); 1894 gbh = gn->gn_gbh; 1895 bzero(gbh, SPA_GANGBLOCKSIZE); 1896 1897 /* 1898 * Create the gang header. 1899 */ 1900 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 1901 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1902 1903 /* 1904 * Create and nowait the gang children. 1905 */ 1906 for (int g = 0; resid != 0; resid -= lsize, g++) { 1907 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 1908 SPA_MINBLOCKSIZE); 1909 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 1910 1911 zp.zp_checksum = gio->io_prop.zp_checksum; 1912 zp.zp_compress = ZIO_COMPRESS_OFF; 1913 zp.zp_type = DMU_OT_NONE; 1914 zp.zp_level = 0; 1915 zp.zp_copies = gio->io_prop.zp_copies; 1916 zp.zp_dedup = B_FALSE; 1917 zp.zp_dedup_verify = B_FALSE; 1918 zp.zp_nopwrite = B_FALSE; 1919 1920 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 1921 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 1922 zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g], 1923 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1924 &pio->io_bookmark)); 1925 } 1926 1927 /* 1928 * Set pio's pipeline to just wait for zio to finish. 1929 */ 1930 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1931 1932 zio_nowait(zio); 1933 1934 return (ZIO_PIPELINE_CONTINUE); 1935} 1936 1937/* 1938 * The zio_nop_write stage in the pipeline determines if allocating 1939 * a new bp is necessary. By leveraging a cryptographically secure checksum, 1940 * such as SHA256, we can compare the checksums of the new data and the old 1941 * to determine if allocating a new block is required. The nopwrite 1942 * feature can handle writes in either syncing or open context (i.e. zil 1943 * writes) and as a result is mutually exclusive with dedup. 1944 */ 1945static int 1946zio_nop_write(zio_t **ziop) 1947{ 1948 zio_t *zio = *ziop; 1949 blkptr_t *bp = zio->io_bp; 1950 blkptr_t *bp_orig = &zio->io_bp_orig; 1951 zio_prop_t *zp = &zio->io_prop; 1952 1953 ASSERT(BP_GET_LEVEL(bp) == 0); 1954 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1955 ASSERT(zp->zp_nopwrite); 1956 ASSERT(!zp->zp_dedup); 1957 ASSERT(zio->io_bp_override == NULL); 1958 ASSERT(IO_IS_ALLOCATING(zio)); 1959 1960 /* 1961 * Check to see if the original bp and the new bp have matching 1962 * characteristics (i.e. same checksum, compression algorithms, etc). 1963 * If they don't then just continue with the pipeline which will 1964 * allocate a new bp. 1965 */ 1966 if (BP_IS_HOLE(bp_orig) || 1967 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || 1968 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 1969 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 1970 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 1971 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 1972 return (ZIO_PIPELINE_CONTINUE); 1973 1974 /* 1975 * If the checksums match then reset the pipeline so that we 1976 * avoid allocating a new bp and issuing any I/O. 
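 *
 * For reference (editor's sketch, assuming the usual definition of
 * ZIO_CHECKSUM_EQUAL() in the headers): the comparison spans all four
 * 64-bit words of the 256-bit checksum, roughly
 *
 *	zc1.zc_word[0] == zc2.zc_word[0] && ... &&
 *	zc1.zc_word[3] == zc2.zc_word[3]
 *
 * so a nopwrite hit requires a full 256-bit match, never a partial one.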
1977 */ 1978 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 1979 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); 1980 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 1981 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 1982 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 1983 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 1984 sizeof (uint64_t)) == 0); 1985 1986 *bp = *bp_orig; 1987 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1988 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1989 } 1990 1991 return (ZIO_PIPELINE_CONTINUE); 1992} 1993 1994/* 1995 * ========================================================================== 1996 * Dedup 1997 * ========================================================================== 1998 */ 1999static void 2000zio_ddt_child_read_done(zio_t *zio) 2001{ 2002 blkptr_t *bp = zio->io_bp; 2003 ddt_entry_t *dde = zio->io_private; 2004 ddt_phys_t *ddp; 2005 zio_t *pio = zio_unique_parent(zio); 2006 2007 mutex_enter(&pio->io_lock); 2008 ddp = ddt_phys_select(dde, bp); 2009 if (zio->io_error == 0) 2010 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 2011 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 2012 dde->dde_repair_data = zio->io_data; 2013 else 2014 zio_buf_free(zio->io_data, zio->io_size); 2015 mutex_exit(&pio->io_lock); 2016} 2017 2018static int 2019zio_ddt_read_start(zio_t **ziop) 2020{ 2021 zio_t *zio = *ziop; 2022 blkptr_t *bp = zio->io_bp; 2023 2024 ASSERT(BP_GET_DEDUP(bp)); 2025 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2026 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2027 2028 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2029 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2030 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 2031 ddt_phys_t *ddp = dde->dde_phys; 2032 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 2033 blkptr_t blk; 2034 2035 ASSERT(zio->io_vsd == NULL); 2036 zio->io_vsd = dde; 2037 2038 if (ddp_self == NULL) 2039 return (ZIO_PIPELINE_CONTINUE); 2040 2041 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2042 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 2043 continue; 2044 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2045 &blk); 2046 zio_nowait(zio_read(zio, zio->io_spa, &blk, 2047 zio_buf_alloc(zio->io_size), zio->io_size, 2048 zio_ddt_child_read_done, dde, zio->io_priority, 2049 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 2050 &zio->io_bookmark)); 2051 } 2052 return (ZIO_PIPELINE_CONTINUE); 2053 } 2054 2055 zio_nowait(zio_read(zio, zio->io_spa, bp, 2056 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 2057 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2058 2059 return (ZIO_PIPELINE_CONTINUE); 2060} 2061 2062static int 2063zio_ddt_read_done(zio_t **ziop) 2064{ 2065 zio_t *zio = *ziop; 2066 blkptr_t *bp = zio->io_bp; 2067 2068 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 2069 return (ZIO_PIPELINE_STOP); 2070 2071 ASSERT(BP_GET_DEDUP(bp)); 2072 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2073 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2074 2075 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2076 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2077 ddt_entry_t *dde = zio->io_vsd; 2078 if (ddt == NULL) { 2079 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2080 return (ZIO_PIPELINE_CONTINUE); 2081 } 2082 if (dde == NULL) { 2083 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2084 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2085 return (ZIO_PIPELINE_STOP); 2086 } 2087 if (dde->dde_repair_data != NULL) { 2088 bcopy(dde->dde_repair_data, zio->io_data, 
zio->io_size); 2089 zio->io_child_error[ZIO_CHILD_DDT] = 0; 2090 } 2091 ddt_repair_done(ddt, dde); 2092 zio->io_vsd = NULL; 2093 } 2094 2095 ASSERT(zio->io_vsd == NULL); 2096 2097 return (ZIO_PIPELINE_CONTINUE); 2098} 2099 2100static boolean_t 2101zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2102{ 2103 spa_t *spa = zio->io_spa; 2104 2105 /* 2106 * Note: we compare the original data, not the transformed data, 2107 * because when zio->io_bp is an override bp, we will not have 2108 * pushed the I/O transforms. That's an important optimization 2109 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2110 */ 2111 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2112 zio_t *lio = dde->dde_lead_zio[p]; 2113 2114 if (lio != NULL) { 2115 return (lio->io_orig_size != zio->io_orig_size || 2116 bcmp(zio->io_orig_data, lio->io_orig_data, 2117 zio->io_orig_size) != 0); 2118 } 2119 } 2120 2121 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2122 ddt_phys_t *ddp = &dde->dde_phys[p]; 2123 2124 if (ddp->ddp_phys_birth != 0) { 2125 arc_buf_t *abuf = NULL; 2126 uint32_t aflags = ARC_WAIT; 2127 blkptr_t blk = *zio->io_bp; 2128 int error; 2129 2130 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2131 2132 ddt_exit(ddt); 2133 2134 error = arc_read(NULL, spa, &blk, 2135 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2136 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2137 &aflags, &zio->io_bookmark); 2138 2139 if (error == 0) { 2140 if (arc_buf_size(abuf) != zio->io_orig_size || 2141 bcmp(abuf->b_data, zio->io_orig_data, 2142 zio->io_orig_size) != 0) 2143 error = SET_ERROR(EEXIST); 2144 VERIFY(arc_buf_remove_ref(abuf, &abuf)); 2145 } 2146 2147 ddt_enter(ddt); 2148 return (error != 0); 2149 } 2150 } 2151 2152 return (B_FALSE); 2153} 2154 2155static void 2156zio_ddt_child_write_ready(zio_t *zio) 2157{ 2158 int p = zio->io_prop.zp_copies; 2159 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2160 ddt_entry_t *dde = zio->io_private; 2161 ddt_phys_t *ddp = &dde->dde_phys[p]; 2162 zio_t *pio; 2163 2164 if (zio->io_error) 2165 return; 2166 2167 ddt_enter(ddt); 2168 2169 ASSERT(dde->dde_lead_zio[p] == zio); 2170 2171 ddt_phys_fill(ddp, zio->io_bp); 2172 2173 while ((pio = zio_walk_parents(zio)) != NULL) 2174 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2175 2176 ddt_exit(ddt); 2177} 2178 2179static void 2180zio_ddt_child_write_done(zio_t *zio) 2181{ 2182 int p = zio->io_prop.zp_copies; 2183 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2184 ddt_entry_t *dde = zio->io_private; 2185 ddt_phys_t *ddp = &dde->dde_phys[p]; 2186 2187 ddt_enter(ddt); 2188 2189 ASSERT(ddp->ddp_refcnt == 0); 2190 ASSERT(dde->dde_lead_zio[p] == zio); 2191 dde->dde_lead_zio[p] = NULL; 2192 2193 if (zio->io_error == 0) { 2194 while (zio_walk_parents(zio) != NULL) 2195 ddt_phys_addref(ddp); 2196 } else { 2197 ddt_phys_clear(ddp); 2198 } 2199 2200 ddt_exit(ddt); 2201} 2202 2203static void 2204zio_ddt_ditto_write_done(zio_t *zio) 2205{ 2206 int p = DDT_PHYS_DITTO; 2207 zio_prop_t *zp = &zio->io_prop; 2208 blkptr_t *bp = zio->io_bp; 2209 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2210 ddt_entry_t *dde = zio->io_private; 2211 ddt_phys_t *ddp = &dde->dde_phys[p]; 2212 ddt_key_t *ddk = &dde->dde_key; 2213 2214 ddt_enter(ddt); 2215 2216 ASSERT(ddp->ddp_refcnt == 0); 2217 ASSERT(dde->dde_lead_zio[p] == zio); 2218 dde->dde_lead_zio[p] = NULL; 2219 2220 if (zio->io_error == 0) { 2221 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2222 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2223 ASSERT(zp->zp_copies == 
BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2224 if (ddp->ddp_phys_birth != 0) 2225 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2226 ddt_phys_fill(ddp, bp); 2227 } 2228 2229 ddt_exit(ddt); 2230} 2231 2232static int 2233zio_ddt_write(zio_t **ziop) 2234{ 2235 zio_t *zio = *ziop; 2236 spa_t *spa = zio->io_spa; 2237 blkptr_t *bp = zio->io_bp; 2238 uint64_t txg = zio->io_txg; 2239 zio_prop_t *zp = &zio->io_prop; 2240 int p = zp->zp_copies; 2241 int ditto_copies; 2242 zio_t *cio = NULL; 2243 zio_t *dio = NULL; 2244 ddt_t *ddt = ddt_select(spa, bp); 2245 ddt_entry_t *dde; 2246 ddt_phys_t *ddp; 2247 2248 ASSERT(BP_GET_DEDUP(bp)); 2249 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2250 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2251 2252 ddt_enter(ddt); 2253 dde = ddt_lookup(ddt, bp, B_TRUE); 2254 ddp = &dde->dde_phys[p]; 2255 2256 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2257 /* 2258 * If we're using a weak checksum, upgrade to a strong checksum 2259 * and try again. If we're already using a strong checksum, 2260 * we can't resolve it, so just convert to an ordinary write. 2261 * (And automatically e-mail a paper to Nature?) 2262 */ 2263 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 2264 zp->zp_checksum = spa_dedup_checksum(spa); 2265 zio_pop_transforms(zio); 2266 zio->io_stage = ZIO_STAGE_OPEN; 2267 BP_ZERO(bp); 2268 } else { 2269 zp->zp_dedup = B_FALSE; 2270 } 2271 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2272 ddt_exit(ddt); 2273 return (ZIO_PIPELINE_CONTINUE); 2274 } 2275 2276 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2277 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2278 2279 if (ditto_copies > ddt_ditto_copies_present(dde) && 2280 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2281 zio_prop_t czp = *zp; 2282 2283 czp.zp_copies = ditto_copies; 2284 2285 /* 2286 * If we arrived here with an override bp, we won't have run 2287 * the transform stack, so we won't have the data we need to 2288 * generate a child i/o. So, toss the override bp and restart. 2289 * This is safe, because using the override bp is just an 2290 * optimization; and it's rare, so the cost doesn't matter. 
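 *
 * In outline (editor's sketch of the code just below), the restart
 * rewinds the zio to the top of the write pipeline with the override
 * discarded:
 *
 *	zio_pop_transforms(zio);
 *	zio->io_stage = ZIO_STAGE_OPEN;
 *	zio->io_pipeline = ZIO_WRITE_PIPELINE;
 *	zio->io_bp_override = NULL;
 *	BP_ZERO(bp);
 *
 * after which the data passes through the transform stack normally.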
2291 */ 2292 if (zio->io_bp_override) { 2293 zio_pop_transforms(zio); 2294 zio->io_stage = ZIO_STAGE_OPEN; 2295 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2296 zio->io_bp_override = NULL; 2297 BP_ZERO(bp); 2298 ddt_exit(ddt); 2299 return (ZIO_PIPELINE_CONTINUE); 2300 } 2301 2302 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2303 zio->io_orig_size, &czp, NULL, NULL, 2304 zio_ddt_ditto_write_done, dde, zio->io_priority, 2305 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2306 2307 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2308 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2309 } 2310 2311 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2312 if (ddp->ddp_phys_birth != 0) 2313 ddt_bp_fill(ddp, bp, txg); 2314 if (dde->dde_lead_zio[p] != NULL) 2315 zio_add_child(zio, dde->dde_lead_zio[p]); 2316 else 2317 ddt_phys_addref(ddp); 2318 } else if (zio->io_bp_override) { 2319 ASSERT(bp->blk_birth == txg); 2320 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2321 ddt_phys_fill(ddp, bp); 2322 ddt_phys_addref(ddp); 2323 } else { 2324 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2325 zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, 2326 zio_ddt_child_write_done, dde, zio->io_priority, 2327 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2328 2329 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2330 dde->dde_lead_zio[p] = cio; 2331 } 2332 2333 ddt_exit(ddt); 2334 2335 if (cio) 2336 zio_nowait(cio); 2337 if (dio) 2338 zio_nowait(dio); 2339 2340 return (ZIO_PIPELINE_CONTINUE); 2341} 2342 2343ddt_entry_t *freedde; /* for debugging */ 2344 2345static int 2346zio_ddt_free(zio_t **ziop) 2347{ 2348 zio_t *zio = *ziop; 2349 spa_t *spa = zio->io_spa; 2350 blkptr_t *bp = zio->io_bp; 2351 ddt_t *ddt = ddt_select(spa, bp); 2352 ddt_entry_t *dde; 2353 ddt_phys_t *ddp; 2354 2355 ASSERT(BP_GET_DEDUP(bp)); 2356 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2357 2358 ddt_enter(ddt); 2359 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2360 ddp = ddt_phys_select(dde, bp); 2361 ddt_phys_decref(ddp); 2362 ddt_exit(ddt); 2363 2364 return (ZIO_PIPELINE_CONTINUE); 2365} 2366 2367/* 2368 * ========================================================================== 2369 * Allocate and free blocks 2370 * ========================================================================== 2371 */ 2372static int 2373zio_dva_allocate(zio_t **ziop) 2374{ 2375 zio_t *zio = *ziop; 2376 spa_t *spa = zio->io_spa; 2377 metaslab_class_t *mc = spa_normal_class(spa); 2378 blkptr_t *bp = zio->io_bp; 2379 int error; 2380 int flags = 0; 2381 2382 if (zio->io_gang_leader == NULL) { 2383 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2384 zio->io_gang_leader = zio; 2385 } 2386 2387 ASSERT(BP_IS_HOLE(bp)); 2388 ASSERT0(BP_GET_NDVAS(bp)); 2389 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2390 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2391 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2392 2393 /* 2394 * The dump device does not support gang blocks so allocation on 2395 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2396 * the "fast" gang feature. 2397 */ 2398 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2399 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
2400 METASLAB_GANG_CHILD : 0; 2401 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2402 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2403 2404 if (error) { 2405 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2406 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2407 error); 2408 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2409 return (zio_write_gang_block(zio)); 2410 zio->io_error = error; 2411 } 2412 2413 return (ZIO_PIPELINE_CONTINUE); 2414} 2415 2416static int 2417zio_dva_free(zio_t **ziop) 2418{ 2419 zio_t *zio = *ziop; 2420 2421 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2422 2423 return (ZIO_PIPELINE_CONTINUE); 2424} 2425 2426static int 2427zio_dva_claim(zio_t **ziop) 2428{ 2429 zio_t *zio = *ziop; 2430 int error; 2431 2432 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2433 if (error) 2434 zio->io_error = error; 2435 2436 return (ZIO_PIPELINE_CONTINUE); 2437} 2438 2439/* 2440 * Undo an allocation. This is used by zio_done() when an I/O fails 2441 * and we want to give back the block we just allocated. 2442 * This handles both normal blocks and gang blocks. 2443 */ 2444static void 2445zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2446{ 2447 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2448 ASSERT(zio->io_bp_override == NULL); 2449 2450 if (!BP_IS_HOLE(bp)) 2451 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2452 2453 if (gn != NULL) { 2454 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2455 zio_dva_unallocate(zio, gn->gn_child[g], 2456 &gn->gn_gbh->zg_blkptr[g]); 2457 } 2458 } 2459} 2460 2461/* 2462 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2463 */ 2464int 2465zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2466 uint64_t size, boolean_t use_slog) 2467{ 2468 int error = 1; 2469 2470 ASSERT(txg > spa_syncing_txg(spa)); 2471 2472 /* 2473 * ZIL blocks are always contiguous (i.e. not gang blocks) so we 2474 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2475 * when allocating them. 2476 */ 2477 if (use_slog) { 2478 error = metaslab_alloc(spa, spa_log_class(spa), size, 2479 new_bp, 1, txg, old_bp, 2480 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2481 } 2482 2483 if (error) { 2484 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2485 new_bp, 1, txg, old_bp, 2486 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2487 } 2488 2489 if (error == 0) { 2490 BP_SET_LSIZE(new_bp, size); 2491 BP_SET_PSIZE(new_bp, size); 2492 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2493 BP_SET_CHECKSUM(new_bp, 2494 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2495 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2496 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2497 BP_SET_LEVEL(new_bp, 0); 2498 BP_SET_DEDUP(new_bp, 0); 2499 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2500 } 2501 2502 return (error); 2503} 2504 2505/* 2506 * Free an intent log block.
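 *
 * Usage sketch (editor's illustration; see zil.c for the actual
 * callers): the ZIL retires a log block from syncing context roughly as
 *
 *	zio_free_zil(spa, txg, &lwb->lwb_blk);
 *
 * where lwb stands for the in-memory log write block being freed.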
2507 */ 2508void 2509zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2510{ 2511 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2512 ASSERT(!BP_IS_GANG(bp)); 2513 2514 zio_free(spa, txg, bp); 2515} 2516 2517/* 2518 * ========================================================================== 2519 * Read, write and delete to physical devices 2520 * ========================================================================== 2521 */ 2522static int 2523zio_vdev_io_start(zio_t **ziop) 2524{ 2525 zio_t *zio = *ziop; 2526 vdev_t *vd = zio->io_vd; 2527 uint64_t align; 2528 spa_t *spa = zio->io_spa; 2529 2530 ASSERT(zio->io_error == 0); 2531 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2532 2533 if (vd == NULL) { 2534 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2535 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2536 2537 /* 2538 * The mirror_ops handle multiple DVAs in a single BP. 2539 */ 2540 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2541 } 2542 2543 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) { 2544 trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); 2545 return (ZIO_PIPELINE_CONTINUE); 2546 } 2547 2548 /* 2549 * We keep track of time-sensitive I/Os so that the scan thread 2550 * can quickly react to certain workloads. In particular, we care 2551 * about non-scrubbing, top-level reads and writes with the following 2552 * characteristics: 2553 * - synchronous writes of user data to non-slog devices 2554 * - any reads of user data 2555 * When these conditions are met, adjust the timestamp of spa_last_io 2556 * which allows the scan thread to adjust its workload accordingly. 2557 */ 2558 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2559 vd == vd->vdev_top && !vd->vdev_islog && 2560 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2561 zio->io_txg != spa_syncing_txg(spa)) { 2562 uint64_t old = spa->spa_last_io; 2563 uint64_t new = ddi_get_lbolt64(); 2564 if (old != new) 2565 (void) atomic_cas_64(&spa->spa_last_io, old, new); 2566 } 2567 2568 align = 1ULL << vd->vdev_top->vdev_ashift; 2569 2570 if (P2PHASE(zio->io_size, align) != 0) { 2571 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2572 char *abuf = NULL; 2573 if (zio->io_type == ZIO_TYPE_READ || 2574 zio->io_type == ZIO_TYPE_WRITE) 2575 abuf = zio_buf_alloc(asize); 2576 ASSERT(vd == vd->vdev_top); 2577 if (zio->io_type == ZIO_TYPE_WRITE) { 2578 bcopy(zio->io_data, abuf, zio->io_size); 2579 bzero(abuf + zio->io_size, asize - zio->io_size); 2580 } 2581 zio_push_transform(zio, abuf, asize, abuf ? asize : 0, 2582 zio_subblock); 2583 } 2584 2585 ASSERT(P2PHASE(zio->io_offset, align) == 0); 2586 ASSERT(P2PHASE(zio->io_size, align) == 0); 2587 VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); 2588 2589 /* 2590 * If this is a repair I/O, and there's no self-healing involved -- 2591 * that is, we're just resilvering what we expect to resilver -- 2592 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2593 * This prevents spurious resilvering with nested replication. 2594 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2595 * A is out of date, we'll read from C+D, then use the data to 2596 * resilver A+B -- but we don't actually want to resilver B, just A. 2597 * The top-level mirror has no way to know this, so instead we just 2598 * discard unnecessary repairs as we work our way down the vdev tree. 2599 * The same logic applies to any form of nested replication: 2600 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 
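 *
 * Concretely (editor's sketch): a repair write of txg 1234 to vd is
 * issued only if
 *
 *	vdev_dtl_contains(vd, DTL_PARTIAL, 1234, 1)
 *
 * is true, i.e. vd actually missed that txg; otherwise the write is
 * discarded below via zio_vdev_io_bypass().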
2601 */ 2602 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2603 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2604 zio->io_txg != 0 && /* not a delegated i/o */ 2605 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2606 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2607 zio_vdev_io_bypass(zio); 2608 return (ZIO_PIPELINE_CONTINUE); 2609 } 2610 2611 if (vd->vdev_ops->vdev_op_leaf && 2612 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2613 2614 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) 2615 return (ZIO_PIPELINE_CONTINUE); 2616 2617 if ((zio = vdev_queue_io(zio)) == NULL) 2618 return (ZIO_PIPELINE_STOP); 2619 *ziop = zio; 2620 2621 if (!vdev_accessible(vd, zio)) { 2622 zio->io_error = SET_ERROR(ENXIO); 2623 zio_interrupt(zio); 2624 return (ZIO_PIPELINE_STOP); 2625 } 2626 } 2627 2628 /* 2629 * Note that we ignore repair writes for TRIM because they can conflict 2630 * with normal writes. This isn't an issue because, by definition, we 2631 * only repair blocks that aren't freed. 2632 */ 2633 if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE && 2634 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2635 if (!trim_map_write_start(zio)) 2636 return (ZIO_PIPELINE_STOP); 2637 } 2638 2639 return (vd->vdev_ops->vdev_op_io_start(zio)); 2640} 2641 2642static int 2643zio_vdev_io_done(zio_t **ziop) 2644{ 2645 zio_t *zio = *ziop; 2646 vdev_t *vd = zio->io_vd; 2647 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 2648 boolean_t unexpected_error = B_FALSE; 2649 2650 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2651 return (ZIO_PIPELINE_STOP); 2652 2653 ASSERT(zio->io_type == ZIO_TYPE_READ || 2654 zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); 2655 2656 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2657 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2658 2659 if (zio->io_type == ZIO_TYPE_WRITE && 2660 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 2661 trim_map_write_done(zio); 2662 2663 vdev_queue_io_done(zio); 2664 2665 if (zio->io_type == ZIO_TYPE_WRITE) 2666 vdev_cache_write(zio); 2667 2668 if (zio_injection_enabled && zio->io_error == 0) 2669 zio->io_error = zio_handle_device_injection(vd, 2670 zio, EIO); 2671 2672 if (zio_injection_enabled && zio->io_error == 0) 2673 zio->io_error = zio_handle_label_injection(zio, EIO); 2674 2675 if (zio->io_error) { 2676 if (!vdev_accessible(vd, zio)) { 2677 zio->io_error = SET_ERROR(ENXIO); 2678 } else { 2679 unexpected_error = B_TRUE; 2680 } 2681 } 2682 } 2683 2684 ops->vdev_op_io_done(zio); 2685 2686 if (unexpected_error) 2687 VERIFY(vdev_probe(vd, zio) == NULL); 2688 2689 return (ZIO_PIPELINE_CONTINUE); 2690} 2691 2692/* 2693 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2694 * disk, and use that to finish the checksum ereport later. 
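 *
 * Flow sketch (editor's gloss): zio_vsd_default_cksum_report() below
 * stashes a private copy of the bad data,
 *
 *	zcr->zcr_cbdata = buf;		(copy of zio->io_data)
 *	zcr->zcr_finish = zio_vsd_default_cksum_finish;
 *
 * and zio_done() later invokes zcr_finish with the good data, if any,
 * to complete the ereport.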
2695 */ 2696static void 2697zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2698 const void *good_buf) 2699{ 2700 /* no processing needed */ 2701 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2702} 2703 2704/*ARGSUSED*/ 2705void 2706zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2707{ 2708 void *buf = zio_buf_alloc(zio->io_size); 2709 2710 bcopy(zio->io_data, buf, zio->io_size); 2711 2712 zcr->zcr_cbinfo = zio->io_size; 2713 zcr->zcr_cbdata = buf; 2714 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2715 zcr->zcr_free = zio_buf_free; 2716} 2717 2718static int 2719zio_vdev_io_assess(zio_t **ziop) 2720{ 2721 zio_t *zio = *ziop; 2722 vdev_t *vd = zio->io_vd; 2723 2724 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2725 return (ZIO_PIPELINE_STOP); 2726 2727 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2728 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2729 2730 if (zio->io_vsd != NULL) { 2731 zio->io_vsd_ops->vsd_free(zio); 2732 zio->io_vsd = NULL; 2733 } 2734 2735 if (zio_injection_enabled && zio->io_error == 0) 2736 zio->io_error = zio_handle_fault_injection(zio, EIO); 2737 2738 if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM) 2739 switch (zio->io_error) { 2740 case 0: 2741 ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 2742 ZIO_TRIM_STAT_BUMP(success); 2743 break; 2744 case EOPNOTSUPP: 2745 ZIO_TRIM_STAT_BUMP(unsupported); 2746 break; 2747 default: 2748 ZIO_TRIM_STAT_BUMP(failed); 2749 break; 2750 } 2751 2752 /* 2753 * If the I/O failed, determine whether we should attempt to retry it. 2754 * 2755 * On retry, we cut in line in the issue queue, since we don't want 2756 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 2757 */ 2758 if (zio->io_error && vd == NULL && 2759 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2760 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2761 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2762 zio->io_error = 0; 2763 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2764 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2765 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2766 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2767 zio_requeue_io_start_cut_in_line); 2768 return (ZIO_PIPELINE_STOP); 2769 } 2770 2771 /* 2772 * If we got an error on a leaf device, convert it to ENXIO 2773 * if the device is not accessible at all. 2774 */ 2775 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2776 !vdev_accessible(vd, zio)) 2777 zio->io_error = SET_ERROR(ENXIO); 2778 2779 /* 2780 * If we can't write to an interior vdev (mirror or RAID-Z), 2781 * set vdev_cant_write so that we stop trying to allocate from it. 
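 *
 * For example (editor's illustration): if both sides of a two-way
 * mirror fail, the interior mirror vdev itself returns ENXIO on write;
 * flagging vdev_cant_write below steers future allocations away from
 * it until it becomes writeable again.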
*/ 2783 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2784 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 2785 vd->vdev_cant_write = B_TRUE; 2786 } 2787 2788 if (zio->io_error) 2789 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2790 2791 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2792 zio->io_physdone != NULL) { 2793 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 2794 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 2795 zio->io_physdone(zio->io_logical); 2796 } 2797 2798 return (ZIO_PIPELINE_CONTINUE); 2799} 2800 2801void 2802zio_vdev_io_reissue(zio_t *zio) 2803{ 2804 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2805 ASSERT(zio->io_error == 0); 2806 2807 zio->io_stage >>= 1; 2808} 2809 2810void 2811zio_vdev_io_redone(zio_t *zio) 2812{ 2813 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2814 2815 zio->io_stage >>= 1; 2816} 2817 2818void 2819zio_vdev_io_bypass(zio_t *zio) 2820{ 2821 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2822 ASSERT(zio->io_error == 0); 2823 2824 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2825 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2826} 2827 2828/* 2829 * ========================================================================== 2830 * Generate and verify checksums 2831 * ========================================================================== 2832 */ 2833static int 2834zio_checksum_generate(zio_t **ziop) 2835{ 2836 zio_t *zio = *ziop; 2837 blkptr_t *bp = zio->io_bp; 2838 enum zio_checksum checksum; 2839 2840 if (bp == NULL) { 2841 /* 2842 * This is zio_write_phys(). 2843 * We're either generating a label checksum, or none at all. 2844 */ 2845 checksum = zio->io_prop.zp_checksum; 2846 2847 if (checksum == ZIO_CHECKSUM_OFF) 2848 return (ZIO_PIPELINE_CONTINUE); 2849 2850 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 2851 } else { 2852 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 2853 ASSERT(!IO_IS_ALLOCATING(zio)); 2854 checksum = ZIO_CHECKSUM_GANG_HEADER; 2855 } else { 2856 checksum = BP_GET_CHECKSUM(bp); 2857 } 2858 } 2859 2860 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2861 2862 return (ZIO_PIPELINE_CONTINUE); 2863} 2864 2865static int 2866zio_checksum_verify(zio_t **ziop) 2867{ 2868 zio_t *zio = *ziop; 2869 zio_bad_cksum_t info; 2870 blkptr_t *bp = zio->io_bp; 2871 int error; 2872 2873 ASSERT(zio->io_vd != NULL); 2874 2875 if (bp == NULL) { 2876 /* 2877 * This is zio_read_phys(). 2878 * We're either verifying a label checksum, or nothing at all. 2879 */ 2880 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 2881 return (ZIO_PIPELINE_CONTINUE); 2882 2883 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 2884 } 2885 2886 if ((error = zio_checksum_error(zio, &info)) != 0) { 2887 zio->io_error = error; 2888 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2889 zfs_ereport_start_checksum(zio->io_spa, 2890 zio->io_vd, zio, zio->io_offset, 2891 zio->io_size, NULL, &info); 2892 } 2893 } 2894 2895 return (ZIO_PIPELINE_CONTINUE); 2896} 2897 2898/* 2899 * Called by RAID-Z to ensure we don't compute the checksum twice. 2900 */ 2901void 2902zio_checksum_verified(zio_t *zio) 2903{ 2904 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2905} 2906 2907/* 2908 * ========================================================================== 2909 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 2910 * An error of 0 indicates success. ENXIO indicates whole-device failure, 2911 * which may be transient (e.g. unplugged) or permanent.
ECKSUM and EIO 2912 * indicate errors that are specific to one I/O, and most likely permanent. 2913 * Any other error is presumed to be worse because we weren't expecting it. 2914 * ========================================================================== 2915 */ 2916int 2917zio_worst_error(int e1, int e2) 2918{ 2919 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 2920 int r1, r2; 2921 2922 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 2923 if (e1 == zio_error_rank[r1]) 2924 break; 2925 2926 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 2927 if (e2 == zio_error_rank[r2]) 2928 break; 2929 2930 return (r1 > r2 ? e1 : e2); 2931} 2932 2933/* 2934 * ========================================================================== 2935 * I/O completion 2936 * ========================================================================== 2937 */ 2938static int 2939zio_ready(zio_t **ziop) 2940{ 2941 zio_t *zio = *ziop; 2942 blkptr_t *bp = zio->io_bp; 2943 zio_t *pio, *pio_next; 2944 2945 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 2946 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 2947 return (ZIO_PIPELINE_STOP); 2948 2949 if (zio->io_ready) { 2950 ASSERT(IO_IS_ALLOCATING(zio)); 2951 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 2952 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 2953 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 2954 2955 zio->io_ready(zio); 2956 } 2957 2958 if (bp != NULL && bp != &zio->io_bp_copy) 2959 zio->io_bp_copy = *bp; 2960 2961 if (zio->io_error) 2962 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2963 2964 mutex_enter(&zio->io_lock); 2965 zio->io_state[ZIO_WAIT_READY] = 1; 2966 pio = zio_walk_parents(zio); 2967 mutex_exit(&zio->io_lock); 2968 2969 /* 2970 * As we notify zio's parents, new parents could be added. 2971 * New parents go to the head of zio's io_parent_list, however, 2972 * so we will (correctly) not notify them. The remainder of zio's 2973 * io_parent_list, from 'pio_next' onward, cannot change because 2974 * all parents must wait for us to be done before they can be done. 2975 */ 2976 for (; pio != NULL; pio = pio_next) { 2977 pio_next = zio_walk_parents(zio); 2978 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 2979 } 2980 2981 if (zio->io_flags & ZIO_FLAG_NODATA) { 2982 if (BP_IS_GANG(bp)) { 2983 zio->io_flags &= ~ZIO_FLAG_NODATA; 2984 } else { 2985 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 2986 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2987 } 2988 } 2989 2990 if (zio_injection_enabled && 2991 zio->io_spa->spa_syncing_txg == zio->io_txg) 2992 zio_handle_ignored_writes(zio); 2993 2994 return (ZIO_PIPELINE_CONTINUE); 2995} 2996 2997static int 2998zio_done(zio_t **ziop) 2999{ 3000 zio_t *zio = *ziop; 3001 spa_t *spa = zio->io_spa; 3002 zio_t *lio = zio->io_logical; 3003 blkptr_t *bp = zio->io_bp; 3004 vdev_t *vd = zio->io_vd; 3005 uint64_t psize = zio->io_size; 3006 zio_t *pio, *pio_next; 3007 3008 /* 3009 * If our children haven't all completed, 3010 * wait for them and then repeat this pipeline stage. 
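 *
 * Pattern sketch (editor's gloss): each zio_wait_for_children() call
 * below returns B_TRUE while outstanding children remain, e.g.
 *
 *	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
 *		return (ZIO_PIPELINE_STOP);
 *
 * and the completion of the last child re-dispatches this stage via
 * zio_notify_parent().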
3011 */ 3012 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 3013 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 3014 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 3015 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 3016 return (ZIO_PIPELINE_STOP); 3017 3018 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 3019 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3020 ASSERT(zio->io_children[c][w] == 0); 3021 3022 if (bp != NULL) { 3023 ASSERT(bp->blk_pad[0] == 0); 3024 ASSERT(bp->blk_pad[1] == 0); 3025 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 3026 (bp == zio_unique_parent(zio)->io_bp)); 3027 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 3028 zio->io_bp_override == NULL && 3029 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 3030 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 3031 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 3032 ASSERT(BP_COUNT_GANG(bp) == 0 || 3033 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 3034 } 3035 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 3036 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 3037 } 3038 3039 /* 3040 * If there were child vdev/gang/ddt errors, they apply to us now. 3041 */ 3042 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 3043 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 3044 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 3045 3046 /* 3047 * If the I/O on the transformed data was successful, generate any 3048 * checksum reports now while we still have the transformed data. 3049 */ 3050 if (zio->io_error == 0) { 3051 while (zio->io_cksum_report != NULL) { 3052 zio_cksum_report_t *zcr = zio->io_cksum_report; 3053 uint64_t align = zcr->zcr_align; 3054 uint64_t asize = P2ROUNDUP(psize, align); 3055 char *abuf = zio->io_data; 3056 3057 if (asize != psize) { 3058 abuf = zio_buf_alloc(asize); 3059 bcopy(zio->io_data, abuf, psize); 3060 bzero(abuf + psize, asize - psize); 3061 } 3062 3063 zio->io_cksum_report = zcr->zcr_next; 3064 zcr->zcr_next = NULL; 3065 zcr->zcr_finish(zcr, abuf); 3066 zfs_ereport_free_checksum(zcr); 3067 3068 if (asize != psize) 3069 zio_buf_free(abuf, asize); 3070 } 3071 } 3072 3073 zio_pop_transforms(zio); /* note: may set zio->io_error */ 3074 3075 vdev_stat_update(zio, psize); 3076 3077 if (zio->io_error) { 3078 /* 3079 * If this I/O is attached to a particular vdev, 3080 * generate an error message describing the I/O failure 3081 * at the block level. We ignore these errors if the 3082 * device is currently unavailable. 3083 */ 3084 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3085 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3086 3087 if ((zio->io_error == EIO || !(zio->io_flags & 3088 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3089 zio == lio) { 3090 /* 3091 * For logical I/O requests, tell the SPA to log the 3092 * error and generate a logical data ereport. 3093 */ 3094 spa_log_error(spa, zio); 3095 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3096 0, 0); 3097 } 3098 } 3099 3100 if (zio->io_error && zio == lio) { 3101 /* 3102 * Determine whether zio should be reexecuted. This will 3103 * propagate all the way to the root via zio_notify_parent(). 
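 *
 * Two flavors are set below (editor's sketch): ZIO_REEXECUTE_NOW for
 * errors worth retrying immediately, and ZIO_REEXECUTE_SUSPEND to park
 * the i/o until the pool resumes; for an allocating i/o this amounts to
 *
 *	zio->io_reexecute |= (zio->io_error == ENOSPC ?
 *	    ZIO_REEXECUTE_SUSPEND : ZIO_REEXECUTE_NOW);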
3104 */ 3105 ASSERT(vd == NULL && bp != NULL); 3106 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3107 3108 if (IO_IS_ALLOCATING(zio) && 3109 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3110 if (zio->io_error != ENOSPC) 3111 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3112 else 3113 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3114 } 3115 3116 if ((zio->io_type == ZIO_TYPE_READ || 3117 zio->io_type == ZIO_TYPE_FREE) && 3118 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3119 zio->io_error == ENXIO && 3120 spa_load_state(spa) == SPA_LOAD_NONE && 3121 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3122 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3123 3124 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3125 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3126 3127 /* 3128 * Here is a possibly good place to attempt to do 3129 * either combinatorial reconstruction or error correction 3130 * based on checksums. It also might be a good place 3131 * to send out preliminary ereports before we suspend 3132 * processing. 3133 */ 3134 } 3135 3136 /* 3137 * If there were logical child errors, they apply to us now. 3138 * We defer this until now to avoid conflating logical child 3139 * errors with errors that happened to the zio itself when 3140 * updating vdev stats and reporting FMA events above. 3141 */ 3142 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3143 3144 if ((zio->io_error || zio->io_reexecute) && 3145 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3146 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3147 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3148 3149 zio_gang_tree_free(&zio->io_gang_tree); 3150 3151 /* 3152 * Godfather I/Os should never suspend. 3153 */ 3154 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3155 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3156 zio->io_reexecute = 0; 3157 3158 if (zio->io_reexecute) { 3159 /* 3160 * This is a logical I/O that wants to reexecute. 3161 * 3162 * Reexecute is top-down. When an i/o fails, if it's not 3163 * the root, it simply notifies its parent and sticks around. 3164 * The parent, seeing that it still has children in zio_done(), 3165 * does the same. This percolates all the way up to the root. 3166 * The root i/o will reexecute or suspend the entire tree. 3167 * 3168 * This approach ensures that zio_reexecute() honors 3169 * all the original i/o dependency relationships, e.g. 3170 * parents not executing until children are ready. 3171 */ 3172 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3173 3174 zio->io_gang_leader = NULL; 3175 3176 mutex_enter(&zio->io_lock); 3177 zio->io_state[ZIO_WAIT_DONE] = 1; 3178 mutex_exit(&zio->io_lock); 3179 3180 /* 3181 * "The Godfather" I/O monitors its children but is 3182 * not a true parent to them. It will track them through 3183 * the pipeline but severs its ties whenever they get into 3184 * trouble (e.g. suspended). This allows "The Godfather" 3185 * I/O to return status without blocking. 3186 */ 3187 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3188 zio_link_t *zl = zio->io_walk_link; 3189 pio_next = zio_walk_parents(zio); 3190 3191 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3192 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3193 zio_remove_child(pio, zio, zl); 3194 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3195 } 3196 } 3197 3198 if ((pio = zio_unique_parent(zio)) != NULL) { 3199 /* 3200 * We're not a root i/o, so there's nothing to do 3201 * but notify our parent. 
Don't propagate errors 3202 * upward since we haven't permanently failed yet. 3203 */ 3204 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3205 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3206 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3207 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3208 /* 3209 * We'd fail again if we reexecuted now, so suspend 3210 * until conditions improve (e.g. device comes online). 3211 */ 3212 zio_suspend(spa, zio); 3213 } else { 3214 /* 3215 * Reexecution is potentially a huge amount of work. 3216 * Hand it off to the otherwise-unused claim taskq. 3217 */ 3218#if defined(illumos) || !defined(_KERNEL) 3219 ASSERT(zio->io_tqent.tqent_next == NULL); 3220#else 3221 ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 3222#endif 3223 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3224 ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3225 0, &zio->io_tqent); 3226 } 3227 return (ZIO_PIPELINE_STOP); 3228 } 3229 3230 ASSERT(zio->io_child_count == 0); 3231 ASSERT(zio->io_reexecute == 0); 3232 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3233 3234 /* 3235 * Report any checksum errors, since the I/O is complete. 3236 */ 3237 while (zio->io_cksum_report != NULL) { 3238 zio_cksum_report_t *zcr = zio->io_cksum_report; 3239 zio->io_cksum_report = zcr->zcr_next; 3240 zcr->zcr_next = NULL; 3241 zcr->zcr_finish(zcr, NULL); 3242 zfs_ereport_free_checksum(zcr); 3243 } 3244 3245 /* 3246 * It is the responsibility of the done callback to ensure that this 3247 * particular zio is no longer discoverable for adoption, and as 3248 * such, cannot acquire any new parents. 3249 */ 3250 if (zio->io_done) 3251 zio->io_done(zio); 3252 3253 mutex_enter(&zio->io_lock); 3254 zio->io_state[ZIO_WAIT_DONE] = 1; 3255 mutex_exit(&zio->io_lock); 3256 3257 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3258 zio_link_t *zl = zio->io_walk_link; 3259 pio_next = zio_walk_parents(zio); 3260 zio_remove_child(pio, zio, zl); 3261 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3262 } 3263 3264 if (zio->io_waiter != NULL) { 3265 mutex_enter(&zio->io_lock); 3266 zio->io_executor = NULL; 3267 cv_broadcast(&zio->io_cv); 3268 mutex_exit(&zio->io_lock); 3269 } else { 3270 zio_destroy(zio); 3271 } 3272 3273 return (ZIO_PIPELINE_STOP); 3274} 3275 3276/* 3277 * ========================================================================== 3278 * I/O pipeline definition 3279 * ========================================================================== 3280 */ 3281static zio_pipe_stage_t *zio_pipeline[] = { 3282 NULL, 3283 zio_read_bp_init, 3284 zio_free_bp_init, 3285 zio_issue_async, 3286 zio_write_bp_init, 3287 zio_checksum_generate, 3288 zio_nop_write, 3289 zio_ddt_read_start, 3290 zio_ddt_read_done, 3291 zio_ddt_write, 3292 zio_ddt_free, 3293 zio_gang_assemble, 3294 zio_gang_issue, 3295 zio_dva_allocate, 3296 zio_dva_free, 3297 zio_dva_claim, 3298 zio_ready, 3299 zio_vdev_io_start, 3300 zio_vdev_io_done, 3301 zio_vdev_io_assess, 3302 zio_checksum_verify, 3303 zio_done 3304}; 3305 3306/* dnp is the dnode for zb1->zb_object */ 3307boolean_t 3308zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, 3309 const zbookmark_t *zb2) 3310{ 3311 uint64_t zb1nextL0, zb2thisobj; 3312 3313 ASSERT(zb1->zb_objset == zb2->zb_objset); 3314 ASSERT(zb2->zb_level == 0); 3315 3316 /* 3317 * A bookmark in the deadlist is considered to be after 3318 * everything else. 
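 *
 * That is (editor's note): when zb2 names DMU_DEADLIST_OBJECT, the
 * test below returns B_TRUE for any zb1, so every ordinary bookmark
 * sorts before a deadlist bookmark.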
3319 */ 3320 if (zb2->zb_object == DMU_DEADLIST_OBJECT) 3321 return (B_TRUE); 3322 3323 /* The objset_phys_t isn't before anything. */ 3324 if (dnp == NULL) 3325 return (B_FALSE); 3326 3327 zb1nextL0 = (zb1->zb_blkid + 1) << 3328 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 3329 3330 zb2thisobj = zb2->zb_object ? zb2->zb_object : 3331 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 3332 3333 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3334 uint64_t nextobj = zb1nextL0 * 3335 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 3336 return (nextobj <= zb2thisobj); 3337 } 3338 3339 if (zb1->zb_object < zb2thisobj) 3340 return (B_TRUE); 3341 if (zb1->zb_object > zb2thisobj) 3342 return (B_FALSE); 3343 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 3344 return (B_FALSE); 3345 return (zb1nextL0 <= zb2->zb_blkid); 3346} 3347
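
/*
 * Usage sketch (editor's illustration, not part of the original file):
 * a resuming scrub can use zbookmark_is_before() to skip blocks that an
 * earlier pass already covered, along the lines of
 *
 *	if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
 *		return;		(already visited; skip this block)
 *
 * where scn is the hypothetical scan state; see dsl_scan.c for the
 * actual resume check.
 */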