1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23265740Sdelphij * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 24260742Savg * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
/*
 * Whether ZIO buffers are allocated from per-size uma(9) kmem caches
 * (see zio_init()) or straight from kmem_alloc().  Defaults on for amd64.
 * Boot-time tunable only (CTLFLAG_RDTUN).
 */
#if defined(__amd64__)
static int zio_use_uma = 1;
#else
static int zio_use_uma = 0;
#endif
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
    "Use uma(9) for ZIO allocations");
/*
 * When set, metadata buffers are allocated with KM_NODEBUG as well, so
 * they are excluded from kernel crash dumps (see zio_buf_alloc()).
 */
static int zio_exclude_metadata = 0;
TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
    "Exclude metadata buffers from dumps as well");

/*
 * Named kstat counters for TRIM activity; exported via the "zfs:0:zio_trim"
 * kstat installed in zio_init().
 */
zio_trim_stats_t zio_trim_stats = {
	{ "bytes",		KSTAT_DATA_UINT64,
	    "Number of bytes successfully TRIMmed" },
	{ "success",		KSTAT_DATA_UINT64,
	    "Number of successful TRIM requests" },
	{ "unsupported",	KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed because TRIM is not supported" },
	{ "failed",		KSTAT_DATA_UINT64,
	    "Number of TRIM requests that failed for reasons other than not supported" },
};

/* Handle for the virtual kstat above; created/destroyed in zio_init/fini. */
static kstat_t *zio_trim_ksp;

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
/* Human-readable names indexed by zio_type_t, e.g. for taskq naming. */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;	/* cache of zio_t structures */
kmem_cache_t *zio_link_cache;	/* cache of parent/child zio_link_t nodes */
/*
 * Per-size buffer caches, one slot per SPA_MINBLOCKSIZE multiple; slots
 * without a dedicated cache alias the next larger one (see zio_init()).
 * The "data" caches hold file data that is excluded from crash dumps.
 */
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

/*
 * Buffers at or below this size get allocator debugging; larger caches are
 * created with KMC_NODEBUG.  Debugging is off entirely in non-debug builds.
 */
#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

/*
 * Create the zio/zio_link caches and, when zio_use_uma is set, the per-size
 * buffer caches.  Also initializes zio injection and installs the TRIM kstat.
 */
void
zio_init(void)
{
	size_t c;
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	/* Without UMA, zio_buf_alloc() falls back to kmem_alloc(). */
	if (!zio_use_uma)
		goto out;

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		/* Reduce p2 to the largest power of 2 <= size. */
		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

#ifdef illumos
#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
#endif /* illumos */
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
			align = PAGESIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = p2 >> 2;
		}

		/* align == 0 means this size gets no dedicated cache. */
		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL,
			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
		}
	}

	/*
	 * Walk backwards, aliasing each empty slot to the next larger cache
	 * so that every size class resolves to some cache.
	 */
	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}
out:

	zio_inject_init();

	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof(zio_trim_stats) / sizeof(kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	/* Virtual kstat: point ks_data at our static counters. */
	if (zio_trim_ksp != NULL) {
		zio_trim_ksp->ks_data = &zio_trim_stats;
		kstat_install(zio_trim_ksp);
	}
}

/*
 * Tear down everything zio_init() created.  Aliased cache slots (see above)
 * are compared against the previously destroyed cache so each underlying
 * cache is destroyed exactly once.
 */
void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();

	if (zio_trim_ksp != NULL) {
		kstat_delete(zio_trim_ksp);
		zio_trim_ksp = NULL;
	}
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	/* Index of the size-class cache: (size-1) >> SPA_MINBLOCKSHIFT. */
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;

	ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP|flags));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the amount
 * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 * of kernel heap dumped to disk when the kernel panics)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
}

/* Return a metadata buffer obtained from zio_buf_alloc(); size must match. */
void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

/* Return a data buffer obtained from zio_data_buf_alloc(); size must match. */
void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_data_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
/*
 * Push a transform record onto the zio's transform stack, swapping in the
 * new (data, size) and remembering the originals so zio_pop_transforms()
 * can restore them.  bufsize != 0 means 'data' is a zio_buf to free on pop.
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
	zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

/*
 * Unwind the transform stack: run each transform callback (if any), free
 * the transform buffer (if owned), and restore the original data/size.
 */
static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
/* Copy out the leading 'size' bytes of a larger read into the caller's buf. */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

/* Decompress io_data into 'data'; a decompression failure becomes EIO. */
static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children must
 *        continue calling these functions until they return NULL.
 *        Otherwise, the next caller will pick up the list walk in
 *        some indeterminate state.  (Otherwise every caller would
 *        have to pass in a cookie to keep the state represented by
 *        io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	/* Advance the cursor (stored in io_walk_link) one link at a time. */
	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

/* Return cio's sole parent; VERIFYs that there is exactly one. */
zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

/*
 * Link cio as a child of pio, updating both sides' lists, counts, and
 * pio's outstanding-children tallies for each wait type.
 */
void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	/* Lock order: child's io_lock before parent's (matches remove). */
	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	/* Only count the child for wait states it has not yet reached. */
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

/* Undo zio_add_child(): unlink zl from both lists and free it. */
static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	/* Same lock order as zio_add_child(): child before parent. */
	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

/*
 * If any children of the given type have not yet reached the given wait
 * state, stall this zio on their counter and return B_TRUE; the last child
 * to finish will restart us via zio_notify_parent().
 */
static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		/*
		 * Step io_stage back one bit so this stage re-executes when
		 * the stall clears (stages appear to be one-hot bits --
		 * NOTE(review): confirm against zio_impl.h).
		 */
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

/*
 * Called when a child reaches the given wait state: propagate its error and
 * reexecute flags into pio, decrement the outstanding count, and if this was
 * the last child pio was stalled on, resume pio's pipeline.
 */
static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);

	(*countp)--;

	if (*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		/* Drop the lock before re-entering the pipeline. */
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	/* Adopt the child-type's worst error if we have none of our own. */
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
/*
 * Common constructor for all zio types: allocates and zeroes a zio_t,
 * classifies its child type (vdev/gang/ddt/logical), snapshots the bp,
 * records the original data/size/flags/stage/pipeline for reexecution,
 * and links it under 'pio' if a parent was given.
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	/* FREE zios are exempt from the size limit. */
	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	/* Keep originals so the pipeline can be restarted from scratch. */
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	/* Mark wait states already satisfied by the starting stage. */
	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		/* Inherit logical root and gang leader from the parent. */
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}


	return (zio);
}

/* Destructor counterpart of zio_create(). */
static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

/* Create a no-op zio, usable as an interlock/grouping point. */
zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

/* A root zio is simply a parentless, vdev-less null zio. */
zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}

/* Create a logical read of bp into 'data'. */
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

/*
 * Create a logical write described by the write policy 'zp'.  'ready' fires
 * when the bp is filled in, 'physdone' when all physical child I/O completes.
 */
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
    void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	/* Sanity-check every field of the write policy. */
	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_physdone = physdone;
	zio->io_prop = *zp;

	/*
	 * Data can be NULL if we are going to call zio_write_override() to
	 * provide the already-allocated BP.  But we may need the data to
	 * verify a dedup hit (if requested).  In this case, don't try to
	 * dedup (just take the already-allocated BP verbatim).
	 */
	if (data == NULL && zio->io_prop.zp_dedup_verify) {
		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
	}

	return (zio);
}

/* Create a write that reuses bp's existing location (no DVA allocation). */
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

/* Convert an open write into one using the bp dmu_sync() already wrote. */
void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync() keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}

/*
 * Queue bp for freeing in txg, or free it immediately when it is simple
 * (non-gang, non-dedup, no TRIM) and belongs to the currently-syncing txg.
 */
void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{

	/*
	 * The check for EMBEDDED is a performance optimization.  We
	 * process the free here (by ignoring it) rather than
	 * putting it on the list and then processing it in zio_free_sync().
	 */
	if (BP_IS_EMBEDDED(bp))
		return;
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
		    BP_GET_PSIZE(bp), 0)));
	}
}

/*
 * Create a FREE zio for bp in the currently-syncing txg.  Embedded bps have
 * no on-disk allocation, so they degenerate to a null zio.
 */
zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    uint64_t size, enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	/* With TRIM, frees must reach the vdev layer to issue the TRIM. */
	if (zfs_trim_enabled)
		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
		    ZIO_STAGE_VDEV_IO_ASSESS;
	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	flags |= ZIO_FLAG_DONT_QUEUE;

	zio = zio_create(pio, spa, txg, bp, NULL, size,
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	dprintf_bp(bp, "claiming in txg %llu", txg);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
841168404Spjd */ 842168404Spjd ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 843219089Spjd ASSERT(txg == spa_first_txg(spa) || txg == 0); 844219089Spjd ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ 845168404Spjd 846185029Spjd zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 847185029Spjd done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 848185029Spjd NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 849168404Spjd 850168404Spjd return (zio); 851168404Spjd} 852168404Spjd 853168404Spjdzio_t * 854240868Spjdzio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, 855260763Savg uint64_t size, zio_done_func_t *done, void *private, 856270312Ssmh zio_priority_t priority, enum zio_flag flags) 857168404Spjd{ 858168404Spjd zio_t *zio; 859168404Spjd int c; 860168404Spjd 861168404Spjd if (vd->vdev_children == 0) { 862240868Spjd zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private, 863270312Ssmh ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL, 864168404Spjd ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 865168404Spjd 866168404Spjd zio->io_cmd = cmd; 867168404Spjd } else { 868209962Smm zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 869168404Spjd 870168404Spjd for (c = 0; c < vd->vdev_children; c++) 871168404Spjd zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 872270312Ssmh offset, size, done, private, priority, flags)); 873168404Spjd } 874168404Spjd 875168404Spjd return (zio); 876168404Spjd} 877168404Spjd 878168404Spjdzio_t * 879168404Spjdzio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 880168404Spjd void *data, int checksum, zio_done_func_t *done, void *private, 881260763Savg zio_priority_t priority, enum zio_flag flags, boolean_t labels) 882168404Spjd{ 883168404Spjd zio_t *zio; 884168404Spjd 885185029Spjd ASSERT(vd->vdev_children == 0); 886185029Spjd ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 887185029Spjd offset >= vd->vdev_psize - 
VDEV_LABEL_END_SIZE); 888185029Spjd ASSERT3U(offset + size, <=, vd->vdev_psize); 889168404Spjd 890185029Spjd zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 891269416Sdelphij ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, 892269416Sdelphij NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 893168404Spjd 894185029Spjd zio->io_prop.zp_checksum = checksum; 895168404Spjd 896168404Spjd return (zio); 897168404Spjd} 898168404Spjd 899168404Spjdzio_t * 900168404Spjdzio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 901168404Spjd void *data, int checksum, zio_done_func_t *done, void *private, 902260763Savg zio_priority_t priority, enum zio_flag flags, boolean_t labels) 903168404Spjd{ 904168404Spjd zio_t *zio; 905168404Spjd 906185029Spjd ASSERT(vd->vdev_children == 0); 907185029Spjd ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 908185029Spjd offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 909185029Spjd ASSERT3U(offset + size, <=, vd->vdev_psize); 910168404Spjd 911185029Spjd zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 912269416Sdelphij ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, 913269416Sdelphij NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 914168404Spjd 915185029Spjd zio->io_prop.zp_checksum = checksum; 916168404Spjd 917219089Spjd if (zio_checksum_table[checksum].ci_eck) { 918168404Spjd /* 919219089Spjd * zec checksums are necessarily destructive -- they modify 920185029Spjd * the end of the write buffer to hold the verifier/checksum. 921168404Spjd * Therefore, we must make a local copy in case the data is 922185029Spjd * being written to multiple places in parallel. 
923168404Spjd */ 924185029Spjd void *wbuf = zio_buf_alloc(size); 925168404Spjd bcopy(data, wbuf, size); 926185029Spjd zio_push_transform(zio, wbuf, size, size, NULL); 927168404Spjd } 928168404Spjd 929168404Spjd return (zio); 930168404Spjd} 931168404Spjd 932168404Spjd/* 933185029Spjd * Create a child I/O to do some work for us. 934168404Spjd */ 935168404Spjdzio_t * 936185029Spjdzio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 937260763Savg void *data, uint64_t size, int type, zio_priority_t priority, 938260763Savg enum zio_flag flags, zio_done_func_t *done, void *private) 939168404Spjd{ 940219089Spjd enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 941185029Spjd zio_t *zio; 942168404Spjd 943185029Spjd ASSERT(vd->vdev_parent == 944185029Spjd (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); 945185029Spjd 946168404Spjd if (type == ZIO_TYPE_READ && bp != NULL) { 947168404Spjd /* 948168404Spjd * If we have the bp, then the child should perform the 949168404Spjd * checksum and the parent need not. This pushes error 950168404Spjd * detection as close to the leaves as possible and 951168404Spjd * eliminates redundant checksums in the interior nodes. 952168404Spjd */ 953219089Spjd pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 954219089Spjd pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 955168404Spjd } 956168404Spjd 957270312Ssmh /* Not all IO types require vdev io done stage e.g. free */ 958270312Ssmh if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE)) 959270312Ssmh pipeline &= ~ZIO_STAGE_VDEV_IO_DONE; 960270312Ssmh 961185029Spjd if (vd->vdev_children == 0) 962185029Spjd offset += VDEV_LABEL_START_SIZE; 963185029Spjd 964219089Spjd flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; 965219089Spjd 966219089Spjd /* 967219089Spjd * If we've decided to do a repair, the write is not speculative -- 968219089Spjd * even if the original read was. 
969219089Spjd */ 970219089Spjd if (flags & ZIO_FLAG_IO_REPAIR) 971219089Spjd flags &= ~ZIO_FLAG_SPECULATIVE; 972219089Spjd 973185029Spjd zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, 974219089Spjd done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 975219089Spjd ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 976168404Spjd 977260763Savg zio->io_physdone = pio->io_physdone; 978260763Savg if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) 979260763Savg zio->io_logical->io_phys_children++; 980260763Savg 981185029Spjd return (zio); 982168404Spjd} 983168404Spjd 984185029Spjdzio_t * 985185029Spjdzio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, 986260763Savg int type, zio_priority_t priority, enum zio_flag flags, 987219089Spjd zio_done_func_t *done, void *private) 988168404Spjd{ 989185029Spjd zio_t *zio; 990168404Spjd 991185029Spjd ASSERT(vd->vdev_ops->vdev_op_leaf); 992168404Spjd 993185029Spjd zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 994185029Spjd data, size, done, private, type, priority, 995260763Savg flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, 996185029Spjd vd, offset, NULL, 997219089Spjd ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 998168404Spjd 999185029Spjd return (zio); 1000168404Spjd} 1001168404Spjd 1002168404Spjdvoid 1003185029Spjdzio_flush(zio_t *zio, vdev_t *vd) 1004168404Spjd{ 1005240868Spjd zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, 1006270312Ssmh NULL, NULL, ZIO_PRIORITY_NOW, 1007185029Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 1008168404Spjd} 1009168404Spjd 1010240868Spjdzio_t * 1011240868Spjdzio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) 1012240868Spjd{ 1013240868Spjd 1014240868Spjd ASSERT(vd->vdev_ops->vdev_op_leaf); 1015240868Spjd 1016270312Ssmh return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL, 1017270312Ssmh ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, 
ZIO_FLAG_DONT_AGGREGATE | 1018270312Ssmh ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, 1019270312Ssmh vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE)); 1020240868Spjd} 1021240868Spjd 1022219089Spjdvoid 1023219089Spjdzio_shrink(zio_t *zio, uint64_t size) 1024219089Spjd{ 1025219089Spjd ASSERT(zio->io_executor == NULL); 1026219089Spjd ASSERT(zio->io_orig_size == zio->io_size); 1027219089Spjd ASSERT(size <= zio->io_size); 1028219089Spjd 1029219089Spjd /* 1030219089Spjd * We don't shrink for raidz because of problems with the 1031219089Spjd * reconstruction when reading back less than the block size. 1032219089Spjd * Note, BP_IS_RAIDZ() assumes no compression. 1033219089Spjd */ 1034219089Spjd ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 1035219089Spjd if (!BP_IS_RAIDZ(zio->io_bp)) 1036219089Spjd zio->io_orig_size = zio->io_size = size; 1037219089Spjd} 1038219089Spjd 1039168404Spjd/* 1040168404Spjd * ========================================================================== 1041185029Spjd * Prepare to read and write logical blocks 1042168404Spjd * ========================================================================== 1043168404Spjd */ 1044185029Spjd 1045185029Spjdstatic int 1046270312Ssmhzio_read_bp_init(zio_t *zio) 1047168404Spjd{ 1048185029Spjd blkptr_t *bp = zio->io_bp; 1049185029Spjd 1050209962Smm if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 1051209962Smm zio->io_child_type == ZIO_CHILD_LOGICAL && 1052209962Smm !(zio->io_flags & ZIO_FLAG_RAW)) { 1053268649Sdelphij uint64_t psize = 1054268649Sdelphij BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); 1055219089Spjd void *cbuf = zio_buf_alloc(psize); 1056185029Spjd 1057219089Spjd zio_push_transform(zio, cbuf, psize, psize, zio_decompress); 1058168404Spjd } 1059185029Spjd 1060268649Sdelphij if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { 1061268649Sdelphij zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1062268649Sdelphij decode_embedded_bp_compressed(bp, zio->io_data); 1063268649Sdelphij } else { 1064268649Sdelphij ASSERT(!BP_IS_EMBEDDED(bp)); 1065268649Sdelphij } 1066268649Sdelphij 1067236884Smm if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) 1068185029Spjd zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1069185029Spjd 1070219089Spjd if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 1071219089Spjd zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1072219089Spjd 1073219089Spjd if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 1074219089Spjd zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 1075219089Spjd 1076185029Spjd return (ZIO_PIPELINE_CONTINUE); 1077168404Spjd} 1078168404Spjd 1079185029Spjdstatic int 1080270312Ssmhzio_write_bp_init(zio_t *zio) 1081168404Spjd{ 1082219089Spjd spa_t *spa = zio->io_spa; 1083185029Spjd zio_prop_t *zp = &zio->io_prop; 1084219089Spjd enum zio_compress compress = zp->zp_compress; 1085185029Spjd blkptr_t *bp = zio->io_bp; 1086185029Spjd uint64_t lsize = zio->io_size; 1087219089Spjd uint64_t psize = lsize; 1088185029Spjd int pass = 1; 1089168404Spjd 1090185029Spjd /* 1091185029Spjd * If our children haven't all reached the ready stage, 1092185029Spjd * wait for them and then repeat this pipeline stage. 
1093185029Spjd */ 1094185029Spjd if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 1095185029Spjd zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) 1096185029Spjd return (ZIO_PIPELINE_STOP); 1097185029Spjd 1098185029Spjd if (!IO_IS_ALLOCATING(zio)) 1099185029Spjd return (ZIO_PIPELINE_CONTINUE); 1100185029Spjd 1101219089Spjd ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 1102185029Spjd 1103219089Spjd if (zio->io_bp_override) { 1104219089Spjd ASSERT(bp->blk_birth != zio->io_txg); 1105219089Spjd ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 1106219089Spjd 1107219089Spjd *bp = *zio->io_bp_override; 1108219089Spjd zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1109219089Spjd 1110268649Sdelphij if (BP_IS_EMBEDDED(bp)) 1111268649Sdelphij return (ZIO_PIPELINE_CONTINUE); 1112268649Sdelphij 1113243524Smm /* 1114243524Smm * If we've been overridden and nopwrite is set then 1115243524Smm * set the flag accordingly to indicate that a nopwrite 1116243524Smm * has already occurred. 
1117243524Smm */ 1118243524Smm if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { 1119243524Smm ASSERT(!zp->zp_dedup); 1120243524Smm zio->io_flags |= ZIO_FLAG_NOPWRITE; 1121243524Smm return (ZIO_PIPELINE_CONTINUE); 1122243524Smm } 1123243524Smm 1124243524Smm ASSERT(!zp->zp_nopwrite); 1125243524Smm 1126219089Spjd if (BP_IS_HOLE(bp) || !zp->zp_dedup) 1127219089Spjd return (ZIO_PIPELINE_CONTINUE); 1128219089Spjd 1129219089Spjd ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || 1130219089Spjd zp->zp_dedup_verify); 1131219089Spjd 1132219089Spjd if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 1133219089Spjd BP_SET_DEDUP(bp, 1); 1134219089Spjd zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; 1135219089Spjd return (ZIO_PIPELINE_CONTINUE); 1136219089Spjd } 1137219089Spjd zio->io_bp_override = NULL; 1138219089Spjd BP_ZERO(bp); 1139219089Spjd } 1140219089Spjd 1141263397Sdelphij if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { 1142185029Spjd /* 1143185029Spjd * We're rewriting an existing block, which means we're 1144185029Spjd * working on behalf of spa_sync(). For spa_sync() to 1145185029Spjd * converge, it must eventually be the case that we don't 1146185029Spjd * have to allocate new blocks. But compression changes 1147185029Spjd * the blocksize, which forces a reallocate, and makes 1148185029Spjd * convergence take longer. Therefore, after the first 1149185029Spjd * few passes, stop compressing to ensure convergence. 
1150185029Spjd */ 1151219089Spjd pass = spa_sync_pass(spa); 1152185029Spjd 1153219089Spjd ASSERT(zio->io_txg == spa_syncing_txg(spa)); 1154219089Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1155219089Spjd ASSERT(!BP_GET_DEDUP(bp)); 1156219089Spjd 1157243503Smm if (pass >= zfs_sync_pass_dont_compress) 1158185029Spjd compress = ZIO_COMPRESS_OFF; 1159185029Spjd 1160185029Spjd /* Make sure someone doesn't change their mind on overwrites */ 1161268649Sdelphij ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), 1162219089Spjd spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 1163185029Spjd } 1164185029Spjd 1165185029Spjd if (compress != ZIO_COMPRESS_OFF) { 1166219089Spjd void *cbuf = zio_buf_alloc(lsize); 1167269732Sdelphij psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); 1168219089Spjd if (psize == 0 || psize == lsize) { 1169185029Spjd compress = ZIO_COMPRESS_OFF; 1170219089Spjd zio_buf_free(cbuf, lsize); 1171268649Sdelphij } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && 1172268649Sdelphij zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && 1173268649Sdelphij spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { 1174268649Sdelphij encode_embedded_bp_compressed(bp, 1175268649Sdelphij cbuf, compress, lsize, psize); 1176268649Sdelphij BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); 1177268649Sdelphij BP_SET_TYPE(bp, zio->io_prop.zp_type); 1178268649Sdelphij BP_SET_LEVEL(bp, zio->io_prop.zp_level); 1179268649Sdelphij zio_buf_free(cbuf, lsize); 1180268649Sdelphij bp->blk_birth = zio->io_txg; 1181268649Sdelphij zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1182268649Sdelphij ASSERT(spa_feature_is_active(spa, 1183268649Sdelphij SPA_FEATURE_EMBEDDED_DATA)); 1184268649Sdelphij return (ZIO_PIPELINE_CONTINUE); 1185219089Spjd } else { 1186268649Sdelphij /* 1187268649Sdelphij * Round up compressed size to MINBLOCKSIZE and 1188268649Sdelphij * zero the tail. 
1189268649Sdelphij */ 1190268649Sdelphij size_t rounded = 1191268649Sdelphij P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE); 1192268649Sdelphij if (rounded > psize) { 1193268649Sdelphij bzero((char *)cbuf + psize, rounded - psize); 1194268649Sdelphij psize = rounded; 1195268649Sdelphij } 1196268649Sdelphij if (psize == lsize) { 1197268649Sdelphij compress = ZIO_COMPRESS_OFF; 1198268649Sdelphij zio_buf_free(cbuf, lsize); 1199268649Sdelphij } else { 1200268649Sdelphij zio_push_transform(zio, cbuf, 1201268649Sdelphij psize, lsize, NULL); 1202268649Sdelphij } 1203185029Spjd } 1204185029Spjd } 1205185029Spjd 1206185029Spjd /* 1207185029Spjd * The final pass of spa_sync() must be all rewrites, but the first 1208185029Spjd * few passes offer a trade-off: allocating blocks defers convergence, 1209185029Spjd * but newly allocated blocks are sequential, so they can be written 1210185029Spjd * to disk faster. Therefore, we allow the first few passes of 1211185029Spjd * spa_sync() to allocate new blocks, but force rewrites after that. 1212185029Spjd * There should only be a handful of blocks after pass 1 in any case. 
1213185029Spjd */ 1214263397Sdelphij if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && 1215263397Sdelphij BP_GET_PSIZE(bp) == psize && 1216243503Smm pass >= zfs_sync_pass_rewrite) { 1217219089Spjd ASSERT(psize != 0); 1218219089Spjd enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 1219185029Spjd zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; 1220185029Spjd zio->io_flags |= ZIO_FLAG_IO_REWRITE; 1221168404Spjd } else { 1222185029Spjd BP_ZERO(bp); 1223185029Spjd zio->io_pipeline = ZIO_WRITE_PIPELINE; 1224168404Spjd } 1225185029Spjd 1226219089Spjd if (psize == 0) { 1227263397Sdelphij if (zio->io_bp_orig.blk_birth != 0 && 1228263397Sdelphij spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { 1229263397Sdelphij BP_SET_LSIZE(bp, lsize); 1230263397Sdelphij BP_SET_TYPE(bp, zp->zp_type); 1231263397Sdelphij BP_SET_LEVEL(bp, zp->zp_level); 1232263397Sdelphij BP_SET_BIRTH(bp, zio->io_txg, 0); 1233263397Sdelphij } 1234185029Spjd zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1235185029Spjd } else { 1236185029Spjd ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 1237185029Spjd BP_SET_LSIZE(bp, lsize); 1238263397Sdelphij BP_SET_TYPE(bp, zp->zp_type); 1239263397Sdelphij BP_SET_LEVEL(bp, zp->zp_level); 1240219089Spjd BP_SET_PSIZE(bp, psize); 1241185029Spjd BP_SET_COMPRESS(bp, compress); 1242185029Spjd BP_SET_CHECKSUM(bp, zp->zp_checksum); 1243219089Spjd BP_SET_DEDUP(bp, zp->zp_dedup); 1244185029Spjd BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1245219089Spjd if (zp->zp_dedup) { 1246219089Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1247219089Spjd ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1248219089Spjd zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 1249219089Spjd } 1250243524Smm if (zp->zp_nopwrite) { 1251243524Smm ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1252243524Smm ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1253243524Smm zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; 1254243524Smm } 1255185029Spjd } 1256185029Spjd 1257185029Spjd 
return (ZIO_PIPELINE_CONTINUE); 1258168404Spjd} 1259168404Spjd 1260219089Spjdstatic int 1261270312Ssmhzio_free_bp_init(zio_t *zio) 1262219089Spjd{ 1263219089Spjd blkptr_t *bp = zio->io_bp; 1264219089Spjd 1265219089Spjd if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 1266219089Spjd if (BP_GET_DEDUP(bp)) 1267219089Spjd zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 1268219089Spjd } 1269219089Spjd 1270219089Spjd return (ZIO_PIPELINE_CONTINUE); 1271219089Spjd} 1272219089Spjd 1273185029Spjd/* 1274185029Spjd * ========================================================================== 1275185029Spjd * Execute the I/O pipeline 1276185029Spjd * ========================================================================== 1277185029Spjd */ 1278185029Spjd 1279168404Spjdstatic void 1280260750Savgzio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) 1281168404Spjd{ 1282211931Smm spa_t *spa = zio->io_spa; 1283185029Spjd zio_type_t t = zio->io_type; 1284260742Savg int flags = (cutinline ? TQ_FRONT : 0); 1285168404Spjd 1286216919Smm ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT); 1287216919Smm 1288185029Spjd /* 1289209096Smm * If we're a config writer or a probe, the normal issue and 1290209096Smm * interrupt threads may all be blocked waiting for the config lock. 1291209096Smm * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 1292185029Spjd */ 1293209096Smm if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 1294185029Spjd t = ZIO_TYPE_NULL; 1295185029Spjd 1296185029Spjd /* 1297185029Spjd * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 1298185029Spjd */ 1299185029Spjd if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 1300185029Spjd t = ZIO_TYPE_NULL; 1301185029Spjd 1302211931Smm /* 1303260750Savg * If this is a high priority I/O, then use the high priority taskq if 1304260750Savg * available. 
1305211931Smm */ 1306211931Smm if (zio->io_priority == ZIO_PRIORITY_NOW && 1307260750Savg spa->spa_zio_taskq[t][q + 1].stqs_count != 0) 1308211931Smm q++; 1309211931Smm 1310211931Smm ASSERT3U(q, <, ZIO_TASKQ_TYPES); 1311260742Savg 1312260742Savg /* 1313260742Savg * NB: We are assuming that the zio can only be dispatched 1314260742Savg * to a single taskq at a time. It would be a grievous error 1315260742Savg * to dispatch the zio to another taskq at the same time. 1316260742Savg */ 1317260742Savg#if defined(illumos) || !defined(_KERNEL) 1318260742Savg ASSERT(zio->io_tqent.tqent_next == NULL); 1319216919Smm#else 1320260742Savg ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 1321216919Smm#endif 1322260750Savg spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, 1323260750Savg flags, &zio->io_tqent); 1324168404Spjd} 1325168404Spjd 1326185029Spjdstatic boolean_t 1327260750Savgzio_taskq_member(zio_t *zio, zio_taskq_type_t q) 1328168404Spjd{ 1329185029Spjd kthread_t *executor = zio->io_executor; 1330185029Spjd spa_t *spa = zio->io_spa; 1331168404Spjd 1332260750Savg for (zio_type_t t = 0; t < ZIO_TYPES; t++) { 1333260750Savg spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1334260750Savg uint_t i; 1335260750Savg for (i = 0; i < tqs->stqs_count; i++) { 1336260750Savg if (taskq_member(tqs->stqs_taskq[i], executor)) 1337260750Savg return (B_TRUE); 1338260750Savg } 1339260750Savg } 1340168404Spjd 1341185029Spjd return (B_FALSE); 1342185029Spjd} 1343168404Spjd 1344185029Spjdstatic int 1345270312Ssmhzio_issue_async(zio_t *zio) 1346185029Spjd{ 1347219089Spjd zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1348168404Spjd 1349185029Spjd return (ZIO_PIPELINE_STOP); 1350168404Spjd} 1351168404Spjd 1352185029Spjdvoid 1353185029Spjdzio_interrupt(zio_t *zio) 1354168404Spjd{ 1355219089Spjd zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); 1356185029Spjd} 1357168404Spjd 1358185029Spjd/* 1359185029Spjd * Execute the I/O pipeline until one of the following occurs: 
1360185029Spjd * 1361251631Sdelphij * (1) the I/O completes 1362251631Sdelphij * (2) the pipeline stalls waiting for dependent child I/Os 1363251631Sdelphij * (3) the I/O issues, so we're waiting for an I/O completion interrupt 1364251631Sdelphij * (4) the I/O is delegated by vdev-level caching or aggregation 1365251631Sdelphij * (5) the I/O is deferred due to vdev-level queueing 1366251631Sdelphij * (6) the I/O is handed off to another thread. 1367251631Sdelphij * 1368251631Sdelphij * In all cases, the pipeline stops whenever there's no CPU work; it never 1369251631Sdelphij * burns a thread in cv_wait(). 1370251631Sdelphij * 1371185029Spjd * There's no locking on io_stage because there's no legitimate way 1372185029Spjd * for multiple threads to be attempting to process the same I/O. 1373185029Spjd */ 1374219089Spjdstatic zio_pipe_stage_t *zio_pipeline[]; 1375168404Spjd 1376185029Spjdvoid 1377185029Spjdzio_execute(zio_t *zio) 1378185029Spjd{ 1379185029Spjd zio->io_executor = curthread; 1380168404Spjd 1381185029Spjd while (zio->io_stage < ZIO_STAGE_DONE) { 1382219089Spjd enum zio_stage pipeline = zio->io_pipeline; 1383219089Spjd enum zio_stage stage = zio->io_stage; 1384185029Spjd int rv; 1385168404Spjd 1386185029Spjd ASSERT(!MUTEX_HELD(&zio->io_lock)); 1387219089Spjd ASSERT(ISP2(stage)); 1388219089Spjd ASSERT(zio->io_stall == NULL); 1389168404Spjd 1390219089Spjd do { 1391219089Spjd stage <<= 1; 1392219089Spjd } while ((stage & pipeline) == 0); 1393168404Spjd 1394185029Spjd ASSERT(stage <= ZIO_STAGE_DONE); 1395168404Spjd 1396168404Spjd /* 1397185029Spjd * If we are in interrupt context and this pipeline stage 1398185029Spjd * will grab a config lock that is held across I/O, 1399219089Spjd * or may wait for an I/O that needs an interrupt thread 1400219089Spjd * to complete, issue async to avoid deadlock. 1401219089Spjd * 1402219089Spjd * For VDEV_IO_START, we cut in line so that the io will 1403219089Spjd * be sent to disk promptly. 
1404168404Spjd */ 1405219089Spjd if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 1406185029Spjd zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 1407219089Spjd boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 1408219089Spjd zio_requeue_io_start_cut_in_line : B_FALSE; 1409219089Spjd zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 1410185029Spjd return; 1411185029Spjd } 1412168404Spjd 1413185029Spjd zio->io_stage = stage; 1414270312Ssmh rv = zio_pipeline[highbit64(stage) - 1](zio); 1415185029Spjd 1416185029Spjd if (rv == ZIO_PIPELINE_STOP) 1417185029Spjd return; 1418185029Spjd 1419185029Spjd ASSERT(rv == ZIO_PIPELINE_CONTINUE); 1420168404Spjd } 1421185029Spjd} 1422168404Spjd 1423185029Spjd/* 1424185029Spjd * ========================================================================== 1425185029Spjd * Initiate I/O, either sync or async 1426185029Spjd * ========================================================================== 1427185029Spjd */ 1428185029Spjdint 1429185029Spjdzio_wait(zio_t *zio) 1430185029Spjd{ 1431185029Spjd int error; 1432168404Spjd 1433185029Spjd ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 1434185029Spjd ASSERT(zio->io_executor == NULL); 1435168404Spjd 1436185029Spjd zio->io_waiter = curthread; 1437168404Spjd 1438185029Spjd zio_execute(zio); 1439168404Spjd 1440185029Spjd mutex_enter(&zio->io_lock); 1441185029Spjd while (zio->io_executor != NULL) 1442185029Spjd cv_wait(&zio->io_cv, &zio->io_lock); 1443185029Spjd mutex_exit(&zio->io_lock); 1444168404Spjd 1445185029Spjd error = zio->io_error; 1446185029Spjd zio_destroy(zio); 1447168404Spjd 1448185029Spjd return (error); 1449185029Spjd} 1450185029Spjd 1451185029Spjdvoid 1452185029Spjdzio_nowait(zio_t *zio) 1453185029Spjd{ 1454185029Spjd ASSERT(zio->io_executor == NULL); 1455185029Spjd 1456209962Smm if (zio->io_child_type == ZIO_CHILD_LOGICAL && 1457209962Smm zio_unique_parent(zio) == NULL) { 1458185029Spjd /* 1459185029Spjd * This is a logical async I/O with no parent to wait for it. 
1460209962Smm * We add it to the spa_async_root_zio "Godfather" I/O which 1461209962Smm * will ensure they complete prior to unloading the pool. 1462185029Spjd */ 1463185029Spjd spa_t *spa = zio->io_spa; 1464209962Smm 1465209962Smm zio_add_child(spa->spa_async_zio_root, zio); 1466168404Spjd } 1467185029Spjd 1468185029Spjd zio_execute(zio); 1469168404Spjd} 1470168404Spjd 1471168404Spjd/* 1472168404Spjd * ========================================================================== 1473185029Spjd * Reexecute or suspend/resume failed I/O 1474168404Spjd * ========================================================================== 1475168404Spjd */ 1476185029Spjd 1477168404Spjdstatic void 1478185029Spjdzio_reexecute(zio_t *pio) 1479168404Spjd{ 1480209962Smm zio_t *cio, *cio_next; 1481168404Spjd 1482209962Smm ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 1483209962Smm ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 1484209962Smm ASSERT(pio->io_gang_leader == NULL); 1485209962Smm ASSERT(pio->io_gang_tree == NULL); 1486209962Smm 1487185029Spjd pio->io_flags = pio->io_orig_flags; 1488185029Spjd pio->io_stage = pio->io_orig_stage; 1489185029Spjd pio->io_pipeline = pio->io_orig_pipeline; 1490185029Spjd pio->io_reexecute = 0; 1491243524Smm pio->io_flags |= ZIO_FLAG_REEXECUTED; 1492185029Spjd pio->io_error = 0; 1493209962Smm for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1494209962Smm pio->io_state[w] = 0; 1495185029Spjd for (int c = 0; c < ZIO_CHILD_TYPES; c++) 1496185029Spjd pio->io_child_error[c] = 0; 1497185029Spjd 1498219089Spjd if (IO_IS_ALLOCATING(pio)) 1499219089Spjd BP_ZERO(pio->io_bp); 1500168404Spjd 1501185029Spjd /* 1502185029Spjd * As we reexecute pio's children, new children could be created. 1503209962Smm * New children go to the head of pio's io_child_list, however, 1504185029Spjd * so we will (correctly) not reexecute them. 
The key is that
 * the remainder of pio's io_child_list, from 'cio_next' onward,
 * cannot be affected by any side effects of reexecuting 'cio'.
 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

/*
 * Suspend pool I/O after an uncorrectable failure.  If the pool's failmode
 * property is "panic", panic immediately; otherwise post an FMA ereport,
 * mark the pool suspended, and (if 'zio' is non-NULL) park the failed
 * logical I/O under the "godfather" suspend root so that zio_resume() can
 * reexecute it later.  Called with 'zio' either NULL or a completed
 * (ZIO_STAGE_DONE), parentless logical I/O.
 */
void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	/*
	 * Lazily create the suspend root.  It is a "godfather" I/O: it
	 * collects suspended children but is never reexecuted itself.
	 */
	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * Resume a suspended pool: clear the suspended flag, wake any waiters,
 * detach the suspend root, and reexecute all I/O that was parked under it.
 * Returns the result of waiting for that reexecuted I/O (0 if nothing
 * was suspended).
 */
int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

/*
 * Block the caller until the pool is no longer suspended.
 */
void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */

/*
 * Gang-issue callback for reads.  Gang headers (gn != NULL) are already
 * in io_gang_tree, so no I/O is needed; data leaves are read into 'data'.
 */
static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	if (gn != NULL)
		return (pio);

	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
	    &pio->io_bookmark));
}

/*
 * Gang-issue callback for rewrites.  Gang headers are rewritten from
 * gn->gn_gbh; data leaves are rewritten from 'data'.
 */
zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here.  The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
		 * (Presently, nothing actually uses interior data checksums;
		 * this is just good hygiene.)
		 */
		if (gn != pio->io_gang_leader->io_gang_tree) {
			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
			    data, BP_GET_PSIZE(bp));
		}
		/*
		 * If we are here to damage data for testing purposes,
		 * leave the GBH alone so that we can detect the damage.
		 */
		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
	} else {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
	}

	return (zio);
}

/*
 * Gang-issue callback for frees: a trivial wrapper around zio_free_sync().
 * Gang headers are always SPA_GANGBLOCKSIZE; leaves use their psize.
 */
/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
	    ZIO_GANG_CHILD_FLAGS(pio)));
}

/*
 * Gang-issue callback for claims: a trivial wrapper around zio_claim().
 */
/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

/*
 * Per-zio-type gang issue callbacks, indexed by io_type.  NULL entries
 * (null and ioctl zios) never operate on gang blocks.
 */
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

/*
 * Allocate one in-core gang tree node (with a buffer for its gang header)
 * and link it in at *gnpp, which must currently be NULL.
 */
static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

/*
 * Free one gang tree node (its children must already be freed) and
 * NULL out the caller's pointer to it.
 */
static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

/*
 * Recursively free an entire gang tree rooted at *gnpp.
 */
static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

/*
 * Kick off an async read of the gang header at 'bp' into a freshly
 * allocated tree node at *gnpp; zio_gang_tree_assemble_done() recurses
 * into any gang children it discovers.
 */
static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}

/*
 * Done callback for a gang-header read issued by zio_gang_tree_assemble():
 * byteswap if needed, sanity-check the header, and recursively assemble
 * any nested gang blocks it points at.  On read error we simply return;
 * zio_gang_issue() will notice io_child_error and free the partial tree.
 */
static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}

/*
 * Walk the assembled gang tree depth-first, invoking the per-type issue
 * callback on every bp.  'data' advances through the leader's buffer by
 * each leaf's psize, so on return from the root it must equal
 * io_data + io_size (asserted below).
 */
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	if (gn == gio->io_gang_tree && gio->io_data != NULL)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}

/*
 * Pipeline stage: start assembling the in-core gang tree for this zio.
 * The zio becomes its own gang leader.
 */
static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage: once all gang-header reads have completed, either walk
 * the tree issuing the real work (on success) or free the partial tree
 * (on assembly error).  Either way the remaining pipeline collapses to
 * the interlock stages.
 */
static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Ready callback for each gang member write: fold the member's allocated
 * DVA asizes into the parent gang header's DVAs so the header's bp
 * reflects the total allocated size of the whole gang block.
 */
static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	zio_t *gio = zio->io_gang_leader;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}

/*
 * Split pio's write into a gang block: allocate a one-sector gang header
 * (with one extra copy for safety, capped at the pool's max replication),
 * then carve the payload into up to SPA_GBH_NBLKPTRS children.
 */
static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;
	uint64_t lsize;
	int copies = gio->io_prop.zp_copies;
	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int error;

	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
	if (error) {
		/* No room even for a one-sector header: fail the write. */
		pio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (pio == gio) {
		/* Top-level gang block: hang the tree off the leader. */
		gnpp = &gio->io_gang_tree;
	} else {
		/* Nested gang block: our node slot was passed via io_private. */
		gnpp = pio->io_private;
		ASSERT(pio->io_ready == zio_write_gang_member_ready);
	}

	gn = zio_gang_node_alloc(gnpp);
	gbh = gn->gn_gbh;
	bzero(gbh, SPA_GANGBLOCKSIZE);

	/*
	 * Create the gang header.
	 */
	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

	/*
	 * Create and nowait the gang children.  Each child gets roughly
	 * 1/(remaining slots) of the remaining payload, rounded up to
	 * SPA_MINBLOCKSIZE, so the loop always terminates within
	 * SPA_GBH_NBLKPTRS iterations.
	 */
	for (int g = 0; resid != 0; resid -= lsize, g++) {
		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
		    SPA_MINBLOCKSIZE);
		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

		zp.zp_checksum = gio->io_prop.zp_checksum;
		zp.zp_compress = ZIO_COMPRESS_OFF;
		zp.zp_type = DMU_OT_NONE;
		zp.zp_level = 0;
		zp.zp_copies = gio->io_prop.zp_copies;
		zp.zp_dedup = B_FALSE;
		zp.zp_dedup_verify = B_FALSE;
		zp.zp_nopwrite = B_FALSE;

		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
		    zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
		    &pio->io_bookmark));
	}

	/*
	 * Set pio's pipeline to just wait for zio to finish.
	 */
	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	zio_nowait(zio);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * The zio_nop_write stage in the pipeline determines if allocating
 * a new bp is necessary.  By leveraging a cryptographically secure checksum,
 * such as SHA256, we can compare the checksums of the new data and the old
 * to determine if allocating a new block is required.  The nopwrite
 * feature can handle writes in either syncing or open context (i.e. zil
 * writes) and as a result is mutually exclusive with dedup.
 */
static int
zio_nop_write(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	zio_prop_t *zp = &zio->io_prop;

	ASSERT(BP_GET_LEVEL(bp) == 0);
	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
	ASSERT(zp->zp_nopwrite);
	ASSERT(!zp->zp_dedup);
	ASSERT(zio->io_bp_override == NULL);
	ASSERT(IO_IS_ALLOCATING(zio));

	/*
	 * Check to see if the original bp and the new bp have matching
	 * characteristics (i.e. same checksum, compression algorithms, etc).
	 * If they don't then just continue with the pipeline which will
	 * allocate a new bp.
	 */
	if (BP_IS_HOLE(bp_orig) ||
	    !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
		return (ZIO_PIPELINE_CONTINUE);

	/*
	 * If the checksums match then reset the pipeline so that we
	 * avoid allocating a new bp and issuing any I/O.
	 */
	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
		ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
		    sizeof (uint64_t)) == 0);

		*bp = *bp_orig;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
		zio->io_flags |= ZIO_FLAG_NOPWRITE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Dedup
 * ==========================================================================
 */

/*
 * Done callback for each alternate-copy read issued by zio_ddt_read_start()
 * during DDT self-healing.  The first successfully read copy becomes the
 * repair data; extra buffers are freed.  Parent's io_lock serializes
 * concurrent children.
 */
static void
zio_ddt_child_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp;
	zio_t *pio = zio_unique_parent(zio);

	mutex_enter(&pio->io_lock);
	ddp = ddt_phys_select(dde, bp);
	if (zio->io_error == 0)
		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
		dde->dde_repair_data = zio->io_data;
	else
		zio_buf_free(zio->io_data, zio->io_size);
	mutex_exit(&pio->io_lock);
}

/*
 * Pipeline stage: issue the read of a dedup'd block.  On the first pass
 * this is a plain child read of 'bp'.  If a previous attempt failed
 * (io_child_error[ZIO_CHILD_DDT] set), instead read every other physical
 * copy recorded in the DDT so zio_ddt_read_done() can repair from one.
 */
static int
zio_ddt_read_start(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
		ddt_phys_t *ddp = dde->dde_phys;
		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
		blkptr_t blk;

		ASSERT(zio->io_vsd == NULL);
		zio->io_vsd = dde;

		if (ddp_self == NULL)
			return (ZIO_PIPELINE_CONTINUE);

		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			/* Skip unallocated slots and the copy that failed. */
			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
				continue;
			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
			    &blk);
			zio_nowait(zio_read(zio, zio->io_spa, &blk,
			    zio_buf_alloc(zio->io_size), zio->io_size,
			    zio_ddt_child_read_done, dde, zio->io_priority,
			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
			    &zio->io_bookmark));
		}
		return (ZIO_PIPELINE_CONTINUE);
	}

	zio_nowait(zio_read(zio, zio->io_spa, bp,
	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage: complete a dedup read.  On child error, either copy in
 * the repair data gathered by zio_ddt_child_read_done(), or -- if the
 * repair pass hasn't run yet (io_vsd == NULL) -- rewind to the
 * DDT_READ_START stage and redispatch to try the alternate copies.
 */
static int
zio_ddt_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = zio->io_vsd;
		if (ddt == NULL) {
			/* DDT not loaded yet; only possible during pool load. */
			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
			return (ZIO_PIPELINE_CONTINUE);
		}
		if (dde == NULL) {
			/* Rewind to just before DDT_READ_START and retry. */
			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
			return (ZIO_PIPELINE_STOP);
		}
		if (dde->dde_repair_data != NULL) {
			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
			zio->io_child_error[ZIO_CHILD_DDT] = 0;
		}
		ddt_repair_done(ddt, dde);
		zio->io_vsd = NULL;
	}

	ASSERT(zio->io_vsd == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Return B_TRUE if this write's data does NOT match the data already
 * stored under the same checksum in 'dde' -- i.e. a checksum collision
 * that dedup must not merge.  Called with the ddt lock held; may drop
 * and reacquire it around the ARC read.
 */
static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
	spa_t *spa = zio->io_spa;

	/*
	 * Note: we compare the original data, not the transformed data,
	 * because when zio->io_bp is an override bp, we will not have
	 * pushed the I/O transforms.  That's an important optimization
	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
2159219089Spjd */ 2160219089Spjd for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2161219089Spjd zio_t *lio = dde->dde_lead_zio[p]; 2162219089Spjd 2163219089Spjd if (lio != NULL) { 2164219089Spjd return (lio->io_orig_size != zio->io_orig_size || 2165219089Spjd bcmp(zio->io_orig_data, lio->io_orig_data, 2166219089Spjd zio->io_orig_size) != 0); 2167219089Spjd } 2168219089Spjd } 2169219089Spjd 2170219089Spjd for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2171219089Spjd ddt_phys_t *ddp = &dde->dde_phys[p]; 2172219089Spjd 2173219089Spjd if (ddp->ddp_phys_birth != 0) { 2174219089Spjd arc_buf_t *abuf = NULL; 2175219089Spjd uint32_t aflags = ARC_WAIT; 2176219089Spjd blkptr_t blk = *zio->io_bp; 2177219089Spjd int error; 2178219089Spjd 2179219089Spjd ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2180219089Spjd 2181219089Spjd ddt_exit(ddt); 2182219089Spjd 2183246666Smm error = arc_read(NULL, spa, &blk, 2184219089Spjd arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2185219089Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2186219089Spjd &aflags, &zio->io_bookmark); 2187219089Spjd 2188219089Spjd if (error == 0) { 2189219089Spjd if (arc_buf_size(abuf) != zio->io_orig_size || 2190219089Spjd bcmp(abuf->b_data, zio->io_orig_data, 2191219089Spjd zio->io_orig_size) != 0) 2192249195Smm error = SET_ERROR(EEXIST); 2193248571Smm VERIFY(arc_buf_remove_ref(abuf, &abuf)); 2194219089Spjd } 2195219089Spjd 2196219089Spjd ddt_enter(ddt); 2197219089Spjd return (error != 0); 2198219089Spjd } 2199219089Spjd } 2200219089Spjd 2201219089Spjd return (B_FALSE); 2202219089Spjd} 2203219089Spjd 2204219089Spjdstatic void 2205219089Spjdzio_ddt_child_write_ready(zio_t *zio) 2206219089Spjd{ 2207219089Spjd int p = zio->io_prop.zp_copies; 2208219089Spjd ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2209219089Spjd ddt_entry_t *dde = zio->io_private; 2210219089Spjd ddt_phys_t *ddp = &dde->dde_phys[p]; 2211219089Spjd zio_t *pio; 2212219089Spjd 2213219089Spjd if (zio->io_error) 2214219089Spjd 
return; 2215219089Spjd 2216219089Spjd ddt_enter(ddt); 2217219089Spjd 2218219089Spjd ASSERT(dde->dde_lead_zio[p] == zio); 2219219089Spjd 2220219089Spjd ddt_phys_fill(ddp, zio->io_bp); 2221219089Spjd 2222219089Spjd while ((pio = zio_walk_parents(zio)) != NULL) 2223219089Spjd ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2224219089Spjd 2225219089Spjd ddt_exit(ddt); 2226219089Spjd} 2227219089Spjd 2228219089Spjdstatic void 2229219089Spjdzio_ddt_child_write_done(zio_t *zio) 2230219089Spjd{ 2231219089Spjd int p = zio->io_prop.zp_copies; 2232219089Spjd ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2233219089Spjd ddt_entry_t *dde = zio->io_private; 2234219089Spjd ddt_phys_t *ddp = &dde->dde_phys[p]; 2235219089Spjd 2236219089Spjd ddt_enter(ddt); 2237219089Spjd 2238219089Spjd ASSERT(ddp->ddp_refcnt == 0); 2239219089Spjd ASSERT(dde->dde_lead_zio[p] == zio); 2240219089Spjd dde->dde_lead_zio[p] = NULL; 2241219089Spjd 2242219089Spjd if (zio->io_error == 0) { 2243219089Spjd while (zio_walk_parents(zio) != NULL) 2244219089Spjd ddt_phys_addref(ddp); 2245219089Spjd } else { 2246219089Spjd ddt_phys_clear(ddp); 2247219089Spjd } 2248219089Spjd 2249219089Spjd ddt_exit(ddt); 2250219089Spjd} 2251219089Spjd 2252219089Spjdstatic void 2253219089Spjdzio_ddt_ditto_write_done(zio_t *zio) 2254219089Spjd{ 2255219089Spjd int p = DDT_PHYS_DITTO; 2256219089Spjd zio_prop_t *zp = &zio->io_prop; 2257219089Spjd blkptr_t *bp = zio->io_bp; 2258219089Spjd ddt_t *ddt = ddt_select(zio->io_spa, bp); 2259219089Spjd ddt_entry_t *dde = zio->io_private; 2260219089Spjd ddt_phys_t *ddp = &dde->dde_phys[p]; 2261219089Spjd ddt_key_t *ddk = &dde->dde_key; 2262219089Spjd 2263219089Spjd ddt_enter(ddt); 2264219089Spjd 2265219089Spjd ASSERT(ddp->ddp_refcnt == 0); 2266219089Spjd ASSERT(dde->dde_lead_zio[p] == zio); 2267219089Spjd dde->dde_lead_zio[p] = NULL; 2268219089Spjd 2269219089Spjd if (zio->io_error == 0) { 2270219089Spjd ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2271219089Spjd ASSERT(zp->zp_copies < 
SPA_DVAS_PER_BP); 2272219089Spjd ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2273219089Spjd if (ddp->ddp_phys_birth != 0) 2274219089Spjd ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2275219089Spjd ddt_phys_fill(ddp, bp); 2276219089Spjd } 2277219089Spjd 2278219089Spjd ddt_exit(ddt); 2279219089Spjd} 2280219089Spjd 2281219089Spjdstatic int 2282270312Ssmhzio_ddt_write(zio_t *zio) 2283219089Spjd{ 2284219089Spjd spa_t *spa = zio->io_spa; 2285219089Spjd blkptr_t *bp = zio->io_bp; 2286219089Spjd uint64_t txg = zio->io_txg; 2287219089Spjd zio_prop_t *zp = &zio->io_prop; 2288219089Spjd int p = zp->zp_copies; 2289219089Spjd int ditto_copies; 2290219089Spjd zio_t *cio = NULL; 2291219089Spjd zio_t *dio = NULL; 2292219089Spjd ddt_t *ddt = ddt_select(spa, bp); 2293219089Spjd ddt_entry_t *dde; 2294219089Spjd ddt_phys_t *ddp; 2295219089Spjd 2296219089Spjd ASSERT(BP_GET_DEDUP(bp)); 2297219089Spjd ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2298219089Spjd ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2299219089Spjd 2300219089Spjd ddt_enter(ddt); 2301219089Spjd dde = ddt_lookup(ddt, bp, B_TRUE); 2302219089Spjd ddp = &dde->dde_phys[p]; 2303219089Spjd 2304219089Spjd if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2305219089Spjd /* 2306219089Spjd * If we're using a weak checksum, upgrade to a strong checksum 2307219089Spjd * and try again. If we're already using a strong checksum, 2308219089Spjd * we can't resolve it, so just convert to an ordinary write. 2309219089Spjd * (And automatically e-mail a paper to Nature?) 
2310219089Spjd */ 2311219089Spjd if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 2312219089Spjd zp->zp_checksum = spa_dedup_checksum(spa); 2313219089Spjd zio_pop_transforms(zio); 2314219089Spjd zio->io_stage = ZIO_STAGE_OPEN; 2315219089Spjd BP_ZERO(bp); 2316219089Spjd } else { 2317243524Smm zp->zp_dedup = B_FALSE; 2318219089Spjd } 2319219089Spjd zio->io_pipeline = ZIO_WRITE_PIPELINE; 2320219089Spjd ddt_exit(ddt); 2321219089Spjd return (ZIO_PIPELINE_CONTINUE); 2322219089Spjd } 2323219089Spjd 2324219089Spjd ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2325219089Spjd ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2326219089Spjd 2327219089Spjd if (ditto_copies > ddt_ditto_copies_present(dde) && 2328219089Spjd dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2329219089Spjd zio_prop_t czp = *zp; 2330219089Spjd 2331219089Spjd czp.zp_copies = ditto_copies; 2332219089Spjd 2333219089Spjd /* 2334219089Spjd * If we arrived here with an override bp, we won't have run 2335219089Spjd * the transform stack, so we won't have the data we need to 2336219089Spjd * generate a child i/o. So, toss the override bp and restart. 2337219089Spjd * This is safe, because using the override bp is just an 2338219089Spjd * optimization; and it's rare, so the cost doesn't matter. 
2339219089Spjd */ 2340219089Spjd if (zio->io_bp_override) { 2341219089Spjd zio_pop_transforms(zio); 2342219089Spjd zio->io_stage = ZIO_STAGE_OPEN; 2343219089Spjd zio->io_pipeline = ZIO_WRITE_PIPELINE; 2344219089Spjd zio->io_bp_override = NULL; 2345219089Spjd BP_ZERO(bp); 2346219089Spjd ddt_exit(ddt); 2347219089Spjd return (ZIO_PIPELINE_CONTINUE); 2348219089Spjd } 2349219089Spjd 2350219089Spjd dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2351260763Savg zio->io_orig_size, &czp, NULL, NULL, 2352219089Spjd zio_ddt_ditto_write_done, dde, zio->io_priority, 2353219089Spjd ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2354219089Spjd 2355219089Spjd zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2356219089Spjd dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2357219089Spjd } 2358219089Spjd 2359219089Spjd if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2360219089Spjd if (ddp->ddp_phys_birth != 0) 2361219089Spjd ddt_bp_fill(ddp, bp, txg); 2362219089Spjd if (dde->dde_lead_zio[p] != NULL) 2363219089Spjd zio_add_child(zio, dde->dde_lead_zio[p]); 2364219089Spjd else 2365219089Spjd ddt_phys_addref(ddp); 2366219089Spjd } else if (zio->io_bp_override) { 2367219089Spjd ASSERT(bp->blk_birth == txg); 2368219089Spjd ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2369219089Spjd ddt_phys_fill(ddp, bp); 2370219089Spjd ddt_phys_addref(ddp); 2371219089Spjd } else { 2372219089Spjd cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2373260763Savg zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, 2374219089Spjd zio_ddt_child_write_done, dde, zio->io_priority, 2375219089Spjd ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2376219089Spjd 2377219089Spjd zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2378219089Spjd dde->dde_lead_zio[p] = cio; 2379219089Spjd } 2380219089Spjd 2381219089Spjd ddt_exit(ddt); 2382219089Spjd 2383219089Spjd if (cio) 2384219089Spjd zio_nowait(cio); 2385219089Spjd if (dio) 2386219089Spjd zio_nowait(dio); 2387219089Spjd 
	return (ZIO_PIPELINE_CONTINUE);
}

ddt_entry_t *freedde; /* for debugging */

/*
 * Pipeline stage for freeing a dedup'd block: look up the DDT entry for
 * bp under the DDT lock and decrement the refcount on the matching
 * ddt_phys.  The entry is stashed in the global 'freedde' purely as a
 * debugging aid.  NOTE(review): the physical free itself is not issued
 * here — presumably it happens elsewhere once the refcount drains;
 * confirm against the DDT sync path.
 */
static int
zio_ddt_free(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	/* Only logical, dedup'd I/Os should reach this stage. */
	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	ddt_enter(ddt);
	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = ddt_phys_select(dde, bp);
	ddt_phys_decref(ddp);
	ddt_exit(ddt);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */

/*
 * Pipeline stage that allocates DVAs for a write from the normal metaslab
 * class.  On ENOSPC, a sufficiently large I/O falls back to a gang block
 * (zio_write_gang_block()); any other failure is recorded in io_error.
 */
static int
zio_dva_allocate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	metaslab_class_t *mc = spa_normal_class(spa);
	blkptr_t *bp = zio->io_bp;
	int error;
	int flags = 0;

	/* An allocating zio with no gang leader leads itself. */
	if (zio->io_gang_leader == NULL) {
		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
		zio->io_gang_leader = zio;
	}

	/* The bp must not have been allocated yet. */
	ASSERT(BP_IS_HOLE(bp));
	ASSERT0(BP_GET_NDVAS(bp));
	ASSERT3U(zio->io_prop.zp_copies, >, 0);
	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	/*
	 * The dump device does not support gang blocks so allocation on
	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
	 * the "fast" gang feature.
	 */
	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
	    METASLAB_GANG_CHILD : 0;
	error = metaslab_alloc(spa, mc, zio->io_size, bp,
	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);

	if (error) {
		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
		    error);
		/* Large-enough writes can be split into a gang block. */
		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
			return (zio_write_gang_block(zio));
		zio->io_error = error;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage that returns the block's space to the metaslab
 * (non-deferred free: 'now' argument is B_FALSE).
 */
static int
zio_dva_free(zio_t *zio)
{
	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage that claims an already-allocated block (e.g. during
 * log replay); any metaslab_claim() failure is recorded in io_error.
 */
static int
zio_dva_claim(zio_t *zio)
{
	int error;

	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
	if (error)
		zio->io_error = error;

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Undo an allocation.  This is used by zio_done() when an I/O fails
 * and we want to give back the block we just allocated.
 * This handles both normal blocks and gang blocks.
 */
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
	/* The block must belong to this txg (or never have been written). */
	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
	ASSERT(zio->io_bp_override == NULL);

	/* Immediate free ('now' == B_TRUE): the block was never visible. */
	if (!BP_IS_HOLE(bp))
		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);

	/* Recurse into the gang tree, if any, to free all constituents. */
	if (gn != NULL) {
		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			zio_dva_unallocate(zio, gn->gn_child[g],
			    &gn->gn_gbh->zg_blkptr[g]);
		}
	}
}

/*
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 * Tries the log (slog) class first when use_slog is set, then falls back to
 * the normal class.  On success, fills in new_bp's size/checksum/type fields.
 */
int
zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
    uint64_t size, boolean_t use_slog)
{
	int error = 1;	/* primed non-zero so the fallback path runs */

	ASSERT(txg > spa_syncing_txg(spa));

	/*
	 * ZIL blocks are always contiguous (i.e. not gang blocks) so we
	 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
	 * when allocating them.
	 */
	if (use_slog) {
		error = metaslab_alloc(spa, spa_log_class(spa), size,
		    new_bp, 1, txg, old_bp,
		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
	}

	/* Fall back to the normal class if the slog failed (or was skipped). */
	if (error) {
		error = metaslab_alloc(spa, spa_normal_class(spa), size,
		    new_bp, 1, txg, old_bp,
		    METASLAB_HINTBP_AVOID);
	}

	if (error == 0) {
		BP_SET_LSIZE(new_bp, size);
		BP_SET_PSIZE(new_bp, size);
		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
		/* ZILOG2 checksums require a sufficiently new pool version. */
		BP_SET_CHECKSUM(new_bp,
		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
		    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(new_bp, 0);
		BP_SET_DEDUP(new_bp, 0);
		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
	}

	return (error);
}

/*
 * Free an intent log block.
 */
void
zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
{
	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
	ASSERT(!BP_IS_GANG(bp));

	zio_free(spa, txg, bp);
}

/*
 * ==========================================================================
 * Read, write and delete to physical devices
 * ==========================================================================
 */

/*
 * Pipeline stage that issues an I/O to its vdev.  With no vdev attached
 * (a logical I/O spanning all of the bp's DVAs), the I/O is handed to the
 * mirror ops, which handle multiple DVAs.  Leaf I/Os pass through the
 * vdev queue and, for reads, the vdev cache; TRIM bookkeeping is done via
 * the trim map.  Sub-ashift-sized I/Os are padded up to the device's
 * allocation size first.
 */
static int
zio_vdev_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	uint64_t align;
	spa_t *spa = zio->io_spa;
	int ret;

	ASSERT(zio->io_error == 0);
	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);

	if (vd == NULL) {
		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);

		/*
		 * The mirror_ops handle multiple DVAs in a single BP.
		 */
		return (vdev_mirror_ops.vdev_op_io_start(zio));
	}

	/*
	 * Immediate-priority frees on a leaf just record the range in the
	 * trim map; no device I/O is issued here.
	 */
	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
	    zio->io_priority == ZIO_PRIORITY_NOW) {
		trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * We keep track of time-sensitive I/Os so that the scan thread
	 * can quickly react to certain workloads.  In particular, we care
	 * about non-scrubbing, top-level reads and writes with the following
	 * characteristics:
	 *	- synchronous writes of user data to non-slog devices
	 *	- any reads of user data
	 * When these conditions are met, adjust the timestamp of spa_last_io
	 * which allows the scan thread to adjust its workload accordingly.
	 */
	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
	    vd == vd->vdev_top && !vd->vdev_islog &&
	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
	    zio->io_txg != spa_syncing_txg(spa)) {
		uint64_t old = spa->spa_last_io;
		uint64_t new = ddi_get_lbolt64();
		if (old != new)
			(void) atomic_cas_64(&spa->spa_last_io, old, new);
	}

	align = 1ULL << vd->vdev_top->vdev_ashift;

	if ((!(zio->io_flags & ZIO_FLAG_PHYSICAL) ||
	    (vd->vdev_top->vdev_physical_ashift > SPA_MINBLOCKSHIFT)) &&
	    P2PHASE(zio->io_size, align) != 0) {
		/* Transform logical writes to be a full physical block size. */
		uint64_t asize = P2ROUNDUP(zio->io_size, align);
		/* FREE needs no data buffer, so abuf stays NULL for it. */
		char *abuf = NULL;
		if (zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_WRITE)
			abuf = zio_buf_alloc(asize);
		ASSERT(vd == vd->vdev_top);
		if (zio->io_type == ZIO_TYPE_WRITE) {
			bcopy(zio->io_data, abuf, zio->io_size);
			bzero(abuf + zio->io_size, asize - zio->io_size);
		}
		zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
		    zio_subblock);
	}

	/*
	 * If this is not a physical io, make sure that it is properly aligned
	 * before proceeding.
	 */
	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
		ASSERT0(P2PHASE(zio->io_offset, align));
		ASSERT0(P2PHASE(zio->io_size, align));
	} else {
		/*
		 * For physical writes, we allow 512b aligned writes and assume
		 * the device will perform a read-modify-write as necessary.
		 */
		ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
		ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
	}

	VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));

	/*
	 * If this is a repair I/O, and there's no self-healing involved --
	 * that is, we're just resilvering what we expect to resilver --
	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
	 * This prevents spurious resilvering with nested replication.
	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
	 * A is out of date, we'll read from C+D, then use the data to
	 * resilver A+B -- but we don't actually want to resilver B, just A.
	 * The top-level mirror has no way to know this, so instead we just
	 * discard unnecessary repairs as we work our way down the vdev tree.
	 * The same logic applies to any form of nested replication:
	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
	 */
	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
	    zio->io_txg != 0 &&	/* not a delegated i/o */
	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
		zio_vdev_io_bypass(zio);
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (vd->vdev_ops->vdev_op_leaf) {
		switch (zio->io_type) {
		case ZIO_TYPE_READ:
			/* A cache hit completes the read with no device I/O. */
			if (vdev_cache_read(zio))
				return (ZIO_PIPELINE_CONTINUE);
			/* FALLTHROUGH */
		case ZIO_TYPE_WRITE:
		case ZIO_TYPE_FREE:
			/* NULL means the I/O was queued for aggregation. */
			if ((zio = vdev_queue_io(zio)) == NULL)
				return (ZIO_PIPELINE_STOP);

			if (!vdev_accessible(vd, zio)) {
				zio->io_error = SET_ERROR(ENXIO);
				zio_interrupt(zio);
				return (ZIO_PIPELINE_STOP);
			}
			break;
		}
		/*
		 * Note that we ignore repair writes for TRIM because they can
		 * conflict with normal writes. This isn't an issue because, by
		 * definition, we only repair blocks that aren't freed.
		 */
		if (zio->io_type == ZIO_TYPE_WRITE &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
		    !trim_map_write_start(zio))
			return (ZIO_PIPELINE_STOP);
	}

	ret = vd->vdev_ops->vdev_op_io_start(zio);
	ASSERT(ret == ZIO_PIPELINE_STOP);

	return (ret);
}

/*
 * Pipeline stage run when a vdev I/O completes: per-leaf bookkeeping
 * (trim map, vdev queue, vdev cache, fault injection), error
 * classification, and a device probe on unexpected errors.
 */
static int
zio_vdev_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_ops_t *ops = vd ?
vd->vdev_ops : &vdev_mirror_ops;
	boolean_t unexpected_error = B_FALSE;

	/* Wait for all child vdev I/Os before assessing this one. */
	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_FREE)) {

		/* Balances the trim_map_write_start() in zio_vdev_io_start(). */
		if (zio->io_type == ZIO_TYPE_WRITE &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
			trim_map_write_done(zio);

		vdev_queue_io_done(zio);

		if (zio->io_type == ZIO_TYPE_WRITE)
			vdev_cache_write(zio);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_device_injection(vd,
			    zio, EIO);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_label_injection(zio, EIO);

		if (zio->io_error) {
			if (zio->io_error == ENOTSUP &&
			    zio->io_type == ZIO_TYPE_FREE) {
				/* Not all devices support TRIM. */
			} else if (!vdev_accessible(vd, zio)) {
				zio->io_error = SET_ERROR(ENXIO);
			} else {
				unexpected_error = B_TRUE;
			}
		}
	}

	ops->vdev_op_io_done(zio);

	/* An error on an accessible device warrants a probe of its health. */
	if (unexpected_error)
		VERIFY(vdev_probe(vd, zio) == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * For non-raidz ZIOs, we can just copy aside the bad data read from the
 * disk, and use that to finish the checksum ereport later.
 */
static void
zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
    const void *good_buf)
{
	/* no processing needed */
	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
}

/*ARGSUSED*/
void
zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
{
	/* Copy of the bad data; freed via zcr_free when the report is done. */
	void *buf = zio_buf_alloc(zio->io_size);

	bcopy(zio->io_data, buf, zio->io_size);

	zcr->zcr_cbinfo = zio->io_size;
	zcr->zcr_cbdata = buf;
	zcr->zcr_finish = zio_vsd_default_cksum_finish;
	zcr->zcr_free = zio_buf_free;
}

/*
 * Pipeline stage that assesses the outcome of a vdev I/O: frees
 * vdev-specific data, applies fault injection, updates TRIM kstats,
 * decides whether to retry the I/O, and normalizes/propagates errors.
 */
static int
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	/* Release the config lock taken for vdev-less I/Os in io_start. */
	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
		spa_config_exit(zio->io_spa, SCL_ZIO, zio);

	if (zio->io_vsd != NULL) {
		zio->io_vsd_ops->vsd_free(zio);
		zio->io_vsd = NULL;
	}

	if (zio_injection_enabled && zio->io_error == 0)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	/* Account TRIM results (queued frees only, not PRIORITY_NOW ones). */
	if (zio->io_type == ZIO_TYPE_FREE &&
	    zio->io_priority != ZIO_PRIORITY_NOW) {
		switch (zio->io_error) {
		case 0:
			ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
			ZIO_TRIM_STAT_BUMP(success);
			break;
		case EOPNOTSUPP:
			ZIO_TRIM_STAT_BUMP(unsupported);
			break;
		default:
			ZIO_TRIM_STAT_BUMP(failed);
			break;
		}
	}

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 *
	 * On retry, we cut in line in the issue queue, since we don't want
	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
	 */
	if (zio->io_error && vd == NULL &&
	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
		zio->io_error = 0;
		zio->io_flags |= ZIO_FLAG_IO_RETRY |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
		/* Rewind the pipeline to just before VDEV_IO_START. */
		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
		    zio_requeue_io_start_cut_in_line);
		return (ZIO_PIPELINE_STOP);
	}

	/*
	 * If we got an error on a leaf device, convert it to ENXIO
	 * if the device is not accessible at all.
	 */
	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    !vdev_accessible(vd, zio))
		zio->io_error = SET_ERROR(ENXIO);

	/*
	 * If we can't write to an interior vdev (mirror or RAID-Z),
	 * set vdev_cant_write so that we stop trying to allocate from it.
	 */
	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
		vd->vdev_cant_write = B_TRUE;
	}

	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	/* Notify the physdone callback for leaf-level completion, if any. */
	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    zio->io_physdone != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
		ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
		zio->io_physdone(zio->io_logical);
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/* Rewind the pipeline so VDEV_IO_START runs again (retry the same vdev). */
void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage >>= 1;
}

/* Rewind the pipeline so VDEV_IO_DONE runs again. */
void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage >>= 1;
}

/* Skip the vdev I/O entirely and jump ahead to VDEV_IO_ASSESS. */
void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
}

/*
 * 
==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */

/*
 * Pipeline stage that computes the checksum for an outgoing write.
 * With no bp (zio_write_phys()) only OFF or LABEL checksums are valid;
 * gang headers always use the dedicated gang-header checksum.
 */
static int
zio_checksum_generate(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum;

	if (bp == NULL) {
		/*
		 * This is zio_write_phys().
		 * We're either generating a label checksum, or none at all.
		 */
		checksum = zio->io_prop.zp_checksum;

		if (checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
	} else {
		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
			ASSERT(!IO_IS_ALLOCATING(zio));
			checksum = ZIO_CHECKSUM_GANG_HEADER;
		} else {
			checksum = BP_GET_CHECKSUM(bp);
		}
	}

	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage that verifies the checksum of data just read.  A failure
 * sets io_error and, for non-speculative I/Os, starts a checksum ereport.
 */
static int
zio_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t info;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(zio->io_vd != NULL);

	if (bp == NULL) {
		/*
		 * This is zio_read_phys().
		 * We're either verifying a label checksum, or nothing at all.
		 */
		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
	}

	if ((error = zio_checksum_error(zio, &info)) != 0) {
		zio->io_error = error;
		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			zfs_ereport_start_checksum(zio->io_spa,
			    zio->io_vd, zio, zio->io_offset,
			    zio->io_size, NULL, &info);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}

/*
 * ==========================================================================
 * Error rank.  Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permament.  ECKSUM and EIO
 * indicate errors that are specific to one I/O, and most likely permanent.
 * Any other error is presumed to be worse because we weren't expecting it.
 * ==========================================================================
 */
int
zio_worst_error(int e1, int e2)
{
	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
	int r1, r2;

	/*
	 * Errors not in the rank table fall off the end of the loop and
	 * thus rank highest (worst), per the comment above.
	 */
	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
		if (e1 == zio_error_rank[r1])
			break;

	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
		if (e2 == zio_error_rank[r2])
			break;

	return (r1 > r2 ? e1 : e2);
}

/*
 * ==========================================================================
 * I/O completion
 * ==========================================================================
 */

/*
 * Pipeline stage run when a zio becomes READY: invokes the io_ready
 * callback, snapshots the bp, and notifies all current parents that this
 * child is ready.
 */
static int
zio_ready(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	zio_t *pio, *pio_next;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (zio->io_ready) {
		ASSERT(IO_IS_ALLOCATING(zio));
		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);

		zio->io_ready(zio);
	}

	/* Snapshot the bp so zio_done() can verify it wasn't modified. */
	if (bp != NULL && bp != &zio->io_bp_copy)
		zio->io_bp_copy = *bp;

	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_READY] = 1;
	pio = zio_walk_parents(zio);
	mutex_exit(&zio->io_lock);

	/*
	 * As we notify zio's parents, new parents could be added.
	 * New parents go to the head of zio's io_parent_list, however,
	 * so we will (correctly) not notify them.  The remainder of zio's
	 * io_parent_list, from 'pio_next' onward, cannot change because
	 * all parents must wait for us to be done before they can be done.
	 */
	for (; pio != NULL; pio = pio_next) {
		pio_next = zio_walk_parents(zio);
		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
	}

	if (zio->io_flags & ZIO_FLAG_NODATA) {
		if (BP_IS_GANG(bp)) {
			zio->io_flags &= ~ZIO_FLAG_NODATA;
		} else {
			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		}
	}

	if (zio_injection_enabled &&
	    zio->io_spa->spa_syncing_txg == zio->io_txg)
		zio_handle_ignored_writes(zio);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Final pipeline stage: waits for all children, sanity-checks the bp,
 * inherits child errors, generates ereports, and decides whether the
 * I/O must be reexecuted or suspended.
 */
static int
zio_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_t *lio = zio->io_logical;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;
	uint64_t psize = zio->io_size;
	zio_t *pio, *pio_next;

	/*
	 * If our children haven't all completed,
	 * wait for them and then repeat this pipeline stage.
3076168404Spjd */ 3077185029Spjd if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 3078185029Spjd zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 3079219089Spjd zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 3080185029Spjd zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 3081185029Spjd return (ZIO_PIPELINE_STOP); 3082185029Spjd 3083185029Spjd for (int c = 0; c < ZIO_CHILD_TYPES; c++) 3084185029Spjd for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3085185029Spjd ASSERT(zio->io_children[c][w] == 0); 3086185029Spjd 3087268649Sdelphij if (bp != NULL && !BP_IS_EMBEDDED(bp)) { 3088185029Spjd ASSERT(bp->blk_pad[0] == 0); 3089185029Spjd ASSERT(bp->blk_pad[1] == 0); 3090185029Spjd ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 3091209962Smm (bp == zio_unique_parent(zio)->io_bp)); 3092185029Spjd if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 3093219089Spjd zio->io_bp_override == NULL && 3094185029Spjd !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 3095185029Spjd ASSERT(!BP_SHOULD_BYTESWAP(bp)); 3096219089Spjd ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 3097185029Spjd ASSERT(BP_COUNT_GANG(bp) == 0 || 3098185029Spjd (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 3099185029Spjd } 3100243524Smm if (zio->io_flags & ZIO_FLAG_NOPWRITE) 3101243524Smm VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 3102168404Spjd } 3103168404Spjd 3104185029Spjd /* 3105219089Spjd * If there were child vdev/gang/ddt errors, they apply to us now. 3106185029Spjd */ 3107185029Spjd zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 3108185029Spjd zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 3109219089Spjd zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 3110168404Spjd 3111219089Spjd /* 3112219089Spjd * If the I/O on the transformed data was successful, generate any 3113219089Spjd * checksum reports now while we still have the transformed data. 
3114219089Spjd */ 3115219089Spjd if (zio->io_error == 0) { 3116219089Spjd while (zio->io_cksum_report != NULL) { 3117219089Spjd zio_cksum_report_t *zcr = zio->io_cksum_report; 3118219089Spjd uint64_t align = zcr->zcr_align; 3119219089Spjd uint64_t asize = P2ROUNDUP(psize, align); 3120219089Spjd char *abuf = zio->io_data; 3121219089Spjd 3122219089Spjd if (asize != psize) { 3123219089Spjd abuf = zio_buf_alloc(asize); 3124219089Spjd bcopy(zio->io_data, abuf, psize); 3125219089Spjd bzero(abuf + psize, asize - psize); 3126219089Spjd } 3127219089Spjd 3128219089Spjd zio->io_cksum_report = zcr->zcr_next; 3129219089Spjd zcr->zcr_next = NULL; 3130219089Spjd zcr->zcr_finish(zcr, abuf); 3131219089Spjd zfs_ereport_free_checksum(zcr); 3132219089Spjd 3133219089Spjd if (asize != psize) 3134219089Spjd zio_buf_free(abuf, asize); 3135219089Spjd } 3136219089Spjd } 3137219089Spjd 3138185029Spjd zio_pop_transforms(zio); /* note: may set zio->io_error */ 3139168404Spjd 3140185029Spjd vdev_stat_update(zio, psize); 3141185029Spjd 3142168404Spjd if (zio->io_error) { 3143185029Spjd /* 3144185029Spjd * If this I/O is attached to a particular vdev, 3145185029Spjd * generate an error message describing the I/O failure 3146185029Spjd * at the block level. We ignore these errors if the 3147185029Spjd * device is currently unavailable. 3148185029Spjd */ 3149185029Spjd if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3150185029Spjd zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3151185029Spjd 3152219089Spjd if ((zio->io_error == EIO || !(zio->io_flags & 3153219089Spjd (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3154219089Spjd zio == lio) { 3155185029Spjd /* 3156185029Spjd * For logical I/O requests, tell the SPA to log the 3157185029Spjd * error and generate a logical data ereport. 
3158185029Spjd */ 3159185029Spjd spa_log_error(spa, zio); 3160185029Spjd zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3161185029Spjd 0, 0); 3162185029Spjd } 3163168404Spjd } 3164168404Spjd 3165185029Spjd if (zio->io_error && zio == lio) { 3166185029Spjd /* 3167185029Spjd * Determine whether zio should be reexecuted. This will 3168185029Spjd * propagate all the way to the root via zio_notify_parent(). 3169185029Spjd */ 3170185029Spjd ASSERT(vd == NULL && bp != NULL); 3171219089Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3172168404Spjd 3173219089Spjd if (IO_IS_ALLOCATING(zio) && 3174219089Spjd !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3175185029Spjd if (zio->io_error != ENOSPC) 3176185029Spjd zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3177185029Spjd else 3178185029Spjd zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3179219089Spjd } 3180168404Spjd 3181185029Spjd if ((zio->io_type == ZIO_TYPE_READ || 3182185029Spjd zio->io_type == ZIO_TYPE_FREE) && 3183219089Spjd !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3184185029Spjd zio->io_error == ENXIO && 3185219089Spjd spa_load_state(spa) == SPA_LOAD_NONE && 3186185029Spjd spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3187185029Spjd zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3188185029Spjd 3189185029Spjd if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3190185029Spjd zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3191219089Spjd 3192219089Spjd /* 3193219089Spjd * Here is a possibly good place to attempt to do 3194219089Spjd * either combinatorial reconstruction or error correction 3195219089Spjd * based on checksums. It also might be a good place 3196219089Spjd * to send out preliminary ereports before we suspend 3197219089Spjd * processing. 3198219089Spjd */ 3199185029Spjd } 3200185029Spjd 3201168404Spjd /* 3202185029Spjd * If there were logical child errors, they apply to us now. 
3203185029Spjd * We defer this until now to avoid conflating logical child 3204185029Spjd * errors with errors that happened to the zio itself when 3205185029Spjd * updating vdev stats and reporting FMA events above. 3206168404Spjd */ 3207185029Spjd zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3208185029Spjd 3209219089Spjd if ((zio->io_error || zio->io_reexecute) && 3210219089Spjd IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3211243524Smm !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3212209962Smm zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3213209962Smm 3214209962Smm zio_gang_tree_free(&zio->io_gang_tree); 3215209962Smm 3216209962Smm /* 3217209962Smm * Godfather I/Os should never suspend. 3218209962Smm */ 3219209962Smm if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3220209962Smm (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3221209962Smm zio->io_reexecute = 0; 3222209962Smm 3223185029Spjd if (zio->io_reexecute) { 3224185029Spjd /* 3225185029Spjd * This is a logical I/O that wants to reexecute. 3226185029Spjd * 3227185029Spjd * Reexecute is top-down. When an i/o fails, if it's not 3228185029Spjd * the root, it simply notifies its parent and sticks around. 3229185029Spjd * The parent, seeing that it still has children in zio_done(), 3230185029Spjd * does the same. This percolates all the way up to the root. 3231185029Spjd * The root i/o will reexecute or suspend the entire tree. 3232185029Spjd * 3233185029Spjd * This approach ensures that zio_reexecute() honors 3234185029Spjd * all the original i/o dependency relationships, e.g. 3235185029Spjd * parents not executing until children are ready. 
3236185029Spjd */ 3237185029Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3238185029Spjd 3239209962Smm zio->io_gang_leader = NULL; 3240185029Spjd 3241209962Smm mutex_enter(&zio->io_lock); 3242209962Smm zio->io_state[ZIO_WAIT_DONE] = 1; 3243209962Smm mutex_exit(&zio->io_lock); 3244185029Spjd 3245209962Smm /* 3246209962Smm * "The Godfather" I/O monitors its children but is 3247209962Smm * not a true parent to them. It will track them through 3248209962Smm * the pipeline but severs its ties whenever they get into 3249209962Smm * trouble (e.g. suspended). This allows "The Godfather" 3250209962Smm * I/O to return status without blocking. 3251209962Smm */ 3252209962Smm for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3253209962Smm zio_link_t *zl = zio->io_walk_link; 3254209962Smm pio_next = zio_walk_parents(zio); 3255209962Smm 3256209962Smm if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3257209962Smm (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3258209962Smm zio_remove_child(pio, zio, zl); 3259209962Smm zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3260209962Smm } 3261209962Smm } 3262209962Smm 3263209962Smm if ((pio = zio_unique_parent(zio)) != NULL) { 3264185029Spjd /* 3265185029Spjd * We're not a root i/o, so there's nothing to do 3266185029Spjd * but notify our parent. Don't propagate errors 3267185029Spjd * upward since we haven't permanently failed yet. 3268185029Spjd */ 3269209962Smm ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3270185029Spjd zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3271185029Spjd zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3272185029Spjd } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3273185029Spjd /* 3274185029Spjd * We'd fail again if we reexecuted now, so suspend 3275185029Spjd * until conditions improve (e.g. device comes online). 3276185029Spjd */ 3277185029Spjd zio_suspend(spa, zio); 3278185029Spjd } else { 3279185029Spjd /* 3280185029Spjd * Reexecution is potentially a huge amount of work. 
3281185029Spjd * Hand it off to the otherwise-unused claim taskq. 3282185029Spjd */ 3283260742Savg#if defined(illumos) || !defined(_KERNEL) 3284260742Savg ASSERT(zio->io_tqent.tqent_next == NULL); 3285216919Smm#else 3286260742Savg ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 3287260742Savg#endif 3288260750Savg spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3289260750Savg ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3290260750Savg 0, &zio->io_tqent); 3291185029Spjd } 3292185029Spjd return (ZIO_PIPELINE_STOP); 3293168404Spjd } 3294168404Spjd 3295219089Spjd ASSERT(zio->io_child_count == 0); 3296185029Spjd ASSERT(zio->io_reexecute == 0); 3297185029Spjd ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3298168404Spjd 3299209962Smm /* 3300219089Spjd * Report any checksum errors, since the I/O is complete. 3301219089Spjd */ 3302219089Spjd while (zio->io_cksum_report != NULL) { 3303219089Spjd zio_cksum_report_t *zcr = zio->io_cksum_report; 3304219089Spjd zio->io_cksum_report = zcr->zcr_next; 3305219089Spjd zcr->zcr_next = NULL; 3306219089Spjd zcr->zcr_finish(zcr, NULL); 3307219089Spjd zfs_ereport_free_checksum(zcr); 3308219089Spjd } 3309219089Spjd 3310219089Spjd /* 3311209962Smm * It is the responsibility of the done callback to ensure that this 3312209962Smm * particular zio is no longer discoverable for adoption, and as 3313209962Smm * such, cannot acquire any new parents. 
3314209962Smm */ 3315185029Spjd if (zio->io_done) 3316185029Spjd zio->io_done(zio); 3317168404Spjd 3318209962Smm mutex_enter(&zio->io_lock); 3319209962Smm zio->io_state[ZIO_WAIT_DONE] = 1; 3320209962Smm mutex_exit(&zio->io_lock); 3321168404Spjd 3322209962Smm for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3323209962Smm zio_link_t *zl = zio->io_walk_link; 3324209962Smm pio_next = zio_walk_parents(zio); 3325209962Smm zio_remove_child(pio, zio, zl); 3326185029Spjd zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3327168404Spjd } 3328168404Spjd 3329185029Spjd if (zio->io_waiter != NULL) { 3330185029Spjd mutex_enter(&zio->io_lock); 3331185029Spjd zio->io_executor = NULL; 3332185029Spjd cv_broadcast(&zio->io_cv); 3333185029Spjd mutex_exit(&zio->io_lock); 3334185029Spjd } else { 3335185029Spjd zio_destroy(zio); 3336168404Spjd } 3337168404Spjd 3338185029Spjd return (ZIO_PIPELINE_STOP); 3339168404Spjd} 3340168404Spjd 3341168404Spjd/* 3342185029Spjd * ========================================================================== 3343185029Spjd * I/O pipeline definition 3344185029Spjd * ========================================================================== 3345168404Spjd */ 3346219089Spjdstatic zio_pipe_stage_t *zio_pipeline[] = { 3347185029Spjd NULL, 3348219089Spjd zio_read_bp_init, 3349219089Spjd zio_free_bp_init, 3350185029Spjd zio_issue_async, 3351185029Spjd zio_write_bp_init, 3352185029Spjd zio_checksum_generate, 3353243524Smm zio_nop_write, 3354219089Spjd zio_ddt_read_start, 3355219089Spjd zio_ddt_read_done, 3356219089Spjd zio_ddt_write, 3357219089Spjd zio_ddt_free, 3358185029Spjd zio_gang_assemble, 3359185029Spjd zio_gang_issue, 3360185029Spjd zio_dva_allocate, 3361185029Spjd zio_dva_free, 3362185029Spjd zio_dva_claim, 3363185029Spjd zio_ready, 3364185029Spjd zio_vdev_io_start, 3365185029Spjd zio_vdev_io_done, 3366185029Spjd zio_vdev_io_assess, 3367185029Spjd zio_checksum_verify, 3368185029Spjd zio_done 3369185029Spjd}; 3370236884Smm 3371236884Smm/* dnp is 
the dnode for zb1->zb_object */
boolean_t
zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
    const zbookmark_phys_t *zb2)
{
	uint64_t next_l0_blkid, target_obj;

	ASSERT(zb1->zb_objset == zb2->zb_objset);
	ASSERT(zb2->zb_level == 0);

	/* The objset_phys_t isn't before anything. */
	if (dnp == NULL)
		return (B_FALSE);

	/*
	 * Convert zb1's position to an L0 block id: the shift by
	 * level * (indblkshift - SPA_BLKPTRSHIFT) accounts for the
	 * per-level fan-out, so this is the first L0 block id past zb1.
	 */
	next_l0_blkid = (zb1->zb_blkid + 1) <<
	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));

	/*
	 * The object zb2 refers to; a zero zb_object means zb2 is in the
	 * meta-dnode, whose L0 blocks each hold a run of dnodes.
	 */
	target_obj = (zb2->zb_object != 0) ? zb2->zb_object :
	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);

	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
		/* Translate zb1's L0 block id into an object number. */
		uint64_t next_obj = next_l0_blkid *
		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
		return (next_obj <= target_obj);
	}

	/* Different objects order by object number alone. */
	if (zb1->zb_object != target_obj)
		return (zb1->zb_object < target_obj ? B_TRUE : B_FALSE);

	/* Same object: the meta-dnode sorts after a plain object. */
	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
		return (B_FALSE);

	return (next_l0_blkid <= zb2->zb_blkid);
}