1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23304139Savg * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 24260742Savg * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 25297112Smav * Copyright (c) 2014 Integros [integros.com] 26168404Spjd */ 27168404Spjd 28277547Sdelphij#include <sys/sysmacros.h> 29168404Spjd#include <sys/zfs_context.h> 30168404Spjd#include <sys/fm/fs/zfs.h> 31168404Spjd#include <sys/spa.h> 32168404Spjd#include <sys/txg.h> 33168404Spjd#include <sys/spa_impl.h> 34168404Spjd#include <sys/vdev_impl.h> 35168404Spjd#include <sys/zio_impl.h> 36168404Spjd#include <sys/zio_compress.h> 37168404Spjd#include <sys/zio_checksum.h> 38219089Spjd#include <sys/dmu_objset.h> 39219089Spjd#include <sys/arc.h> 40219089Spjd#include <sys/ddt.h> 41240868Spjd#include <sys/trim_map.h> 42268649Sdelphij#include <sys/blkptr.h> 43263397Sdelphij#include <sys/zfeature.h> 44307279Smav#include <sys/metaslab_impl.h> 45168404Spjd 46208148SpjdSYSCTL_DECL(_vfs_zfs); 47208148SpjdSYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); 48260338Smav#if defined(__amd64__) 49260338Smavstatic int zio_use_uma = 1; 50260338Smav#else 51209261Spjdstatic int zio_use_uma = 0; 52260338Smav#endif 53208148SpjdTUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma); 54208148SpjdSYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, 55208148Spjd "Use uma(9) for ZIO allocations"); 56230647Skmacystatic int zio_exclude_metadata = 0; 57230647SkmacyTUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata); 58230647SkmacySYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, 59230647Skmacy "Exclude metadata buffers from dumps as well"); 60208148Spjd 61240868Spjdzio_trim_stats_t zio_trim_stats = { 62244155Ssmh { "bytes", KSTAT_DATA_UINT64, 63244155Ssmh "Number of bytes successfully TRIMmed" }, 64244155Ssmh { "success", KSTAT_DATA_UINT64, 65244155Ssmh "Number of successful TRIM requests" }, 66244155Ssmh { "unsupported", KSTAT_DATA_UINT64, 67244155Ssmh "Number of TRIM requests that failed because TRIM is not supported" }, 68244155Ssmh { "failed", KSTAT_DATA_UINT64, 69244155Ssmh "Number of TRIM requests that failed for reasons other than not supported" }, 70240868Spjd}; 71240868Spjd 72240868Spjdstatic kstat_t *zio_trim_ksp; 73240868Spjd 74240868Spjd/* 75168404Spjd * ========================================================================== 76168404Spjd * I/O type descriptions 77168404Spjd * ========================================================================== 78168404Spjd */ 79260763Savgconst char *zio_type_name[ZIO_TYPES] = { 80211931Smm "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", 81211931Smm "zio_ioctl" 82211931Smm}; 83168404Spjd 84307279Smavboolean_t zio_dva_throttle_enabled = B_TRUE; 85307279SmavSYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RDTUN, 86307279Smav &zio_dva_throttle_enabled, 0, ""); 87307279Smav 88168404Spjd/* 89168404Spjd * ========================================================================== 90168404Spjd * I/O kmem caches 91168404Spjd * ========================================================================== 92168404Spjd */ 93168926Spjdkmem_cache_t *zio_cache; 94209962Smmkmem_cache_t *zio_link_cache; 95168404Spjdkmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 96168404Spjdkmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 97168404Spjd 98168404Spjd#ifdef _KERNEL 99168404Spjdextern vmem_t *zio_alloc_arena; 100168404Spjd#endif 101168404Spjd 102297078Smav#define ZIO_PIPELINE_CONTINUE 0x100 103297078Smav#define ZIO_PIPELINE_STOP 0x101 104297078Smav 105288571Smav#define BP_SPANB(indblkshift, level) \ 106288571Smav (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) 107288571Smav#define COMPARE_META_LEVEL 0x80000000ul 108185029Spjd/* 109243503Smm * The following actions directly effect the spa's sync-to-convergence logic. 110243503Smm * The values below define the sync pass when we start performing the action. 111243503Smm * Care should be taken when changing these values as they directly impact 112243503Smm * spa_sync() performance. Tuning these values may introduce subtle performance 113243503Smm * pathologies and should only be done in the context of performance analysis. 114243503Smm * These tunables will eventually be removed and replaced with #defines once 115243503Smm * enough analysis has been done to determine optimal values. 116243503Smm * 117243503Smm * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that 118243503Smm * regular blocks are not deferred. 119243503Smm */ 120243503Smmint zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ 121243503SmmTUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free); 122243503SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, 123243503Smm &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); 124243503Smmint zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ 125243503SmmTUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress); 126243503SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, 127243503Smm &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); 128243503Smmint zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ 129243503SmmTUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite); 130243503SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, 131243503Smm &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); 132243503Smm 133243503Smm/* 134185029Spjd * An allocating zio is one that either currently has the DVA allocate 135185029Spjd * stage set or will have it later in its lifetime. 136185029Spjd */ 137219089Spjd#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 138185029Spjd 139219089Spjdboolean_t zio_requeue_io_start_cut_in_line = B_TRUE; 140219089Spjd 141299061Savg#ifdef illumos 142219089Spjd#ifdef ZFS_DEBUG 143219089Spjdint zio_buf_debug_limit = 16384; 144219089Spjd#else 145219089Spjdint zio_buf_debug_limit = 0; 146219089Spjd#endif 147299061Savg#endif 148219089Spjd 149307279Smavstatic void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); 150307279Smav 151168404Spjdvoid 152168404Spjdzio_init(void) 153168404Spjd{ 154168404Spjd size_t c; 155209962Smm zio_cache = kmem_cache_create("zio_cache", 156209962Smm sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 157209962Smm zio_link_cache = kmem_cache_create("zio_link_cache", 158209962Smm sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 159250149Sdavide if (!zio_use_uma) 160250149Sdavide goto out; 161168926Spjd 162168404Spjd /* 163168404Spjd * For small buffers, we want a cache for each multiple of 164276081Sdelphij * SPA_MINBLOCKSIZE. For larger buffers, we want a cache 165276081Sdelphij * for each quarter-power of 2. 166168404Spjd */ 167168404Spjd for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 168168404Spjd size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 169168404Spjd size_t p2 = size; 170168404Spjd size_t align = 0; 171299061Savg int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0; 172168404Spjd 173277547Sdelphij while (!ISP2(p2)) 174168404Spjd p2 &= p2 - 1; 175168404Spjd 176240133Smm#ifdef illumos 177240133Smm#ifndef _KERNEL 178240133Smm /* 179240133Smm * If we are using watchpoints, put each buffer on its own page, 180240133Smm * to eliminate the performance overhead of trapping to the 181240133Smm * kernel when modifying a non-watched buffer that shares the 182240133Smm * page with a watched buffer. 183240133Smm */ 184240133Smm if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) 185240133Smm continue; 186240133Smm#endif 187240133Smm#endif /* illumos */ 188168404Spjd if (size <= 4 * SPA_MINBLOCKSIZE) { 189168404Spjd align = SPA_MINBLOCKSIZE; 190240133Smm } else if (IS_P2ALIGNED(size, p2 >> 2)) { 191276081Sdelphij align = MIN(p2 >> 2, PAGESIZE); 192168404Spjd } 193168404Spjd 194168404Spjd if (align != 0) { 195168404Spjd char name[36]; 196168404Spjd (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 197168404Spjd zio_buf_cache[c] = kmem_cache_create(name, size, 198219089Spjd align, NULL, NULL, NULL, NULL, NULL, cflags); 199168404Spjd 200219089Spjd /* 201219089Spjd * Since zio_data bufs do not appear in crash dumps, we 202219089Spjd * pass KMC_NOTOUCH so that no allocator metadata is 203219089Spjd * stored with the buffers. 204219089Spjd */ 205168404Spjd (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 206168404Spjd zio_data_buf_cache[c] = kmem_cache_create(name, size, 207219089Spjd align, NULL, NULL, NULL, NULL, NULL, 208230689Skmacy cflags | KMC_NOTOUCH | KMC_NODEBUG); 209168404Spjd } 210168404Spjd } 211168404Spjd 212168404Spjd while (--c != 0) { 213168404Spjd ASSERT(zio_buf_cache[c] != NULL); 214168404Spjd if (zio_buf_cache[c - 1] == NULL) 215168404Spjd zio_buf_cache[c - 1] = zio_buf_cache[c]; 216168404Spjd 217168404Spjd ASSERT(zio_data_buf_cache[c] != NULL); 218168404Spjd if (zio_data_buf_cache[c - 1] == NULL) 219168404Spjd zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 220168404Spjd } 221250149Sdavideout: 222208458Spjd 223168404Spjd zio_inject_init(); 224240868Spjd 225240868Spjd zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", 226240868Spjd KSTAT_TYPE_NAMED, 227240868Spjd sizeof(zio_trim_stats) / sizeof(kstat_named_t), 228240868Spjd KSTAT_FLAG_VIRTUAL); 229240868Spjd 230240868Spjd if (zio_trim_ksp != NULL) { 231240868Spjd zio_trim_ksp->ks_data = &zio_trim_stats; 232240868Spjd kstat_install(zio_trim_ksp); 233240868Spjd } 234168404Spjd} 235168404Spjd 236168404Spjdvoid 237168404Spjdzio_fini(void) 238168404Spjd{ 239168404Spjd size_t c; 240168404Spjd kmem_cache_t *last_cache = NULL; 241168404Spjd kmem_cache_t *last_data_cache = NULL; 242168404Spjd 243168404Spjd for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 244168404Spjd if (zio_buf_cache[c] != last_cache) { 245168404Spjd last_cache = zio_buf_cache[c]; 246168404Spjd kmem_cache_destroy(zio_buf_cache[c]); 247168404Spjd } 248168404Spjd zio_buf_cache[c] = NULL; 249168404Spjd 250168404Spjd if (zio_data_buf_cache[c] != last_data_cache) { 251168404Spjd last_data_cache = zio_data_buf_cache[c]; 252168404Spjd kmem_cache_destroy(zio_data_buf_cache[c]); 253168404Spjd } 254168404Spjd zio_data_buf_cache[c] = NULL; 255168404Spjd } 256168404Spjd 257209962Smm kmem_cache_destroy(zio_link_cache); 258168926Spjd kmem_cache_destroy(zio_cache); 259168926Spjd 260168404Spjd zio_inject_fini(); 261240868Spjd 262240868Spjd if (zio_trim_ksp != NULL) { 263240868Spjd kstat_delete(zio_trim_ksp); 264240868Spjd zio_trim_ksp = NULL; 265240868Spjd } 266168404Spjd} 267168404Spjd 268168404Spjd/* 269168404Spjd * ========================================================================== 270168404Spjd * Allocate and free I/O buffers 271168404Spjd * ========================================================================== 272168404Spjd */ 273168404Spjd 274168404Spjd/* 275168404Spjd * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 276168404Spjd * crashdump if the kernel panics, so use it judiciously. Obviously, it's 277168404Spjd * useful to inspect ZFS metadata, but if possible, we should avoid keeping 278168404Spjd * excess / transient data in-core during a crashdump. 279168404Spjd */ 280316850Savgstatic void * 281316850Savgzio_buf_alloc_impl(size_t size, boolean_t canwait) 282168404Spjd{ 283168404Spjd size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 284230647Skmacy int flags = zio_exclude_metadata ? KM_NODEBUG : 0; 285168404Spjd 286277582Sdelphij VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 287168404Spjd 288316850Savg if (zio_use_uma) { 289316850Savg return (kmem_cache_alloc(zio_buf_cache[c], 290316850Savg canwait ? KM_PUSHPAGE : KM_NOSLEEP)); 291316850Savg } else { 292316850Savg return (kmem_alloc(size, 293316850Savg (canwait ? KM_SLEEP : KM_NOSLEEP) | flags)); 294316850Savg } 295168404Spjd} 296168404Spjd 297316850Savgvoid * 298316850Savgzio_buf_alloc(size_t size) 299316850Savg{ 300316850Savg return (zio_buf_alloc_impl(size, B_TRUE)); 301316850Savg} 302316850Savg 303316850Savgvoid * 304316850Savgzio_buf_alloc_nowait(size_t size) 305316850Savg{ 306316850Savg return (zio_buf_alloc_impl(size, B_FALSE)); 307316850Savg} 308316850Savg 309168404Spjd/* 310168404Spjd * Use zio_data_buf_alloc to allocate data. The data will not appear in a 311168404Spjd * crashdump if the kernel panics. This exists so that we will limit the amount 312168404Spjd * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 313168404Spjd * of kernel heap dumped to disk when the kernel panics) 314168404Spjd */ 315168404Spjdvoid * 316168404Spjdzio_data_buf_alloc(size_t size) 317168404Spjd{ 318168404Spjd size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 319168404Spjd 320277582Sdelphij VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 321168404Spjd 322208148Spjd if (zio_use_uma) 323208148Spjd return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 324208148Spjd else 325230623Skmacy return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); 326168404Spjd} 327168404Spjd 328168404Spjdvoid 329168404Spjdzio_buf_free(void *buf, size_t size) 330168404Spjd{ 331168404Spjd size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 332168404Spjd 333277582Sdelphij VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 334168404Spjd 335208148Spjd if (zio_use_uma) 336208148Spjd kmem_cache_free(zio_buf_cache[c], buf); 337208148Spjd else 338208148Spjd kmem_free(buf, size); 339168404Spjd} 340168404Spjd 341168404Spjdvoid 342168404Spjdzio_data_buf_free(void *buf, size_t size) 343168404Spjd{ 344168404Spjd size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 345168404Spjd 346277582Sdelphij VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 347168404Spjd 348208148Spjd if (zio_use_uma) 349208148Spjd kmem_cache_free(zio_data_buf_cache[c], buf); 350208148Spjd else 351208148Spjd kmem_free(buf, size); 352168404Spjd} 353168404Spjd 354168404Spjd/* 355168404Spjd * ========================================================================== 356168404Spjd * Push and pop I/O transform buffers 357168404Spjd * ========================================================================== 358168404Spjd */ 359307266Smavvoid 360185029Spjdzio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 361290765Smav zio_transform_func_t *transform) 362168404Spjd{ 363168404Spjd zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 364168404Spjd 365185029Spjd zt->zt_orig_data = zio->io_data; 366185029Spjd zt->zt_orig_size = zio->io_size; 367168404Spjd zt->zt_bufsize = bufsize; 368185029Spjd zt->zt_transform = transform; 369168404Spjd 370168404Spjd zt->zt_next = zio->io_transform_stack; 371168404Spjd zio->io_transform_stack = zt; 372168404Spjd 373168404Spjd zio->io_data = data; 374168404Spjd zio->io_size = size; 375168404Spjd} 376168404Spjd 377307266Smavvoid 378185029Spjdzio_pop_transforms(zio_t *zio) 379168404Spjd{ 380185029Spjd zio_transform_t *zt; 381168404Spjd 382185029Spjd while ((zt = zio->io_transform_stack) != NULL) { 383185029Spjd if (zt->zt_transform != NULL) 384185029Spjd zt->zt_transform(zio, 385185029Spjd zt->zt_orig_data, zt->zt_orig_size); 386168404Spjd 387219089Spjd if (zt->zt_bufsize != 0) 388219089Spjd zio_buf_free(zio->io_data, zt->zt_bufsize); 389168404Spjd 390185029Spjd zio->io_data = zt->zt_orig_data; 391185029Spjd zio->io_size = zt->zt_orig_size; 392185029Spjd zio->io_transform_stack = zt->zt_next; 393185029Spjd 394185029Spjd kmem_free(zt, sizeof (zio_transform_t)); 395168404Spjd } 396168404Spjd} 397168404Spjd 398185029Spjd/* 399185029Spjd * ========================================================================== 400185029Spjd * I/O transform callbacks for subblocks and decompression 401185029Spjd * ========================================================================== 402185029Spjd */ 403168404Spjdstatic void 404185029Spjdzio_subblock(zio_t *zio, void *data, uint64_t size) 405168404Spjd{ 406185029Spjd ASSERT(zio->io_size > size); 407168404Spjd 408185029Spjd if (zio->io_type == ZIO_TYPE_READ) 409185029Spjd bcopy(zio->io_data, data, size); 410185029Spjd} 411168404Spjd 412185029Spjdstatic void 413185029Spjdzio_decompress(zio_t *zio, void *data, uint64_t size) 414185029Spjd{ 415185029Spjd if (zio->io_error == 0 && 416185029Spjd zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 417219089Spjd zio->io_data, data, zio->io_size, size) != 0) 418249195Smm zio->io_error = SET_ERROR(EIO); 419185029Spjd} 420185029Spjd 421185029Spjd/* 422185029Spjd * ========================================================================== 423185029Spjd * I/O parent/child relationships and pipeline interlocks 424185029Spjd * ========================================================================== 425185029Spjd */ 426209962Smmzio_t * 427307279Smavzio_walk_parents(zio_t *cio, zio_link_t **zl) 428209962Smm{ 429209962Smm list_t *pl = &cio->io_parent_list; 430185029Spjd 431307279Smav *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); 432307279Smav if (*zl == NULL) 433209962Smm return (NULL); 434209962Smm 435307279Smav ASSERT((*zl)->zl_child == cio); 436307279Smav return ((*zl)->zl_parent); 437209962Smm} 438209962Smm 439209962Smmzio_t * 440307279Smavzio_walk_children(zio_t *pio, zio_link_t **zl) 441185029Spjd{ 442209962Smm list_t *cl = &pio->io_child_list; 443209962Smm 444307279Smav *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl); 445307279Smav if (*zl == NULL) 446209962Smm return (NULL); 447209962Smm 448307279Smav ASSERT((*zl)->zl_parent == pio); 449307279Smav return ((*zl)->zl_child); 450209962Smm} 451209962Smm 452209962Smmzio_t * 453209962Smmzio_unique_parent(zio_t *cio) 454209962Smm{ 455307279Smav zio_link_t *zl = NULL; 456307279Smav zio_t *pio = zio_walk_parents(cio, &zl); 457209962Smm 458307279Smav VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); 459209962Smm return (pio); 460209962Smm} 461209962Smm 462209962Smmvoid 463209962Smmzio_add_child(zio_t *pio, zio_t *cio) 464209962Smm{ 465209962Smm zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 466209962Smm 467209962Smm /* 468209962Smm * Logical I/Os can have logical, gang, or vdev children. 469209962Smm * Gang I/Os can have gang or vdev children. 470209962Smm * Vdev I/Os can only have vdev children. 471209962Smm * The following ASSERT captures all of these constraints. 472209962Smm */ 473209962Smm ASSERT(cio->io_child_type <= pio->io_child_type); 474209962Smm 475209962Smm zl->zl_parent = pio; 476209962Smm zl->zl_child = cio; 477209962Smm 478209962Smm mutex_enter(&cio->io_lock); 479185029Spjd mutex_enter(&pio->io_lock); 480209962Smm 481209962Smm ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 482209962Smm 483209962Smm for (int w = 0; w < ZIO_WAIT_TYPES; w++) 484209962Smm pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 485209962Smm 486209962Smm list_insert_head(&pio->io_child_list, zl); 487209962Smm list_insert_head(&cio->io_parent_list, zl); 488209962Smm 489219089Spjd pio->io_child_count++; 490219089Spjd cio->io_parent_count++; 491219089Spjd 492185029Spjd mutex_exit(&pio->io_lock); 493209962Smm mutex_exit(&cio->io_lock); 494185029Spjd} 495185029Spjd 496185029Spjdstatic void 497209962Smmzio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) 498185029Spjd{ 499209962Smm ASSERT(zl->zl_parent == pio); 500209962Smm ASSERT(zl->zl_child == cio); 501185029Spjd 502209962Smm mutex_enter(&cio->io_lock); 503209962Smm mutex_enter(&pio->io_lock); 504185029Spjd 505209962Smm list_remove(&pio->io_child_list, zl); 506209962Smm list_remove(&cio->io_parent_list, zl); 507209962Smm 508219089Spjd pio->io_child_count--; 509219089Spjd cio->io_parent_count--; 510219089Spjd 511185029Spjd mutex_exit(&pio->io_lock); 512209962Smm mutex_exit(&cio->io_lock); 513209962Smm 514209962Smm kmem_cache_free(zio_link_cache, zl); 515185029Spjd} 516185029Spjd 517185029Spjdstatic boolean_t 518330238Savgzio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) 519185029Spjd{ 520185029Spjd boolean_t waiting = B_FALSE; 521185029Spjd 522185029Spjd mutex_enter(&zio->io_lock); 523185029Spjd ASSERT(zio->io_stall == NULL); 524330238Savg for (int c = 0; c < ZIO_CHILD_TYPES; c++) { 525330238Savg if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) 526330238Savg continue; 527330238Savg 528330238Savg uint64_t *countp = &zio->io_children[c][wait]; 529330238Savg if (*countp != 0) { 530330238Savg zio->io_stage >>= 1; 531330238Savg ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); 532330238Savg zio->io_stall = countp; 533330238Savg waiting = B_TRUE; 534330238Savg break; 535330238Savg } 536168404Spjd } 537185029Spjd mutex_exit(&zio->io_lock); 538185029Spjd return (waiting); 539168404Spjd} 540168404Spjd 541185029Spjdstatic void 542185029Spjdzio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 543185029Spjd{ 544185029Spjd uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 545185029Spjd int *errorp = &pio->io_child_error[zio->io_child_type]; 546185029Spjd 547185029Spjd mutex_enter(&pio->io_lock); 548185029Spjd if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 549185029Spjd *errorp = zio_worst_error(*errorp, zio->io_error); 550185029Spjd pio->io_reexecute |= zio->io_reexecute; 551185029Spjd ASSERT3U(*countp, >, 0); 552260763Savg 553260763Savg (*countp)--; 554260763Savg 555260763Savg if (*countp == 0 && pio->io_stall == countp) { 556307279Smav zio_taskq_type_t type = 557307279Smav pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : 558307279Smav ZIO_TASKQ_INTERRUPT; 559185029Spjd pio->io_stall = NULL; 560185029Spjd mutex_exit(&pio->io_lock); 561307279Smav /* 562307279Smav * Dispatch the parent zio in its own taskq so that 563307279Smav * the child can continue to make progress. This also 564307279Smav * prevents overflowing the stack when we have deeply nested 565307279Smav * parent-child relationships. 566307279Smav */ 567307279Smav zio_taskq_dispatch(pio, type, B_FALSE); 568185029Spjd } else { 569185029Spjd mutex_exit(&pio->io_lock); 570185029Spjd } 571185029Spjd} 572185029Spjd 573185029Spjdstatic void 574185029Spjdzio_inherit_child_errors(zio_t *zio, enum zio_child c) 575185029Spjd{ 576185029Spjd if (zio->io_child_error[c] != 0 && zio->io_error == 0) 577185029Spjd zio->io_error = zio->io_child_error[c]; 578185029Spjd} 579185029Spjd 580307279Smavint 581307279Smavzio_timestamp_compare(const void *x1, const void *x2) 582307279Smav{ 583307279Smav const zio_t *z1 = x1; 584307279Smav const zio_t *z2 = x2; 585307279Smav 586307279Smav if (z1->io_queued_timestamp < z2->io_queued_timestamp) 587307279Smav return (-1); 588307279Smav if (z1->io_queued_timestamp > z2->io_queued_timestamp) 589307279Smav return (1); 590307279Smav 591310106Smav if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) 592307279Smav return (-1); 593310106Smav if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) 594307279Smav return (1); 595307279Smav 596310106Smav if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) 597310106Smav return (-1); 598310106Smav if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) 599310106Smav return (1); 600310106Smav 601310106Smav if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) 602310106Smav return (-1); 603310106Smav if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) 604310106Smav return (1); 605310106Smav 606310106Smav if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) 607310106Smav return (-1); 608310106Smav if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) 609310106Smav return (1); 610310106Smav 611307279Smav if (z1 < z2) 612307279Smav return (-1); 613307279Smav if (z1 > z2) 614307279Smav return (1); 615307279Smav 616307279Smav return (0); 617307279Smav} 618307279Smav 619168404Spjd/* 620168404Spjd * ========================================================================== 621185029Spjd * Create the various types of I/O (read, write, free, etc) 622168404Spjd * ========================================================================== 623168404Spjd */ 624168404Spjdstatic zio_t * 625219089Spjdzio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 626168404Spjd void *data, uint64_t size, zio_done_func_t *done, void *private, 627260763Savg zio_type_t type, zio_priority_t priority, enum zio_flag flags, 628268657Sdelphij vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, 629219089Spjd enum zio_stage stage, enum zio_stage pipeline) 630168404Spjd{ 631168404Spjd zio_t *zio; 632168404Spjd 633240868Spjd ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE); 634168404Spjd ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 635185029Spjd ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 636168404Spjd 637185029Spjd ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 638185029Spjd ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 639185029Spjd ASSERT(vd || stage == ZIO_STAGE_OPEN); 640185029Spjd 641168926Spjd zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 642168926Spjd bzero(zio, sizeof (zio_t)); 643185029Spjd 644185029Spjd mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 645185029Spjd cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 646185029Spjd 647209962Smm list_create(&zio->io_parent_list, sizeof (zio_link_t), 648209962Smm offsetof(zio_link_t, zl_parent_node)); 649209962Smm list_create(&zio->io_child_list, sizeof (zio_link_t), 650209962Smm offsetof(zio_link_t, zl_child_node)); 651209962Smm 652185029Spjd if (vd != NULL) 653185029Spjd zio->io_child_type = ZIO_CHILD_VDEV; 654185029Spjd else if (flags & ZIO_FLAG_GANG_CHILD) 655185029Spjd zio->io_child_type = ZIO_CHILD_GANG; 656219089Spjd else if (flags & ZIO_FLAG_DDT_CHILD) 657219089Spjd zio->io_child_type = ZIO_CHILD_DDT; 658185029Spjd else 659185029Spjd zio->io_child_type = ZIO_CHILD_LOGICAL; 660185029Spjd 661168404Spjd if (bp != NULL) { 662219089Spjd zio->io_bp = (blkptr_t *)bp; 663168404Spjd zio->io_bp_copy = *bp; 664168404Spjd zio->io_bp_orig = *bp; 665219089Spjd if (type != ZIO_TYPE_WRITE || 666219089Spjd zio->io_child_type == ZIO_CHILD_DDT) 667185029Spjd zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 668209962Smm if (zio->io_child_type == ZIO_CHILD_LOGICAL) 669185029Spjd zio->io_logical = zio; 670209962Smm if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 671209962Smm pipeline |= ZIO_GANG_STAGES; 672168404Spjd } 673185029Spjd 674185029Spjd zio->io_spa = spa; 675185029Spjd zio->io_txg = txg; 676168404Spjd zio->io_done = done; 677168404Spjd zio->io_private = private; 678168404Spjd zio->io_type = type; 679168404Spjd zio->io_priority = priority; 680185029Spjd zio->io_vd = vd; 681185029Spjd zio->io_offset = offset; 682219089Spjd zio->io_orig_data = zio->io_data = data; 683219089Spjd zio->io_orig_size = zio->io_size = size; 684185029Spjd zio->io_orig_flags = zio->io_flags = flags; 685185029Spjd zio->io_orig_stage = zio->io_stage = stage; 686185029Spjd zio->io_orig_pipeline = zio->io_pipeline = pipeline; 687307279Smav zio->io_pipeline_trace = ZIO_STAGE_OPEN; 688168404Spjd 689209962Smm zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 690209962Smm zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 691209962Smm 692185029Spjd if (zb != NULL) 693185029Spjd zio->io_bookmark = *zb; 694185029Spjd 695185029Spjd if (pio != NULL) { 696185029Spjd if (zio->io_logical == NULL) 697168404Spjd zio->io_logical = pio->io_logical; 698209962Smm if (zio->io_child_type == ZIO_CHILD_GANG) 699209962Smm zio->io_gang_leader = pio->io_gang_leader; 700185029Spjd zio_add_child(pio, zio); 701168404Spjd } 702168404Spjd 703168404Spjd return (zio); 704168404Spjd} 705168404Spjd 706185029Spjdstatic void 707185029Spjdzio_destroy(zio_t *zio) 708185029Spjd{ 709209962Smm list_destroy(&zio->io_parent_list); 710209962Smm list_destroy(&zio->io_child_list); 711185029Spjd mutex_destroy(&zio->io_lock); 712185029Spjd cv_destroy(&zio->io_cv); 713185029Spjd kmem_cache_free(zio_cache, zio); 714185029Spjd} 715185029Spjd 716168404Spjdzio_t * 717209962Smmzio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, 718219089Spjd void *private, enum zio_flag flags) 719168404Spjd{ 720168404Spjd zio_t *zio; 721168404Spjd 722168404Spjd zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 723209962Smm ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 724185029Spjd ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 725168404Spjd 726168404Spjd return (zio); 727168404Spjd} 728168404Spjd 729168404Spjdzio_t * 730219089Spjdzio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) 731168404Spjd{ 732209962Smm return (zio_null(NULL, spa, NULL, done, private, flags)); 733168404Spjd} 734168404Spjd 735277582Sdelphijvoid 736277582Sdelphijzfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) 737277582Sdelphij{ 738277582Sdelphij if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { 739277582Sdelphij zfs_panic_recover("blkptr at %p has invalid TYPE %llu", 740277582Sdelphij bp, (longlong_t)BP_GET_TYPE(bp)); 741277582Sdelphij } 742277582Sdelphij if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || 743277582Sdelphij BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { 744277582Sdelphij zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", 745277582Sdelphij bp, (longlong_t)BP_GET_CHECKSUM(bp)); 746277582Sdelphij } 747277582Sdelphij if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || 748277582Sdelphij BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { 749277582Sdelphij zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", 750277582Sdelphij bp, (longlong_t)BP_GET_COMPRESS(bp)); 751277582Sdelphij } 752277582Sdelphij if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { 753277582Sdelphij zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", 754277582Sdelphij bp, (longlong_t)BP_GET_LSIZE(bp)); 755277582Sdelphij } 756277582Sdelphij if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { 757277582Sdelphij zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", 758277582Sdelphij bp, (longlong_t)BP_GET_PSIZE(bp)); 759277582Sdelphij } 760277582Sdelphij 761277582Sdelphij if (BP_IS_EMBEDDED(bp)) { 762277582Sdelphij if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { 763277582Sdelphij zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", 764277582Sdelphij bp, (longlong_t)BPE_GET_ETYPE(bp)); 765277582Sdelphij } 766277582Sdelphij } 767277582Sdelphij 768277582Sdelphij /* 769277582Sdelphij * Pool-specific checks. 770277582Sdelphij * 771277582Sdelphij * Note: it would be nice to verify that the blk_birth and 772277582Sdelphij * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() 773277582Sdelphij * allows the birth time of log blocks (and dmu_sync()-ed blocks 774277582Sdelphij * that are in the log) to be arbitrarily large. 775277582Sdelphij */ 776277582Sdelphij for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 777277582Sdelphij uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); 778277582Sdelphij if (vdevid >= spa->spa_root_vdev->vdev_children) { 779277582Sdelphij zfs_panic_recover("blkptr at %p DVA %u has invalid " 780277582Sdelphij "VDEV %llu", 781277582Sdelphij bp, i, (longlong_t)vdevid); 782277618Sdelphij continue; 783277582Sdelphij } 784277582Sdelphij vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; 785277582Sdelphij if (vd == NULL) { 786277582Sdelphij zfs_panic_recover("blkptr at %p DVA %u has invalid " 787277582Sdelphij "VDEV %llu", 788277582Sdelphij bp, i, (longlong_t)vdevid); 789277618Sdelphij continue; 790277582Sdelphij } 791277582Sdelphij if (vd->vdev_ops == &vdev_hole_ops) { 792277582Sdelphij zfs_panic_recover("blkptr at %p DVA %u has hole " 793277582Sdelphij "VDEV %llu", 794277582Sdelphij bp, i, (longlong_t)vdevid); 795277618Sdelphij continue; 796277582Sdelphij } 797277582Sdelphij if (vd->vdev_ops == &vdev_missing_ops) { 798277582Sdelphij /* 799277582Sdelphij * "missing" vdevs are valid during import, but we 800277582Sdelphij * don't have their detailed info (e.g. asize), so 801277582Sdelphij * we can't perform any more checks on them. 802277582Sdelphij */ 803277582Sdelphij continue; 804277582Sdelphij } 805277582Sdelphij uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 806277582Sdelphij uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); 807277582Sdelphij if (BP_IS_GANG(bp)) 808277582Sdelphij asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 809277582Sdelphij if (offset + asize > vd->vdev_asize) { 810277582Sdelphij zfs_panic_recover("blkptr at %p DVA %u has invalid " 811277582Sdelphij "OFFSET %llu", 812277582Sdelphij bp, i, (longlong_t)offset); 813277582Sdelphij } 814277582Sdelphij } 815277582Sdelphij} 816277582Sdelphij 817168404Spjdzio_t * 818185029Spjdzio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 819185029Spjd void *data, uint64_t size, zio_done_func_t *done, void *private, 820268657Sdelphij zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) 821168404Spjd{ 822168404Spjd zio_t *zio; 823168404Spjd 824277582Sdelphij zfs_blkptr_verify(spa, bp); 825277582Sdelphij 826219089Spjd zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 827185029Spjd data, size, done, private, 828185029Spjd ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 829219089Spjd ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 830219089Spjd ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 831168404Spjd 832168404Spjd return (zio); 833168404Spjd} 834168404Spjd 835168404Spjdzio_t * 836185029Spjdzio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 837219089Spjd void *data, uint64_t size, const zio_prop_t *zp, 838304139Savg zio_done_func_t *ready, zio_done_func_t *children_ready, 839304139Savg zio_done_func_t *physdone, zio_done_func_t *done, 840304139Savg void *private, zio_priority_t priority, enum zio_flag flags, 841304139Savg const zbookmark_phys_t *zb) 842168404Spjd{ 843168404Spjd zio_t *zio; 844168404Spjd 845185029Spjd ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 846185029Spjd zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 847185029Spjd zp->zp_compress >= ZIO_COMPRESS_OFF && 848185029Spjd zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 849236884Smm DMU_OT_IS_VALID(zp->zp_type) && 850185029Spjd zp->zp_level < 32 && 851219089Spjd zp->zp_copies > 0 && 852243524Smm zp->zp_copies <= spa_max_replication(spa)); 853168404Spjd 854168404Spjd zio = zio_create(pio, spa, txg, bp, data, size, done, private, 855185029Spjd ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 856219089Spjd ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 857219089Spjd ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 858168404Spjd 859168404Spjd zio->io_ready = ready; 860304139Savg zio->io_children_ready = children_ready; 861260763Savg zio->io_physdone = physdone; 862185029Spjd zio->io_prop = *zp; 863168404Spjd 864268649Sdelphij /* 865268649Sdelphij * Data can be NULL if we are going to call zio_write_override() to 866268649Sdelphij * provide the already-allocated BP. But we may need the data to 867268649Sdelphij * verify a dedup hit (if requested). In this case, don't try to 868268649Sdelphij * dedup (just take the already-allocated BP verbatim). 869268649Sdelphij */ 870268649Sdelphij if (data == NULL && zio->io_prop.zp_dedup_verify) { 871268649Sdelphij zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; 872268649Sdelphij } 873268649Sdelphij 874168404Spjd return (zio); 875168404Spjd} 876168404Spjd 877168404Spjdzio_t * 878185029Spjdzio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, 879260763Savg uint64_t size, zio_done_func_t *done, void *private, 880268657Sdelphij zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) 881168404Spjd{ 882168404Spjd zio_t *zio; 883168404Spjd 884168404Spjd zio = zio_create(pio, spa, txg, bp, data, size, done, private, 885307279Smav ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, 886168404Spjd ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 887168404Spjd 888168404Spjd return (zio); 889168404Spjd} 890168404Spjd 891219089Spjdvoid 892243524Smmzio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) 893219089Spjd{ 894219089Spjd ASSERT(zio->io_type == ZIO_TYPE_WRITE); 895219089Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 896219089Spjd ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 897219089Spjd ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 898219089Spjd 899243524Smm /* 900243524Smm * We must reset the io_prop to match the values that existed 901243524Smm * when the bp was first written by dmu_sync() keeping in mind 902243524Smm * that nopwrite and dedup are mutually exclusive. 903243524Smm */ 904243524Smm zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; 905243524Smm zio->io_prop.zp_nopwrite = nopwrite; 906219089Spjd zio->io_prop.zp_copies = copies; 907219089Spjd zio->io_bp_override = bp; 908219089Spjd} 909219089Spjd 910219089Spjdvoid 911219089Spjdzio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) 912219089Spjd{ 913268649Sdelphij 914268649Sdelphij /* 915268649Sdelphij * The check for EMBEDDED is a performance optimization. We 916268649Sdelphij * process the free here (by ignoring it) rather than 917268649Sdelphij * putting it on the list and then processing it in zio_free_sync(). 918268649Sdelphij */ 919268649Sdelphij if (BP_IS_EMBEDDED(bp)) 920268649Sdelphij return; 921248571Smm metaslab_check_free(spa, bp); 922252840Smm 923252840Smm /* 924252840Smm * Frees that are for the currently-syncing txg, are not going to be 925252840Smm * deferred, and which will not need to do a read (i.e. not GANG or 926252840Smm * DEDUP), can be processed immediately. Otherwise, put them on the 927252840Smm * in-memory list for later processing. 928252840Smm */ 929253992Smav if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || 930252840Smm txg != spa->spa_syncing_txg || 931252840Smm spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { 932252840Smm bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); 933252840Smm } else { 934252840Smm VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 935252840Smm BP_GET_PSIZE(bp), 0))); 936252840Smm } 937219089Spjd} 938219089Spjd 939168404Spjdzio_t * 940219089Spjdzio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 941240868Spjd uint64_t size, enum zio_flag flags) 942168404Spjd{ 943168404Spjd zio_t *zio; 944252840Smm enum zio_stage stage = ZIO_FREE_PIPELINE; 945168404Spjd 946168404Spjd ASSERT(!BP_IS_HOLE(bp)); 947219089Spjd ASSERT(spa_syncing_txg(spa) == txg); 948243503Smm ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); 949168404Spjd 950268649Sdelphij if (BP_IS_EMBEDDED(bp)) 951268649Sdelphij return (zio_null(pio, spa, NULL, NULL, NULL, 0)); 952268649Sdelphij 953248571Smm metaslab_check_free(spa, bp); 954251520Sdelphij arc_freed(spa, bp); 955248571Smm 956253992Smav if (zfs_trim_enabled) 957253992Smav stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | 958253992Smav ZIO_STAGE_VDEV_IO_ASSESS; 959252840Smm /* 960252840Smm * GANG and DEDUP blocks can induce a read (for the gang block header, 961252840Smm * or the DDT), so issue them asynchronously so that this thread is 962252840Smm * not tied up. 963252840Smm */ 964253992Smav else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) 965252840Smm stage |= ZIO_STAGE_ISSUE_ASYNC; 966252840Smm 967270312Ssmh flags |= ZIO_FLAG_DONT_QUEUE; 968270312Ssmh 969240868Spjd zio = zio_create(pio, spa, txg, bp, NULL, size, 970260763Savg NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, 971252840Smm NULL, 0, NULL, ZIO_STAGE_OPEN, stage); 972168404Spjd 973168404Spjd return (zio); 974168404Spjd} 975168404Spjd 976168404Spjdzio_t * 977219089Spjdzio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 978219089Spjd zio_done_func_t *done, void *private, enum zio_flag flags) 979168404Spjd{ 980168404Spjd zio_t *zio; 981168404Spjd 982268649Sdelphij dprintf_bp(bp, "claiming in txg %llu", txg); 983268649Sdelphij 984268649Sdelphij if (BP_IS_EMBEDDED(bp)) 985268649Sdelphij return (zio_null(pio, spa, NULL, NULL, NULL, 0)); 986268649Sdelphij 987168404Spjd /* 988168404Spjd * A claim is an allocation of a specific block. Claims are needed 989168404Spjd * to support immediate writes in the intent log. The issue is that 990168404Spjd * immediate writes contain committed data, but in a txg that was 991168404Spjd * *not* committed. Upon opening the pool after an unclean shutdown, 992168404Spjd * the intent log claims all blocks that contain immediate write data 993168404Spjd * so that the SPA knows they're in use. 994168404Spjd * 995168404Spjd * All claims *must* be resolved in the first txg -- before the SPA 996168404Spjd * starts allocating blocks -- so that nothing is allocated twice. 997219089Spjd * If txg == 0 we just verify that the block is claimable. 998168404Spjd */ 999168404Spjd ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 1000219089Spjd ASSERT(txg == spa_first_txg(spa) || txg == 0); 1001219089Spjd ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ 1002168404Spjd 1003185029Spjd zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 1004185029Spjd done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 1005185029Spjd NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 1006307279Smav ASSERT0(zio->io_queued_timestamp); 1007168404Spjd 1008168404Spjd return (zio); 1009168404Spjd} 1010168404Spjd 1011168404Spjdzio_t * 1012240868Spjdzio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, 1013260763Savg uint64_t size, zio_done_func_t *done, void *private, 1014270312Ssmh zio_priority_t priority, enum zio_flag flags) 1015168404Spjd{ 1016168404Spjd zio_t *zio; 1017168404Spjd int c; 1018168404Spjd 1019168404Spjd if (vd->vdev_children == 0) { 1020240868Spjd zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private, 1021270312Ssmh ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL, 1022168404Spjd ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 1023168404Spjd 1024168404Spjd zio->io_cmd = cmd; 1025168404Spjd } else { 1026209962Smm zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 1027168404Spjd 1028168404Spjd for (c = 0; c < vd->vdev_children; c++) 1029168404Spjd zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 1030270312Ssmh offset, size, done, private, priority, flags)); 1031168404Spjd } 1032168404Spjd 1033168404Spjd return (zio); 1034168404Spjd} 1035168404Spjd 1036168404Spjdzio_t * 1037168404Spjdzio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 1038168404Spjd void *data, int checksum, zio_done_func_t *done, void *private, 1039260763Savg zio_priority_t priority, enum zio_flag flags, boolean_t labels) 1040168404Spjd{ 1041168404Spjd zio_t *zio; 1042168404Spjd 1043185029Spjd ASSERT(vd->vdev_children == 0); 1044185029Spjd ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 1045185029Spjd offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 1046185029Spjd ASSERT3U(offset + size, <=, vd->vdev_psize); 1047168404Spjd 1048185029Spjd zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 1049269416Sdelphij ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, 1050269416Sdelphij NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 1051168404Spjd 1052185029Spjd zio->io_prop.zp_checksum = checksum; 1053168404Spjd 1054168404Spjd return (zio); 1055168404Spjd} 1056168404Spjd 1057168404Spjdzio_t * 1058168404Spjdzio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 1059168404Spjd void *data, int checksum, zio_done_func_t *done, void *private, 1060260763Savg zio_priority_t priority, enum zio_flag flags, boolean_t labels) 1061168404Spjd{ 1062168404Spjd zio_t *zio; 1063168404Spjd 1064185029Spjd ASSERT(vd->vdev_children == 0); 1065185029Spjd ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 1066185029Spjd offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 1067185029Spjd ASSERT3U(offset + size, <=, vd->vdev_psize); 1068168404Spjd 1069185029Spjd zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 1070269416Sdelphij ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, 1071269416Sdelphij NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 1072168404Spjd 1073185029Spjd zio->io_prop.zp_checksum = checksum; 1074168404Spjd 1075290757Smav if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { 1076168404Spjd /* 1077219089Spjd * zec checksums are necessarily destructive -- they modify 1078185029Spjd * the end of the write buffer to hold the verifier/checksum. 1079168404Spjd * Therefore, we must make a local copy in case the data is 1080185029Spjd * being written to multiple places in parallel. 1081168404Spjd */ 1082185029Spjd void *wbuf = zio_buf_alloc(size); 1083168404Spjd bcopy(data, wbuf, size); 1084185029Spjd zio_push_transform(zio, wbuf, size, size, NULL); 1085168404Spjd } 1086168404Spjd 1087168404Spjd return (zio); 1088168404Spjd} 1089168404Spjd 1090168404Spjd/* 1091185029Spjd * Create a child I/O to do some work for us. 1092168404Spjd */ 1093168404Spjdzio_t * 1094185029Spjdzio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 1095307266Smav void *data, uint64_t size, int type, zio_priority_t priority, 1096307266Smav enum zio_flag flags, zio_done_func_t *done, void *private) 1097168404Spjd{ 1098219089Spjd enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 1099185029Spjd zio_t *zio; 1100168404Spjd 1101185029Spjd ASSERT(vd->vdev_parent == 1102185029Spjd (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); 1103185029Spjd 1104168404Spjd if (type == ZIO_TYPE_READ && bp != NULL) { 1105168404Spjd /* 1106168404Spjd * If we have the bp, then the child should perform the 1107168404Spjd * checksum and the parent need not. This pushes error 1108168404Spjd * detection as close to the leaves as possible and 1109168404Spjd * eliminates redundant checksums in the interior nodes. 1110168404Spjd */ 1111219089Spjd pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 1112219089Spjd pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 1113168404Spjd } 1114168404Spjd 1115270312Ssmh /* Not all IO types require vdev io done stage e.g. free */ 1116270312Ssmh if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE)) 1117270312Ssmh pipeline &= ~ZIO_STAGE_VDEV_IO_DONE; 1118270312Ssmh 1119185029Spjd if (vd->vdev_children == 0) 1120185029Spjd offset += VDEV_LABEL_START_SIZE; 1121185029Spjd 1122219089Spjd flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; 1123219089Spjd 1124219089Spjd /* 1125219089Spjd * If we've decided to do a repair, the write is not speculative -- 1126219089Spjd * even if the original read was. 1127219089Spjd */ 1128219089Spjd if (flags & ZIO_FLAG_IO_REPAIR) 1129219089Spjd flags &= ~ZIO_FLAG_SPECULATIVE; 1130219089Spjd 1131307279Smav /* 1132307279Smav * If we're creating a child I/O that is not associated with a 1133307279Smav * top-level vdev, then the child zio is not an allocating I/O. 1134307279Smav * If this is a retried I/O then we ignore it since we will 1135307279Smav * have already processed the original allocating I/O. 1136307279Smav */ 1137307279Smav if (flags & ZIO_FLAG_IO_ALLOCATING && 1138307279Smav (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { 1139307279Smav metaslab_class_t *mc = spa_normal_class(pio->io_spa); 1140307279Smav 1141307279Smav ASSERT(mc->mc_alloc_throttle_enabled); 1142307279Smav ASSERT(type == ZIO_TYPE_WRITE); 1143307279Smav ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); 1144307279Smav ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); 1145307279Smav ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || 1146307279Smav pio->io_child_type == ZIO_CHILD_GANG); 1147307279Smav 1148307279Smav flags &= ~ZIO_FLAG_IO_ALLOCATING; 1149307279Smav } 1150307279Smav 1151185029Spjd zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, 1152219089Spjd done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 1153219089Spjd ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 1154307279Smav ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); 1155168404Spjd 1156260763Savg zio->io_physdone = pio->io_physdone; 1157260763Savg if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) 1158260763Savg zio->io_logical->io_phys_children++; 1159260763Savg 1160185029Spjd return (zio); 1161168404Spjd} 1162168404Spjd 1163185029Spjdzio_t * 1164185029Spjdzio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, 1165290765Smav int type, zio_priority_t priority, enum zio_flag flags, 1166290765Smav zio_done_func_t *done, void *private) 1167168404Spjd{ 1168185029Spjd zio_t *zio; 1169168404Spjd 1170185029Spjd ASSERT(vd->vdev_ops->vdev_op_leaf); 1171168404Spjd 1172185029Spjd zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 1173185029Spjd data, size, done, private, type, priority, 1174260763Savg flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, 1175185029Spjd vd, offset, NULL, 1176219089Spjd ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 1177168404Spjd 1178185029Spjd return (zio); 1179168404Spjd} 1180168404Spjd 1181168404Spjdvoid 1182185029Spjdzio_flush(zio_t *zio, vdev_t *vd) 1183168404Spjd{ 1184240868Spjd zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, 1185270312Ssmh NULL, NULL, ZIO_PRIORITY_NOW, 1186185029Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 1187168404Spjd} 1188168404Spjd 1189240868Spjdzio_t * 1190240868Spjdzio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) 1191240868Spjd{ 1192240868Spjd 1193240868Spjd ASSERT(vd->vdev_ops->vdev_op_leaf); 1194240868Spjd 1195270312Ssmh return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL, 1196270312Ssmh ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE | 1197270312Ssmh ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, 1198270312Ssmh vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE)); 1199240868Spjd} 1200240868Spjd 1201219089Spjdvoid 1202219089Spjdzio_shrink(zio_t *zio, uint64_t size) 1203219089Spjd{ 1204219089Spjd ASSERT(zio->io_executor == NULL); 1205219089Spjd ASSERT(zio->io_orig_size == zio->io_size); 1206219089Spjd ASSERT(size <= zio->io_size); 1207219089Spjd 1208219089Spjd /* 1209219089Spjd * We don't shrink for raidz because of problems with the 1210219089Spjd * reconstruction when reading back less than the block size. 1211219089Spjd * Note, BP_IS_RAIDZ() assumes no compression. 1212219089Spjd */ 1213219089Spjd ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 1214219089Spjd if (!BP_IS_RAIDZ(zio->io_bp)) 1215219089Spjd zio->io_orig_size = zio->io_size = size; 1216219089Spjd} 1217219089Spjd 1218168404Spjd/* 1219168404Spjd * ========================================================================== 1220185029Spjd * Prepare to read and write logical blocks 1221168404Spjd * ========================================================================== 1222168404Spjd */ 1223185029Spjd 1224185029Spjdstatic int 1225270312Ssmhzio_read_bp_init(zio_t *zio) 1226168404Spjd{ 1227185029Spjd blkptr_t *bp = zio->io_bp; 1228185029Spjd 1229209962Smm if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 1230209962Smm zio->io_child_type == ZIO_CHILD_LOGICAL && 1231209962Smm !(zio->io_flags & ZIO_FLAG_RAW)) { 1232268649Sdelphij uint64_t psize = 1233268649Sdelphij BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); 1234219089Spjd void *cbuf = zio_buf_alloc(psize); 1235185029Spjd 1236219089Spjd zio_push_transform(zio, cbuf, psize, psize, zio_decompress); 1237168404Spjd } 1238185029Spjd 1239268649Sdelphij if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { 1240268649Sdelphij zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1241268649Sdelphij decode_embedded_bp_compressed(bp, zio->io_data); 1242268649Sdelphij } else { 1243268649Sdelphij ASSERT(!BP_IS_EMBEDDED(bp)); 1244268649Sdelphij } 1245268649Sdelphij 1246236884Smm if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) 1247185029Spjd zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1248185029Spjd 1249219089Spjd if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 1250219089Spjd zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1251219089Spjd 1252219089Spjd if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 1253219089Spjd zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 1254219089Spjd 1255185029Spjd return (ZIO_PIPELINE_CONTINUE); 1256168404Spjd} 1257168404Spjd 1258185029Spjdstatic int 1259270312Ssmhzio_write_bp_init(zio_t *zio) 1260168404Spjd{ 1261185029Spjd if (!IO_IS_ALLOCATING(zio)) 1262185029Spjd return (ZIO_PIPELINE_CONTINUE); 1263185029Spjd 1264219089Spjd ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 1265185029Spjd 1266219089Spjd if (zio->io_bp_override) { 1267307279Smav blkptr_t *bp = zio->io_bp; 1268307279Smav zio_prop_t *zp = &zio->io_prop; 1269307279Smav 1270219089Spjd ASSERT(bp->blk_birth != zio->io_txg); 1271219089Spjd ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 1272219089Spjd 1273219089Spjd *bp = *zio->io_bp_override; 1274219089Spjd zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1275219089Spjd 1276268649Sdelphij if (BP_IS_EMBEDDED(bp)) 1277268649Sdelphij return (ZIO_PIPELINE_CONTINUE); 1278268649Sdelphij 1279243524Smm /* 1280243524Smm * If we've been overridden and nopwrite is set then 1281243524Smm * set the flag accordingly to indicate that a nopwrite 1282243524Smm * has already occurred. 1283243524Smm */ 1284243524Smm if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { 1285243524Smm ASSERT(!zp->zp_dedup); 1286307279Smav ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); 1287243524Smm zio->io_flags |= ZIO_FLAG_NOPWRITE; 1288243524Smm return (ZIO_PIPELINE_CONTINUE); 1289243524Smm } 1290243524Smm 1291243524Smm ASSERT(!zp->zp_nopwrite); 1292243524Smm 1293219089Spjd if (BP_IS_HOLE(bp) || !zp->zp_dedup) 1294219089Spjd return (ZIO_PIPELINE_CONTINUE); 1295219089Spjd 1296290757Smav ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & 1297290757Smav ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); 1298219089Spjd 1299219089Spjd if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 1300219089Spjd BP_SET_DEDUP(bp, 1); 1301219089Spjd zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; 1302219089Spjd return (ZIO_PIPELINE_CONTINUE); 1303219089Spjd } 1304307279Smav 1305307279Smav /* 1306307279Smav * We were unable to handle this as an override bp, treat 1307307279Smav * it as a regular write I/O. 1308307279Smav */ 1309297093Smav zio->io_bp_override = NULL; 1310307279Smav *bp = zio->io_bp_orig; 1311307279Smav zio->io_pipeline = zio->io_orig_pipeline; 1312219089Spjd } 1313219089Spjd 1314307279Smav return (ZIO_PIPELINE_CONTINUE); 1315307279Smav} 1316307279Smav 1317307279Smavstatic int 1318307279Smavzio_write_compress(zio_t *zio) 1319307279Smav{ 1320307279Smav spa_t *spa = zio->io_spa; 1321307279Smav zio_prop_t *zp = &zio->io_prop; 1322307279Smav enum zio_compress compress = zp->zp_compress; 1323307279Smav blkptr_t *bp = zio->io_bp; 1324307279Smav uint64_t lsize = zio->io_size; 1325307279Smav uint64_t psize = lsize; 1326307279Smav int pass = 1; 1327307279Smav 1328307279Smav /* 1329307279Smav * If our children haven't all reached the ready stage, 1330307279Smav * wait for them and then repeat this pipeline stage. 1331307279Smav */ 1332330238Savg if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | 1333330238Savg ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { 1334307279Smav return (ZIO_PIPELINE_STOP); 1335330238Savg } 1336307279Smav 1337307279Smav if (!IO_IS_ALLOCATING(zio)) 1338307279Smav return (ZIO_PIPELINE_CONTINUE); 1339307279Smav 1340307279Smav if (zio->io_children_ready != NULL) { 1341307279Smav /* 1342307279Smav * Now that all our children are ready, run the callback 1343307279Smav * associated with this zio in case it wants to modify the 1344307279Smav * data to be written. 1345307279Smav */ 1346307279Smav ASSERT3U(zp->zp_level, >, 0); 1347307279Smav zio->io_children_ready(zio); 1348307279Smav } 1349307279Smav 1350307279Smav ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 1351307279Smav ASSERT(zio->io_bp_override == NULL); 1352307279Smav 1353263397Sdelphij if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { 1354185029Spjd /* 1355185029Spjd * We're rewriting an existing block, which means we're 1356185029Spjd * working on behalf of spa_sync(). For spa_sync() to 1357185029Spjd * converge, it must eventually be the case that we don't 1358185029Spjd * have to allocate new blocks. But compression changes 1359185029Spjd * the blocksize, which forces a reallocate, and makes 1360185029Spjd * convergence take longer. Therefore, after the first 1361185029Spjd * few passes, stop compressing to ensure convergence. 1362185029Spjd */ 1363219089Spjd pass = spa_sync_pass(spa); 1364185029Spjd 1365219089Spjd ASSERT(zio->io_txg == spa_syncing_txg(spa)); 1366219089Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1367219089Spjd ASSERT(!BP_GET_DEDUP(bp)); 1368219089Spjd 1369243503Smm if (pass >= zfs_sync_pass_dont_compress) 1370185029Spjd compress = ZIO_COMPRESS_OFF; 1371185029Spjd 1372185029Spjd /* Make sure someone doesn't change their mind on overwrites */ 1373268649Sdelphij ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), 1374219089Spjd spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 1375185029Spjd } 1376185029Spjd 1377185029Spjd if (compress != ZIO_COMPRESS_OFF) { 1378219089Spjd void *cbuf = zio_buf_alloc(lsize); 1379269732Sdelphij psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); 1380219089Spjd if (psize == 0 || psize == lsize) { 1381185029Spjd compress = ZIO_COMPRESS_OFF; 1382219089Spjd zio_buf_free(cbuf, lsize); 1383268649Sdelphij } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && 1384268649Sdelphij zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && 1385268649Sdelphij spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { 1386268649Sdelphij encode_embedded_bp_compressed(bp, 1387268649Sdelphij cbuf, compress, lsize, psize); 1388268649Sdelphij BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); 1389268649Sdelphij BP_SET_TYPE(bp, zio->io_prop.zp_type); 1390268649Sdelphij BP_SET_LEVEL(bp, zio->io_prop.zp_level); 1391268649Sdelphij zio_buf_free(cbuf, lsize); 1392268649Sdelphij bp->blk_birth = zio->io_txg; 1393268649Sdelphij zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1394268649Sdelphij ASSERT(spa_feature_is_active(spa, 1395268649Sdelphij SPA_FEATURE_EMBEDDED_DATA)); 1396268649Sdelphij return (ZIO_PIPELINE_CONTINUE); 1397219089Spjd } else { 1398268649Sdelphij /* 1399285001Savg * Round up compressed size up to the ashift 1400285001Savg * of the smallest-ashift device, and zero the tail. 1401285001Savg * This ensures that the compressed size of the BP 1402285001Savg * (and thus compressratio property) are correct, 1403285001Savg * in that we charge for the padding used to fill out 1404285001Savg * the last sector. 1405268649Sdelphij */ 1406285001Savg ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 1407285001Savg size_t rounded = (size_t)P2ROUNDUP(psize, 1408285001Savg 1ULL << spa->spa_min_ashift); 1409285001Savg if (rounded >= lsize) { 1410268649Sdelphij compress = ZIO_COMPRESS_OFF; 1411268649Sdelphij zio_buf_free(cbuf, lsize); 1412285001Savg psize = lsize; 1413268649Sdelphij } else { 1414285001Savg bzero((char *)cbuf + psize, rounded - psize); 1415285001Savg psize = rounded; 1416268649Sdelphij zio_push_transform(zio, cbuf, 1417268649Sdelphij psize, lsize, NULL); 1418268649Sdelphij } 1419185029Spjd } 1420307279Smav 1421307279Smav /* 1422307279Smav * We were unable to handle this as an override bp, treat 1423307279Smav * it as a regular write I/O. 1424307279Smav */ 1425307279Smav zio->io_bp_override = NULL; 1426307279Smav *bp = zio->io_bp_orig; 1427307279Smav zio->io_pipeline = zio->io_orig_pipeline; 1428185029Spjd } 1429185029Spjd 1430185029Spjd /* 1431185029Spjd * The final pass of spa_sync() must be all rewrites, but the first 1432185029Spjd * few passes offer a trade-off: allocating blocks defers convergence, 1433185029Spjd * but newly allocated blocks are sequential, so they can be written 1434185029Spjd * to disk faster. Therefore, we allow the first few passes of 1435185029Spjd * spa_sync() to allocate new blocks, but force rewrites after that. 1436185029Spjd * There should only be a handful of blocks after pass 1 in any case. 1437185029Spjd */ 1438263397Sdelphij if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && 1439263397Sdelphij BP_GET_PSIZE(bp) == psize && 1440243503Smm pass >= zfs_sync_pass_rewrite) { 1441219089Spjd ASSERT(psize != 0); 1442219089Spjd enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 1443185029Spjd zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; 1444185029Spjd zio->io_flags |= ZIO_FLAG_IO_REWRITE; 1445168404Spjd } else { 1446185029Spjd BP_ZERO(bp); 1447185029Spjd zio->io_pipeline = ZIO_WRITE_PIPELINE; 1448168404Spjd } 1449185029Spjd 1450219089Spjd if (psize == 0) { 1451263397Sdelphij if (zio->io_bp_orig.blk_birth != 0 && 1452263397Sdelphij spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { 1453263397Sdelphij BP_SET_LSIZE(bp, lsize); 1454263397Sdelphij BP_SET_TYPE(bp, zp->zp_type); 1455263397Sdelphij BP_SET_LEVEL(bp, zp->zp_level); 1456263397Sdelphij BP_SET_BIRTH(bp, zio->io_txg, 0); 1457263397Sdelphij } 1458185029Spjd zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1459185029Spjd } else { 1460185029Spjd ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 1461185029Spjd BP_SET_LSIZE(bp, lsize); 1462263397Sdelphij BP_SET_TYPE(bp, zp->zp_type); 1463263397Sdelphij BP_SET_LEVEL(bp, zp->zp_level); 1464219089Spjd BP_SET_PSIZE(bp, psize); 1465185029Spjd BP_SET_COMPRESS(bp, compress); 1466185029Spjd BP_SET_CHECKSUM(bp, zp->zp_checksum); 1467219089Spjd BP_SET_DEDUP(bp, zp->zp_dedup); 1468185029Spjd BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1469219089Spjd if (zp->zp_dedup) { 1470219089Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1471219089Spjd ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1472219089Spjd zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 1473219089Spjd } 1474243524Smm if (zp->zp_nopwrite) { 1475243524Smm ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1476243524Smm ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1477243524Smm zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; 1478243524Smm } 1479185029Spjd } 1480185029Spjd return (ZIO_PIPELINE_CONTINUE); 1481168404Spjd} 1482168404Spjd 1483219089Spjdstatic int 1484270312Ssmhzio_free_bp_init(zio_t *zio) 1485219089Spjd{ 1486219089Spjd blkptr_t *bp = zio->io_bp; 1487219089Spjd 1488219089Spjd if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 1489219089Spjd if (BP_GET_DEDUP(bp)) 1490219089Spjd zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 1491219089Spjd } 1492219089Spjd 1493219089Spjd return (ZIO_PIPELINE_CONTINUE); 1494219089Spjd} 1495219089Spjd 1496185029Spjd/* 1497185029Spjd * ========================================================================== 1498185029Spjd * Execute the I/O pipeline 1499185029Spjd * ========================================================================== 1500185029Spjd */ 1501185029Spjd 1502168404Spjdstatic void 1503260750Savgzio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) 1504168404Spjd{ 1505211931Smm spa_t *spa = zio->io_spa; 1506185029Spjd zio_type_t t = zio->io_type; 1507260742Savg int flags = (cutinline ? TQ_FRONT : 0); 1508168404Spjd 1509216919Smm ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT); 1510216919Smm 1511185029Spjd /* 1512209096Smm * If we're a config writer or a probe, the normal issue and 1513209096Smm * interrupt threads may all be blocked waiting for the config lock. 1514209096Smm * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 1515185029Spjd */ 1516209096Smm if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 1517185029Spjd t = ZIO_TYPE_NULL; 1518185029Spjd 1519185029Spjd /* 1520185029Spjd * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 1521185029Spjd */ 1522185029Spjd if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 1523185029Spjd t = ZIO_TYPE_NULL; 1524185029Spjd 1525211931Smm /* 1526260750Savg * If this is a high priority I/O, then use the high priority taskq if 1527260750Savg * available. 1528211931Smm */ 1529211931Smm if (zio->io_priority == ZIO_PRIORITY_NOW && 1530260750Savg spa->spa_zio_taskq[t][q + 1].stqs_count != 0) 1531211931Smm q++; 1532211931Smm 1533211931Smm ASSERT3U(q, <, ZIO_TASKQ_TYPES); 1534260742Savg 1535260742Savg /* 1536260742Savg * NB: We are assuming that the zio can only be dispatched 1537260742Savg * to a single taskq at a time. It would be a grievous error 1538260742Savg * to dispatch the zio to another taskq at the same time. 1539260742Savg */ 1540260742Savg#if defined(illumos) || !defined(_KERNEL) 1541260742Savg ASSERT(zio->io_tqent.tqent_next == NULL); 1542216919Smm#else 1543260742Savg ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 1544216919Smm#endif 1545260750Savg spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, 1546260750Savg flags, &zio->io_tqent); 1547168404Spjd} 1548168404Spjd 1549185029Spjdstatic boolean_t 1550260750Savgzio_taskq_member(zio_t *zio, zio_taskq_type_t q) 1551168404Spjd{ 1552185029Spjd kthread_t *executor = zio->io_executor; 1553185029Spjd spa_t *spa = zio->io_spa; 1554168404Spjd 1555260750Savg for (zio_type_t t = 0; t < ZIO_TYPES; t++) { 1556260750Savg spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1557260750Savg uint_t i; 1558260750Savg for (i = 0; i < tqs->stqs_count; i++) { 1559260750Savg if (taskq_member(tqs->stqs_taskq[i], executor)) 1560260750Savg return (B_TRUE); 1561260750Savg } 1562260750Savg } 1563168404Spjd 1564185029Spjd return (B_FALSE); 1565185029Spjd} 1566168404Spjd 1567185029Spjdstatic int 1568270312Ssmhzio_issue_async(zio_t *zio) 1569185029Spjd{ 1570219089Spjd zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1571168404Spjd 1572185029Spjd return (ZIO_PIPELINE_STOP); 1573168404Spjd} 1574168404Spjd 1575185029Spjdvoid 1576185029Spjdzio_interrupt(zio_t *zio) 1577168404Spjd{ 1578219089Spjd zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); 1579185029Spjd} 1580168404Spjd 1581297108Smavvoid 1582297108Smavzio_delay_interrupt(zio_t *zio) 1583297108Smav{ 1584297108Smav /* 1585297108Smav * The timeout_generic() function isn't defined in userspace, so 1586297108Smav * rather than trying to implement the function, the zio delay 1587297108Smav * functionality has been disabled for userspace builds. 1588297108Smav */ 1589297108Smav 1590297108Smav#ifdef _KERNEL 1591297108Smav /* 1592297108Smav * If io_target_timestamp is zero, then no delay has been registered 1593297108Smav * for this IO, thus jump to the end of this function and "skip" the 1594297108Smav * delay; issuing it directly to the zio layer. 1595297108Smav */ 1596297108Smav if (zio->io_target_timestamp != 0) { 1597297108Smav hrtime_t now = gethrtime(); 1598297108Smav 1599297108Smav if (now >= zio->io_target_timestamp) { 1600297108Smav /* 1601297108Smav * This IO has already taken longer than the target 1602297108Smav * delay to complete, so we don't want to delay it 1603297108Smav * any longer; we "miss" the delay and issue it 1604297108Smav * directly to the zio layer. This is likely due to 1605297108Smav * the target latency being set to a value less than 1606297108Smav * the underlying hardware can satisfy (e.g. delay 1607297108Smav * set to 1ms, but the disks take 10ms to complete an 1608297108Smav * IO request). 1609297108Smav */ 1610297108Smav 1611297108Smav DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, 1612297108Smav hrtime_t, now); 1613297108Smav 1614297108Smav zio_interrupt(zio); 1615297108Smav } else { 1616297108Smav hrtime_t diff = zio->io_target_timestamp - now; 1617297108Smav 1618297108Smav DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, 1619297108Smav hrtime_t, now, hrtime_t, diff); 1620297108Smav 1621297108Smav (void) timeout_generic(CALLOUT_NORMAL, 1622297108Smav (void (*)(void *))zio_interrupt, zio, diff, 1, 0); 1623297108Smav } 1624297108Smav 1625297108Smav return; 1626297108Smav } 1627297108Smav#endif 1628297108Smav 1629297108Smav DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); 1630297108Smav zio_interrupt(zio); 1631297108Smav} 1632297108Smav 1633185029Spjd/* 1634185029Spjd * Execute the I/O pipeline until one of the following occurs: 1635185029Spjd * 1636251631Sdelphij * (1) the I/O completes 1637251631Sdelphij * (2) the pipeline stalls waiting for dependent child I/Os 1638251631Sdelphij * (3) the I/O issues, so we're waiting for an I/O completion interrupt 1639251631Sdelphij * (4) the I/O is delegated by vdev-level caching or aggregation 1640251631Sdelphij * (5) the I/O is deferred due to vdev-level queueing 1641251631Sdelphij * (6) the I/O is handed off to another thread. 1642251631Sdelphij * 1643251631Sdelphij * In all cases, the pipeline stops whenever there's no CPU work; it never 1644251631Sdelphij * burns a thread in cv_wait(). 1645251631Sdelphij * 1646185029Spjd * There's no locking on io_stage because there's no legitimate way 1647185029Spjd * for multiple threads to be attempting to process the same I/O. 1648185029Spjd */ 1649219089Spjdstatic zio_pipe_stage_t *zio_pipeline[]; 1650168404Spjd 1651185029Spjdvoid 1652185029Spjdzio_execute(zio_t *zio) 1653185029Spjd{ 1654185029Spjd zio->io_executor = curthread; 1655168404Spjd 1656307279Smav ASSERT3U(zio->io_queued_timestamp, >, 0); 1657307279Smav 1658185029Spjd while (zio->io_stage < ZIO_STAGE_DONE) { 1659219089Spjd enum zio_stage pipeline = zio->io_pipeline; 1660219089Spjd enum zio_stage stage = zio->io_stage; 1661185029Spjd int rv; 1662168404Spjd 1663185029Spjd ASSERT(!MUTEX_HELD(&zio->io_lock)); 1664219089Spjd ASSERT(ISP2(stage)); 1665219089Spjd ASSERT(zio->io_stall == NULL); 1666168404Spjd 1667219089Spjd do { 1668219089Spjd stage <<= 1; 1669219089Spjd } while ((stage & pipeline) == 0); 1670168404Spjd 1671185029Spjd ASSERT(stage <= ZIO_STAGE_DONE); 1672168404Spjd 1673168404Spjd /* 1674185029Spjd * If we are in interrupt context and this pipeline stage 1675185029Spjd * will grab a config lock that is held across I/O, 1676219089Spjd * or may wait for an I/O that needs an interrupt thread 1677219089Spjd * to complete, issue async to avoid deadlock. 1678219089Spjd * 1679219089Spjd * For VDEV_IO_START, we cut in line so that the io will 1680219089Spjd * be sent to disk promptly. 1681168404Spjd */ 1682219089Spjd if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 1683185029Spjd zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 1684219089Spjd boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 1685219089Spjd zio_requeue_io_start_cut_in_line : B_FALSE; 1686219089Spjd zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 1687185029Spjd return; 1688185029Spjd } 1689168404Spjd 1690185029Spjd zio->io_stage = stage; 1691307279Smav zio->io_pipeline_trace |= zio->io_stage; 1692270312Ssmh rv = zio_pipeline[highbit64(stage) - 1](zio); 1693185029Spjd 1694185029Spjd if (rv == ZIO_PIPELINE_STOP) 1695185029Spjd return; 1696185029Spjd 1697185029Spjd ASSERT(rv == ZIO_PIPELINE_CONTINUE); 1698168404Spjd } 1699185029Spjd} 1700168404Spjd 1701185029Spjd/* 1702185029Spjd * ========================================================================== 1703185029Spjd * Initiate I/O, either sync or async 1704185029Spjd * ========================================================================== 1705185029Spjd */ 1706185029Spjdint 1707185029Spjdzio_wait(zio_t *zio) 1708185029Spjd{ 1709185029Spjd int error; 1710168404Spjd 1711185029Spjd ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 1712185029Spjd ASSERT(zio->io_executor == NULL); 1713168404Spjd 1714185029Spjd zio->io_waiter = curthread; 1715307279Smav ASSERT0(zio->io_queued_timestamp); 1716307279Smav zio->io_queued_timestamp = gethrtime(); 1717168404Spjd 1718185029Spjd zio_execute(zio); 1719168404Spjd 1720185029Spjd mutex_enter(&zio->io_lock); 1721185029Spjd while (zio->io_executor != NULL) 1722185029Spjd cv_wait(&zio->io_cv, &zio->io_lock); 1723185029Spjd mutex_exit(&zio->io_lock); 1724168404Spjd 1725185029Spjd error = zio->io_error; 1726185029Spjd zio_destroy(zio); 1727168404Spjd 1728185029Spjd return (error); 1729185029Spjd} 1730185029Spjd 1731185029Spjdvoid 1732185029Spjdzio_nowait(zio_t *zio) 1733185029Spjd{ 1734185029Spjd ASSERT(zio->io_executor == NULL); 1735185029Spjd 1736209962Smm if (zio->io_child_type == ZIO_CHILD_LOGICAL && 1737209962Smm zio_unique_parent(zio) == NULL) { 1738185029Spjd /* 1739185029Spjd * This is a logical async I/O with no parent to wait for it. 1740209962Smm * We add it to the spa_async_root_zio "Godfather" I/O which 1741209962Smm * will ensure they complete prior to unloading the pool. 1742185029Spjd */ 1743185029Spjd spa_t *spa = zio->io_spa; 1744209962Smm 1745273348Sdelphij zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); 1746168404Spjd } 1747185029Spjd 1748307279Smav ASSERT0(zio->io_queued_timestamp); 1749307279Smav zio->io_queued_timestamp = gethrtime(); 1750185029Spjd zio_execute(zio); 1751168404Spjd} 1752168404Spjd 1753168404Spjd/* 1754168404Spjd * ========================================================================== 1755185029Spjd * Reexecute or suspend/resume failed I/O 1756168404Spjd * ========================================================================== 1757168404Spjd */ 1758185029Spjd 1759168404Spjdstatic void 1760185029Spjdzio_reexecute(zio_t *pio) 1761168404Spjd{ 1762209962Smm zio_t *cio, *cio_next; 1763168404Spjd 1764209962Smm ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 1765209962Smm ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 1766209962Smm ASSERT(pio->io_gang_leader == NULL); 1767209962Smm ASSERT(pio->io_gang_tree == NULL); 1768209962Smm 1769185029Spjd pio->io_flags = pio->io_orig_flags; 1770185029Spjd pio->io_stage = pio->io_orig_stage; 1771185029Spjd pio->io_pipeline = pio->io_orig_pipeline; 1772185029Spjd pio->io_reexecute = 0; 1773243524Smm pio->io_flags |= ZIO_FLAG_REEXECUTED; 1774307279Smav pio->io_pipeline_trace = 0; 1775185029Spjd pio->io_error = 0; 1776209962Smm for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1777209962Smm pio->io_state[w] = 0; 1778185029Spjd for (int c = 0; c < ZIO_CHILD_TYPES; c++) 1779185029Spjd pio->io_child_error[c] = 0; 1780185029Spjd 1781219089Spjd if (IO_IS_ALLOCATING(pio)) 1782219089Spjd BP_ZERO(pio->io_bp); 1783168404Spjd 1784185029Spjd /* 1785185029Spjd * As we reexecute pio's children, new children could be created. 1786209962Smm * New children go to the head of pio's io_child_list, however, 1787185029Spjd * so we will (correctly) not reexecute them. The key is that 1788209962Smm * the remainder of pio's io_child_list, from 'cio_next' onward, 1789209962Smm * cannot be affected by any side effects of reexecuting 'cio'. 1790185029Spjd */ 1791307279Smav zio_link_t *zl = NULL; 1792307279Smav for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { 1793307279Smav cio_next = zio_walk_children(pio, &zl); 1794185029Spjd mutex_enter(&pio->io_lock); 1795209962Smm for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1796209962Smm pio->io_children[cio->io_child_type][w]++; 1797185029Spjd mutex_exit(&pio->io_lock); 1798209962Smm zio_reexecute(cio); 1799185029Spjd } 1800168404Spjd 1801168404Spjd /* 1802185029Spjd * Now that all children have been reexecuted, execute the parent. 1803209962Smm * We don't reexecute "The Godfather" I/O here as it's the 1804209962Smm * responsibility of the caller to wait on him. 1805168404Spjd */ 1806307279Smav if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { 1807307279Smav pio->io_queued_timestamp = gethrtime(); 1808209962Smm zio_execute(pio); 1809307279Smav } 1810185029Spjd} 1811185029Spjd 1812185029Spjdvoid 1813185029Spjdzio_suspend(spa_t *spa, zio_t *zio) 1814185029Spjd{ 1815185029Spjd if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 1816185029Spjd fm_panic("Pool '%s' has encountered an uncorrectable I/O " 1817185029Spjd "failure and the failure mode property for this pool " 1818185029Spjd "is set to panic.", spa_name(spa)); 1819185029Spjd 1820185029Spjd zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 1821185029Spjd 1822185029Spjd mutex_enter(&spa->spa_suspend_lock); 1823185029Spjd 1824185029Spjd if (spa->spa_suspend_zio_root == NULL) 1825209962Smm spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 1826209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 1827209962Smm ZIO_FLAG_GODFATHER); 1828185029Spjd 1829185029Spjd spa->spa_suspended = B_TRUE; 1830185029Spjd 1831185029Spjd if (zio != NULL) { 1832209962Smm ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 1833185029Spjd ASSERT(zio != spa->spa_suspend_zio_root); 1834185029Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1835209962Smm ASSERT(zio_unique_parent(zio) == NULL); 1836185029Spjd ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1837185029Spjd zio_add_child(spa->spa_suspend_zio_root, zio); 1838168404Spjd } 1839168404Spjd 1840185029Spjd mutex_exit(&spa->spa_suspend_lock); 1841168404Spjd} 1842168404Spjd 1843209962Smmint 1844185029Spjdzio_resume(spa_t *spa) 1845168404Spjd{ 1846209962Smm zio_t *pio; 1847168404Spjd 1848185029Spjd /* 1849185029Spjd * Reexecute all previously suspended i/o. 1850185029Spjd */ 1851185029Spjd mutex_enter(&spa->spa_suspend_lock); 1852185029Spjd spa->spa_suspended = B_FALSE; 1853185029Spjd cv_broadcast(&spa->spa_suspend_cv); 1854185029Spjd pio = spa->spa_suspend_zio_root; 1855185029Spjd spa->spa_suspend_zio_root = NULL; 1856185029Spjd mutex_exit(&spa->spa_suspend_lock); 1857168404Spjd 1858185029Spjd if (pio == NULL) 1859209962Smm return (0); 1860168404Spjd 1861209962Smm zio_reexecute(pio); 1862209962Smm return (zio_wait(pio)); 1863168404Spjd} 1864168404Spjd 1865185029Spjdvoid 1866185029Spjdzio_resume_wait(spa_t *spa) 1867185029Spjd{ 1868185029Spjd mutex_enter(&spa->spa_suspend_lock); 1869185029Spjd while (spa_suspended(spa)) 1870185029Spjd cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 1871185029Spjd mutex_exit(&spa->spa_suspend_lock); 1872185029Spjd} 1873185029Spjd 1874168404Spjd/* 1875168404Spjd * ========================================================================== 1876185029Spjd * Gang blocks. 1877185029Spjd * 1878185029Spjd * A gang block is a collection of small blocks that looks to the DMU 1879185029Spjd * like one large block. When zio_dva_allocate() cannot find a block 1880185029Spjd * of the requested size, due to either severe fragmentation or the pool 1881185029Spjd * being nearly full, it calls zio_write_gang_block() to construct the 1882185029Spjd * block from smaller fragments. 1883185029Spjd * 1884185029Spjd * A gang block consists of a gang header (zio_gbh_phys_t) and up to 1885185029Spjd * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like 1886185029Spjd * an indirect block: it's an array of block pointers. It consumes 1887185029Spjd * only one sector and hence is allocatable regardless of fragmentation. 1888185029Spjd * The gang header's bps point to its gang members, which hold the data. 1889185029Spjd * 1890185029Spjd * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 1891185029Spjd * as the verifier to ensure uniqueness of the SHA256 checksum. 1892185029Spjd * Critically, the gang block bp's blk_cksum is the checksum of the data, 1893185029Spjd * not the gang header. This ensures that data block signatures (needed for 1894185029Spjd * deduplication) are independent of how the block is physically stored. 1895185029Spjd * 1896185029Spjd * Gang blocks can be nested: a gang member may itself be a gang block. 1897185029Spjd * Thus every gang block is a tree in which root and all interior nodes are 1898185029Spjd * gang headers, and the leaves are normal blocks that contain user data. 1899185029Spjd * The root of the gang tree is called the gang leader. 1900185029Spjd * 1901185029Spjd * To perform any operation (read, rewrite, free, claim) on a gang block, 1902185029Spjd * zio_gang_assemble() first assembles the gang tree (minus data leaves) 1903185029Spjd * in the io_gang_tree field of the original logical i/o by recursively 1904185029Spjd * reading the gang leader and all gang headers below it. This yields 1905185029Spjd * an in-core tree containing the contents of every gang header and the 1906185029Spjd * bps for every constituent of the gang block. 1907185029Spjd * 1908185029Spjd * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 1909185029Spjd * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 1910185029Spjd * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 1911185029Spjd * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 1912185029Spjd * zio_read_gang() is a wrapper around zio_read() that omits reading gang 1913185029Spjd * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 1914185029Spjd * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 1915185029Spjd * of the gang header plus zio_checksum_compute() of the data to update the 1916185029Spjd * gang header's blk_cksum as described above. 1917185029Spjd * 1918185029Spjd * The two-phase assemble/issue model solves the problem of partial failure -- 1919185029Spjd * what if you'd freed part of a gang block but then couldn't read the 1920185029Spjd * gang header for another part? Assembling the entire gang tree first 1921185029Spjd * ensures that all the necessary gang header I/O has succeeded before 1922185029Spjd * starting the actual work of free, claim, or write. Once the gang tree 1923185029Spjd * is assembled, free and claim are in-memory operations that cannot fail. 1924185029Spjd * 1925185029Spjd * In the event that a gang write fails, zio_dva_unallocate() walks the 1926185029Spjd * gang tree to immediately free (i.e. insert back into the space map) 1927185029Spjd * everything we've allocated. This ensures that we don't get ENOSPC 1928185029Spjd * errors during repeated suspend/resume cycles due to a flaky device. 1929185029Spjd * 1930185029Spjd * Gang rewrites only happen during sync-to-convergence. If we can't assemble 1931185029Spjd * the gang tree, we won't modify the block, so we can safely defer the free 1932185029Spjd * (knowing that the block is still intact). If we *can* assemble the gang 1933185029Spjd * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 1934185029Spjd * each constituent bp and we can allocate a new block on the next sync pass. 1935185029Spjd * 1936185029Spjd * In all cases, the gang tree allows complete recovery from partial failure. 1937168404Spjd * ========================================================================== 1938168404Spjd */ 1939185029Spjd 1940185029Spjdstatic zio_t * 1941185029Spjdzio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1942168404Spjd{ 1943185029Spjd if (gn != NULL) 1944185029Spjd return (pio); 1945168404Spjd 1946185029Spjd return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 1947185029Spjd NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1948185029Spjd &pio->io_bookmark)); 1949168404Spjd} 1950168404Spjd 1951185029Spjdzio_t * 1952185029Spjdzio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1953168404Spjd{ 1954185029Spjd zio_t *zio; 1955168404Spjd 1956185029Spjd if (gn != NULL) { 1957185029Spjd zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1958185029Spjd gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 1959185029Spjd ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1960185029Spjd /* 1961185029Spjd * As we rewrite each gang header, the pipeline will compute 1962185029Spjd * a new gang block header checksum for it; but no one will 1963185029Spjd * compute a new data checksum, so we do that here. The one 1964185029Spjd * exception is the gang leader: the pipeline already computed 1965185029Spjd * its data checksum because that stage precedes gang assembly. 1966185029Spjd * (Presently, nothing actually uses interior data checksums; 1967185029Spjd * this is just good hygiene.) 1968185029Spjd */ 1969209962Smm if (gn != pio->io_gang_leader->io_gang_tree) { 1970185029Spjd zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 1971185029Spjd data, BP_GET_PSIZE(bp)); 1972185029Spjd } 1973219089Spjd /* 1974219089Spjd * If we are here to damage data for testing purposes, 1975219089Spjd * leave the GBH alone so that we can detect the damage. 1976219089Spjd */ 1977219089Spjd if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 1978219089Spjd zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 1979185029Spjd } else { 1980185029Spjd zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1981185029Spjd data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 1982185029Spjd ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1983185029Spjd } 1984185029Spjd 1985185029Spjd return (zio); 1986168404Spjd} 1987168404Spjd 1988185029Spjd/* ARGSUSED */ 1989185029Spjdzio_t * 1990185029Spjdzio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1991168404Spjd{ 1992219089Spjd return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 1993240868Spjd BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), 1994219089Spjd ZIO_GANG_CHILD_FLAGS(pio))); 1995185029Spjd} 1996168404Spjd 1997185029Spjd/* ARGSUSED */ 1998185029Spjdzio_t * 1999185029Spjdzio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 2000185029Spjd{ 2001185029Spjd return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 2002185029Spjd NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 2003185029Spjd} 2004168404Spjd 2005185029Spjdstatic zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 2006185029Spjd NULL, 2007185029Spjd zio_read_gang, 2008185029Spjd zio_rewrite_gang, 2009185029Spjd zio_free_gang, 2010185029Spjd zio_claim_gang, 2011185029Spjd NULL 2012185029Spjd}; 2013168404Spjd 2014185029Spjdstatic void zio_gang_tree_assemble_done(zio_t *zio); 2015168404Spjd 2016185029Spjdstatic zio_gang_node_t * 2017185029Spjdzio_gang_node_alloc(zio_gang_node_t **gnpp) 2018185029Spjd{ 2019185029Spjd zio_gang_node_t *gn; 2020185029Spjd 2021185029Spjd ASSERT(*gnpp == NULL); 2022185029Spjd 2023185029Spjd gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 2024185029Spjd gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 2025185029Spjd *gnpp = gn; 2026185029Spjd 2027185029Spjd return (gn); 2028168404Spjd} 2029168404Spjd 2030168404Spjdstatic void 2031185029Spjdzio_gang_node_free(zio_gang_node_t **gnpp) 2032168404Spjd{ 2033185029Spjd zio_gang_node_t *gn = *gnpp; 2034168404Spjd 2035185029Spjd for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 2036185029Spjd ASSERT(gn->gn_child[g] == NULL); 2037168404Spjd 2038185029Spjd zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 2039185029Spjd kmem_free(gn, sizeof (*gn)); 2040185029Spjd *gnpp = NULL; 2041185029Spjd} 2042168404Spjd 2043185029Spjdstatic void 2044185029Spjdzio_gang_tree_free(zio_gang_node_t **gnpp) 2045185029Spjd{ 2046185029Spjd zio_gang_node_t *gn = *gnpp; 2047168404Spjd 2048185029Spjd if (gn == NULL) 2049185029Spjd return; 2050168404Spjd 2051185029Spjd for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 2052185029Spjd zio_gang_tree_free(&gn->gn_child[g]); 2053168404Spjd 2054185029Spjd zio_gang_node_free(gnpp); 2055168404Spjd} 2056168404Spjd 2057168404Spjdstatic void 2058209962Smmzio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 2059168404Spjd{ 2060185029Spjd zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 2061168404Spjd 2062209962Smm ASSERT(gio->io_gang_leader == gio); 2063185029Spjd ASSERT(BP_IS_GANG(bp)); 2064168404Spjd 2065209962Smm zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 2066185029Spjd SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 2067209962Smm gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 2068185029Spjd} 2069168404Spjd 2070185029Spjdstatic void 2071185029Spjdzio_gang_tree_assemble_done(zio_t *zio) 2072185029Spjd{ 2073209962Smm zio_t *gio = zio->io_gang_leader; 2074185029Spjd zio_gang_node_t *gn = zio->io_private; 2075185029Spjd blkptr_t *bp = zio->io_bp; 2076168404Spjd 2077209962Smm ASSERT(gio == zio_unique_parent(zio)); 2078219089Spjd ASSERT(zio->io_child_count == 0); 2079168404Spjd 2080185029Spjd if (zio->io_error) 2081185029Spjd return; 2082168404Spjd 2083185029Spjd if (BP_SHOULD_BYTESWAP(bp)) 2084185029Spjd byteswap_uint64_array(zio->io_data, zio->io_size); 2085185029Spjd 2086185029Spjd ASSERT(zio->io_data == gn->gn_gbh); 2087185029Spjd ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 2088219089Spjd ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 2089185029Spjd 2090185029Spjd for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2091185029Spjd blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 2092185029Spjd if (!BP_IS_GANG(gbp)) 2093185029Spjd continue; 2094209962Smm zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 2095168404Spjd } 2096168404Spjd} 2097168404Spjd 2098168404Spjdstatic void 2099185029Spjdzio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 2100168404Spjd{ 2101209962Smm zio_t *gio = pio->io_gang_leader; 2102185029Spjd zio_t *zio; 2103168404Spjd 2104185029Spjd ASSERT(BP_IS_GANG(bp) == !!gn); 2105209962Smm ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 2106209962Smm ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 2107168404Spjd 2108185029Spjd /* 2109185029Spjd * If you're a gang header, your data is in gn->gn_gbh. 2110185029Spjd * If you're a gang member, your data is in 'data' and gn == NULL. 2111185029Spjd */ 2112209962Smm zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 2113168404Spjd 2114185029Spjd if (gn != NULL) { 2115219089Spjd ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 2116168404Spjd 2117185029Spjd for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2118185029Spjd blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 2119185029Spjd if (BP_IS_HOLE(gbp)) 2120185029Spjd continue; 2121185029Spjd zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 2122185029Spjd data = (char *)data + BP_GET_PSIZE(gbp); 2123185029Spjd } 2124168404Spjd } 2125168404Spjd 2126240868Spjd if (gn == gio->io_gang_tree && gio->io_data != NULL) 2127209962Smm ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 2128185029Spjd 2129185029Spjd if (zio != pio) 2130185029Spjd zio_nowait(zio); 2131168404Spjd} 2132168404Spjd 2133185029Spjdstatic int 2134270312Ssmhzio_gang_assemble(zio_t *zio) 2135168404Spjd{ 2136185029Spjd blkptr_t *bp = zio->io_bp; 2137168404Spjd 2138209962Smm ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 2139209962Smm ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2140168404Spjd 2141209962Smm zio->io_gang_leader = zio; 2142209962Smm 2143185029Spjd zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 2144168404Spjd 2145185029Spjd return (ZIO_PIPELINE_CONTINUE); 2146185029Spjd} 2147168404Spjd 2148185029Spjdstatic int 2149270312Ssmhzio_gang_issue(zio_t *zio) 2150185029Spjd{ 2151185029Spjd blkptr_t *bp = zio->io_bp; 2152185029Spjd 2153330238Savg if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { 2154185029Spjd return (ZIO_PIPELINE_STOP); 2155330238Savg } 2156185029Spjd 2157209962Smm ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 2158209962Smm ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2159185029Spjd 2160185029Spjd if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 2161209962Smm zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 2162185029Spjd else 2163209962Smm zio_gang_tree_free(&zio->io_gang_tree); 2164185029Spjd 2165185029Spjd zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2166185029Spjd 2167185029Spjd return (ZIO_PIPELINE_CONTINUE); 2168168404Spjd} 2169168404Spjd 2170168404Spjdstatic void 2171185029Spjdzio_write_gang_member_ready(zio_t *zio) 2172168404Spjd{ 2173209962Smm zio_t *pio = zio_unique_parent(zio); 2174209962Smm zio_t *gio = zio->io_gang_leader; 2175168404Spjd dva_t *cdva = zio->io_bp->blk_dva; 2176168404Spjd dva_t *pdva = pio->io_bp->blk_dva; 2177168404Spjd uint64_t asize; 2178168404Spjd 2179185029Spjd if (BP_IS_HOLE(zio->io_bp)) 2180185029Spjd return; 2181185029Spjd 2182185029Spjd ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 2183185029Spjd 2184185029Spjd ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 2185219089Spjd ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 2186219089Spjd ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 2187219089Spjd ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 2188168404Spjd ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 2189168404Spjd 2190168404Spjd mutex_enter(&pio->io_lock); 2191185029Spjd for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 2192168404Spjd ASSERT(DVA_GET_GANG(&pdva[d])); 2193168404Spjd asize = DVA_GET_ASIZE(&pdva[d]); 2194168404Spjd asize += DVA_GET_ASIZE(&cdva[d]); 2195168404Spjd DVA_SET_ASIZE(&pdva[d], asize); 2196168404Spjd } 2197168404Spjd mutex_exit(&pio->io_lock); 2198168404Spjd} 2199168404Spjd 2200185029Spjdstatic int 2201185029Spjdzio_write_gang_block(zio_t *pio) 2202168404Spjd{ 2203185029Spjd spa_t *spa = pio->io_spa; 2204307279Smav metaslab_class_t *mc = spa_normal_class(spa); 2205185029Spjd blkptr_t *bp = pio->io_bp; 2206209962Smm zio_t *gio = pio->io_gang_leader; 2207185029Spjd zio_t *zio; 2208185029Spjd zio_gang_node_t *gn, **gnpp; 2209168404Spjd zio_gbh_phys_t *gbh; 2210185029Spjd uint64_t txg = pio->io_txg; 2211185029Spjd uint64_t resid = pio->io_size; 2212185029Spjd uint64_t lsize; 2213219089Spjd int copies = gio->io_prop.zp_copies; 2214219089Spjd int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 2215185029Spjd zio_prop_t zp; 2216168404Spjd int error; 2217168404Spjd 2218307279Smav int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; 2219307279Smav if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 2220307279Smav ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 2221307279Smav ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); 2222307279Smav 2223307279Smav flags |= METASLAB_ASYNC_ALLOC; 2224307279Smav VERIFY(refcount_held(&mc->mc_alloc_slots, pio)); 2225307279Smav 2226307279Smav /* 2227307279Smav * The logical zio has already placed a reservation for 2228307279Smav * 'copies' allocation slots but gang blocks may require 2229307279Smav * additional copies. These additional copies 2230307279Smav * (i.e. gbh_copies - copies) are guaranteed to succeed 2231307279Smav * since metaslab_class_throttle_reserve() always allows 2232307279Smav * additional reservations for gang blocks. 2233307279Smav */ 2234307279Smav VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, 2235307279Smav pio, flags)); 2236307279Smav } 2237307279Smav 2238307279Smav error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, 2239307279Smav bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, pio); 2240185029Spjd if (error) { 2241307279Smav if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 2242307279Smav ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 2243307279Smav ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); 2244307279Smav 2245307279Smav /* 2246307279Smav * If we failed to allocate the gang block header then 2247307279Smav * we remove any additional allocation reservations that 2248307279Smav * we placed here. The original reservation will 2249307279Smav * be removed when the logical I/O goes to the ready 2250307279Smav * stage. 2251307279Smav */ 2252307279Smav metaslab_class_throttle_unreserve(mc, 2253307279Smav gbh_copies - copies, pio); 2254307279Smav } 2255185029Spjd pio->io_error = error; 2256185029Spjd return (ZIO_PIPELINE_CONTINUE); 2257185029Spjd } 2258168404Spjd 2259209962Smm if (pio == gio) { 2260209962Smm gnpp = &gio->io_gang_tree; 2261185029Spjd } else { 2262185029Spjd gnpp = pio->io_private; 2263185029Spjd ASSERT(pio->io_ready == zio_write_gang_member_ready); 2264185029Spjd } 2265168404Spjd 2266185029Spjd gn = zio_gang_node_alloc(gnpp); 2267185029Spjd gbh = gn->gn_gbh; 2268185029Spjd bzero(gbh, SPA_GANGBLOCKSIZE); 2269168404Spjd 2270185029Spjd /* 2271185029Spjd * Create the gang header. 2272185029Spjd */ 2273185029Spjd zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 2274185029Spjd pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2275168404Spjd 2276185029Spjd /* 2277185029Spjd * Create and nowait the gang children. 2278185029Spjd */ 2279185029Spjd for (int g = 0; resid != 0; resid -= lsize, g++) { 2280185029Spjd lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 2281185029Spjd SPA_MINBLOCKSIZE); 2282185029Spjd ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 2283168404Spjd 2284209962Smm zp.zp_checksum = gio->io_prop.zp_checksum; 2285185029Spjd zp.zp_compress = ZIO_COMPRESS_OFF; 2286185029Spjd zp.zp_type = DMU_OT_NONE; 2287185029Spjd zp.zp_level = 0; 2288219089Spjd zp.zp_copies = gio->io_prop.zp_copies; 2289243524Smm zp.zp_dedup = B_FALSE; 2290243524Smm zp.zp_dedup_verify = B_FALSE; 2291243524Smm zp.zp_nopwrite = B_FALSE; 2292168404Spjd 2293307279Smav zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 2294185029Spjd (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 2295304139Savg zio_write_gang_member_ready, NULL, NULL, NULL, 2296304139Savg &gn->gn_child[g], pio->io_priority, 2297307279Smav ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2298307279Smav 2299307279Smav if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 2300307279Smav ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 2301307279Smav ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); 2302307279Smav 2303307279Smav /* 2304307279Smav * Gang children won't throttle but we should 2305307279Smav * account for their work, so reserve an allocation 2306307279Smav * slot for them here. 2307307279Smav */ 2308307279Smav VERIFY(metaslab_class_throttle_reserve(mc, 2309307279Smav zp.zp_copies, cio, flags)); 2310307279Smav } 2311307279Smav zio_nowait(cio); 2312168404Spjd } 2313168404Spjd 2314185029Spjd /* 2315185029Spjd * Set pio's pipeline to just wait for zio to finish. 2316185029Spjd */ 2317185029Spjd pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2318168404Spjd 2319185029Spjd zio_nowait(zio); 2320168404Spjd 2321185029Spjd return (ZIO_PIPELINE_CONTINUE); 2322168404Spjd} 2323168404Spjd 2324168404Spjd/* 2325290757Smav * The zio_nop_write stage in the pipeline determines if allocating a 2326290757Smav * new bp is necessary. The nopwrite feature can handle writes in 2327290757Smav * either syncing or open context (i.e. zil writes) and as a result is 2328290757Smav * mutually exclusive with dedup. 2329290757Smav * 2330290757Smav * By leveraging a cryptographically secure checksum, such as SHA256, we 2331290757Smav * can compare the checksums of the new data and the old to determine if 2332290757Smav * allocating a new block is required. Note that our requirements for 2333290757Smav * cryptographic strength are fairly weak: there can't be any accidental 2334290757Smav * hash collisions, but we don't need to be secure against intentional 2335290757Smav * (malicious) collisions. To trigger a nopwrite, you have to be able 2336290757Smav * to write the file to begin with, and triggering an incorrect (hash 2337290757Smav * collision) nopwrite is no worse than simply writing to the file. 2338290757Smav * That said, there are no known attacks against the checksum algorithms 2339290757Smav * used for nopwrite, assuming that the salt and the checksums 2340290757Smav * themselves remain secret. 2341243524Smm */ 2342243524Smmstatic int 2343270312Ssmhzio_nop_write(zio_t *zio) 2344243524Smm{ 2345243524Smm blkptr_t *bp = zio->io_bp; 2346243524Smm blkptr_t *bp_orig = &zio->io_bp_orig; 2347243524Smm zio_prop_t *zp = &zio->io_prop; 2348243524Smm 2349243524Smm ASSERT(BP_GET_LEVEL(bp) == 0); 2350243524Smm ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 2351243524Smm ASSERT(zp->zp_nopwrite); 2352243524Smm ASSERT(!zp->zp_dedup); 2353243524Smm ASSERT(zio->io_bp_override == NULL); 2354243524Smm ASSERT(IO_IS_ALLOCATING(zio)); 2355243524Smm 2356243524Smm /* 2357243524Smm * Check to see if the original bp and the new bp have matching 2358243524Smm * characteristics (i.e. same checksum, compression algorithms, etc). 2359243524Smm * If they don't then just continue with the pipeline which will 2360243524Smm * allocate a new bp. 2361243524Smm */ 2362243524Smm if (BP_IS_HOLE(bp_orig) || 2363290757Smav !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & 2364290757Smav ZCHECKSUM_FLAG_NOPWRITE) || 2365243524Smm BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 2366243524Smm BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 2367243524Smm BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 2368243524Smm zp->zp_copies != BP_GET_NDVAS(bp_orig)) 2369243524Smm return (ZIO_PIPELINE_CONTINUE); 2370243524Smm 2371243524Smm /* 2372243524Smm * If the checksums match then reset the pipeline so that we 2373243524Smm * avoid allocating a new bp and issuing any I/O. 2374243524Smm */ 2375243524Smm if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 2376290757Smav ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & 2377290757Smav ZCHECKSUM_FLAG_NOPWRITE); 2378243524Smm ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 2379243524Smm ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 2380243524Smm ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 2381243524Smm ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 2382243524Smm sizeof (uint64_t)) == 0); 2383243524Smm 2384243524Smm *bp = *bp_orig; 2385243524Smm zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2386243524Smm zio->io_flags |= ZIO_FLAG_NOPWRITE; 2387243524Smm } 2388243524Smm 2389243524Smm return (ZIO_PIPELINE_CONTINUE); 2390243524Smm} 2391243524Smm 2392243524Smm/* 2393168404Spjd * ========================================================================== 2394219089Spjd * Dedup 2395168404Spjd * ========================================================================== 2396168404Spjd */ 2397219089Spjdstatic void 2398219089Spjdzio_ddt_child_read_done(zio_t *zio) 2399219089Spjd{ 2400219089Spjd blkptr_t *bp = zio->io_bp; 2401219089Spjd ddt_entry_t *dde = zio->io_private; 2402219089Spjd ddt_phys_t *ddp; 2403219089Spjd zio_t *pio = zio_unique_parent(zio); 2404185029Spjd 2405219089Spjd mutex_enter(&pio->io_lock); 2406219089Spjd ddp = ddt_phys_select(dde, bp); 2407219089Spjd if (zio->io_error == 0) 2408219089Spjd ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 2409219089Spjd if (zio->io_error == 0 && dde->dde_repair_data == NULL) 2410219089Spjd dde->dde_repair_data = zio->io_data; 2411219089Spjd else 2412219089Spjd zio_buf_free(zio->io_data, zio->io_size); 2413219089Spjd mutex_exit(&pio->io_lock); 2414219089Spjd} 2415219089Spjd 2416185029Spjdstatic int 2417270312Ssmhzio_ddt_read_start(zio_t *zio) 2418219089Spjd{ 2419219089Spjd blkptr_t *bp = zio->io_bp; 2420219089Spjd 2421219089Spjd ASSERT(BP_GET_DEDUP(bp)); 2422219089Spjd ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2423219089Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2424219089Spjd 2425219089Spjd if (zio->io_child_error[ZIO_CHILD_DDT]) { 2426219089Spjd ddt_t *ddt = ddt_select(zio->io_spa, bp); 2427219089Spjd ddt_entry_t *dde = ddt_repair_start(ddt, bp); 2428219089Spjd ddt_phys_t *ddp = dde->dde_phys; 2429219089Spjd ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 2430219089Spjd blkptr_t blk; 2431219089Spjd 2432219089Spjd ASSERT(zio->io_vsd == NULL); 2433219089Spjd zio->io_vsd = dde; 2434219089Spjd 2435219089Spjd if (ddp_self == NULL) 2436219089Spjd return (ZIO_PIPELINE_CONTINUE); 2437219089Spjd 2438219089Spjd for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2439219089Spjd if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 2440219089Spjd continue; 2441219089Spjd ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2442219089Spjd &blk); 2443219089Spjd zio_nowait(zio_read(zio, zio->io_spa, &blk, 2444219089Spjd zio_buf_alloc(zio->io_size), zio->io_size, 2445219089Spjd zio_ddt_child_read_done, dde, zio->io_priority, 2446219089Spjd ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 2447219089Spjd &zio->io_bookmark)); 2448219089Spjd } 2449219089Spjd return (ZIO_PIPELINE_CONTINUE); 2450219089Spjd } 2451219089Spjd 2452219089Spjd zio_nowait(zio_read(zio, zio->io_spa, bp, 2453219089Spjd zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 2454219089Spjd ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2455219089Spjd 2456219089Spjd return (ZIO_PIPELINE_CONTINUE); 2457219089Spjd} 2458219089Spjd 2459219089Spjdstatic int 2460270312Ssmhzio_ddt_read_done(zio_t *zio) 2461219089Spjd{ 2462219089Spjd blkptr_t *bp = zio->io_bp; 2463219089Spjd 2464330238Savg if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { 2465219089Spjd return (ZIO_PIPELINE_STOP); 2466330238Savg } 2467219089Spjd 2468219089Spjd ASSERT(BP_GET_DEDUP(bp)); 2469219089Spjd ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2470219089Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2471219089Spjd 2472219089Spjd if (zio->io_child_error[ZIO_CHILD_DDT]) { 2473219089Spjd ddt_t *ddt = ddt_select(zio->io_spa, bp); 2474219089Spjd ddt_entry_t *dde = zio->io_vsd; 2475219089Spjd if (ddt == NULL) { 2476219089Spjd ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2477219089Spjd return (ZIO_PIPELINE_CONTINUE); 2478219089Spjd } 2479219089Spjd if (dde == NULL) { 2480219089Spjd zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2481219089Spjd zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2482219089Spjd return (ZIO_PIPELINE_STOP); 2483219089Spjd } 2484219089Spjd if (dde->dde_repair_data != NULL) { 2485219089Spjd bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 2486219089Spjd zio->io_child_error[ZIO_CHILD_DDT] = 0; 2487219089Spjd } 2488219089Spjd ddt_repair_done(ddt, dde); 2489219089Spjd zio->io_vsd = NULL; 2490219089Spjd } 2491219089Spjd 2492219089Spjd ASSERT(zio->io_vsd == NULL); 2493219089Spjd 2494219089Spjd return (ZIO_PIPELINE_CONTINUE); 2495219089Spjd} 2496219089Spjd 2497219089Spjdstatic boolean_t 2498219089Spjdzio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2499219089Spjd{ 2500219089Spjd spa_t *spa = zio->io_spa; 2501219089Spjd 2502219089Spjd /* 2503219089Spjd * Note: we compare the original data, not the transformed data, 2504219089Spjd * because when zio->io_bp is an override bp, we will not have 2505219089Spjd * pushed the I/O transforms. That's an important optimization 2506219089Spjd * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2507219089Spjd */ 2508219089Spjd for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2509219089Spjd zio_t *lio = dde->dde_lead_zio[p]; 2510219089Spjd 2511219089Spjd if (lio != NULL) { 2512219089Spjd return (lio->io_orig_size != zio->io_orig_size || 2513219089Spjd bcmp(zio->io_orig_data, lio->io_orig_data, 2514219089Spjd zio->io_orig_size) != 0); 2515219089Spjd } 2516219089Spjd } 2517219089Spjd 2518219089Spjd for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2519219089Spjd ddt_phys_t *ddp = &dde->dde_phys[p]; 2520219089Spjd 2521219089Spjd if (ddp->ddp_phys_birth != 0) { 2522219089Spjd arc_buf_t *abuf = NULL; 2523277586Sdelphij arc_flags_t aflags = ARC_FLAG_WAIT; 2524219089Spjd blkptr_t blk = *zio->io_bp; 2525219089Spjd int error; 2526219089Spjd 2527219089Spjd ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2528219089Spjd 2529219089Spjd ddt_exit(ddt); 2530219089Spjd 2531246666Smm error = arc_read(NULL, spa, &blk, 2532219089Spjd arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2533219089Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2534219089Spjd &aflags, &zio->io_bookmark); 2535219089Spjd 2536219089Spjd if (error == 0) { 2537219089Spjd if (arc_buf_size(abuf) != zio->io_orig_size || 2538219089Spjd bcmp(abuf->b_data, zio->io_orig_data, 2539219089Spjd zio->io_orig_size) != 0) 2540249195Smm error = SET_ERROR(EEXIST); 2541307266Smav arc_buf_destroy(abuf, &abuf); 2542219089Spjd } 2543219089Spjd 2544219089Spjd ddt_enter(ddt); 2545219089Spjd return (error != 0); 2546219089Spjd } 2547219089Spjd } 2548219089Spjd 2549219089Spjd return (B_FALSE); 2550219089Spjd} 2551219089Spjd 2552219089Spjdstatic void 2553219089Spjdzio_ddt_child_write_ready(zio_t *zio) 2554219089Spjd{ 2555219089Spjd int p = zio->io_prop.zp_copies; 2556219089Spjd ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2557219089Spjd ddt_entry_t *dde = zio->io_private; 2558219089Spjd ddt_phys_t *ddp = &dde->dde_phys[p]; 2559219089Spjd zio_t *pio; 2560219089Spjd 2561219089Spjd if (zio->io_error) 2562219089Spjd return; 2563219089Spjd 2564219089Spjd ddt_enter(ddt); 2565219089Spjd 2566219089Spjd ASSERT(dde->dde_lead_zio[p] == zio); 2567219089Spjd 2568219089Spjd ddt_phys_fill(ddp, zio->io_bp); 2569219089Spjd 2570307279Smav zio_link_t *zl = NULL; 2571307279Smav while ((pio = zio_walk_parents(zio, &zl)) != NULL) 2572219089Spjd ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2573219089Spjd 2574219089Spjd ddt_exit(ddt); 2575219089Spjd} 2576219089Spjd 2577219089Spjdstatic void 2578219089Spjdzio_ddt_child_write_done(zio_t *zio) 2579219089Spjd{ 2580219089Spjd int p = zio->io_prop.zp_copies; 2581219089Spjd ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2582219089Spjd ddt_entry_t *dde = zio->io_private; 2583219089Spjd ddt_phys_t *ddp = &dde->dde_phys[p]; 2584219089Spjd 2585219089Spjd ddt_enter(ddt); 2586219089Spjd 2587219089Spjd ASSERT(ddp->ddp_refcnt == 0); 2588219089Spjd ASSERT(dde->dde_lead_zio[p] == zio); 2589219089Spjd dde->dde_lead_zio[p] = NULL; 2590219089Spjd 2591219089Spjd if (zio->io_error == 0) { 2592307279Smav zio_link_t *zl = NULL; 2593307279Smav while (zio_walk_parents(zio, &zl) != NULL) 2594219089Spjd ddt_phys_addref(ddp); 2595219089Spjd } else { 2596219089Spjd ddt_phys_clear(ddp); 2597219089Spjd } 2598219089Spjd 2599219089Spjd ddt_exit(ddt); 2600219089Spjd} 2601219089Spjd 2602219089Spjdstatic void 2603219089Spjdzio_ddt_ditto_write_done(zio_t *zio) 2604219089Spjd{ 2605219089Spjd int p = DDT_PHYS_DITTO; 2606219089Spjd zio_prop_t *zp = &zio->io_prop; 2607219089Spjd blkptr_t *bp = zio->io_bp; 2608219089Spjd ddt_t *ddt = ddt_select(zio->io_spa, bp); 2609219089Spjd ddt_entry_t *dde = zio->io_private; 2610219089Spjd ddt_phys_t *ddp = &dde->dde_phys[p]; 2611219089Spjd ddt_key_t *ddk = &dde->dde_key; 2612219089Spjd 2613219089Spjd ddt_enter(ddt); 2614219089Spjd 2615219089Spjd ASSERT(ddp->ddp_refcnt == 0); 2616219089Spjd ASSERT(dde->dde_lead_zio[p] == zio); 2617219089Spjd dde->dde_lead_zio[p] = NULL; 2618219089Spjd 2619219089Spjd if (zio->io_error == 0) { 2620219089Spjd ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2621219089Spjd ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2622219089Spjd ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2623219089Spjd if (ddp->ddp_phys_birth != 0) 2624219089Spjd ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2625219089Spjd ddt_phys_fill(ddp, bp); 2626219089Spjd } 2627219089Spjd 2628219089Spjd ddt_exit(ddt); 2629219089Spjd} 2630219089Spjd 2631219089Spjdstatic int 2632270312Ssmhzio_ddt_write(zio_t *zio) 2633219089Spjd{ 2634219089Spjd spa_t *spa = zio->io_spa; 2635219089Spjd blkptr_t *bp = zio->io_bp; 2636219089Spjd uint64_t txg = zio->io_txg; 2637219089Spjd zio_prop_t *zp = &zio->io_prop; 2638219089Spjd int p = zp->zp_copies; 2639219089Spjd int ditto_copies; 2640219089Spjd zio_t *cio = NULL; 2641219089Spjd zio_t *dio = NULL; 2642219089Spjd ddt_t *ddt = ddt_select(spa, bp); 2643219089Spjd ddt_entry_t *dde; 2644219089Spjd ddt_phys_t *ddp; 2645219089Spjd 2646219089Spjd ASSERT(BP_GET_DEDUP(bp)); 2647219089Spjd ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2648219089Spjd ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2649219089Spjd 2650219089Spjd ddt_enter(ddt); 2651219089Spjd dde = ddt_lookup(ddt, bp, B_TRUE); 2652219089Spjd ddp = &dde->dde_phys[p]; 2653219089Spjd 2654219089Spjd if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2655219089Spjd /* 2656219089Spjd * If we're using a weak checksum, upgrade to a strong checksum 2657219089Spjd * and try again. If we're already using a strong checksum, 2658219089Spjd * we can't resolve it, so just convert to an ordinary write. 2659219089Spjd * (And automatically e-mail a paper to Nature?) 2660219089Spjd */ 2661290757Smav if (!(zio_checksum_table[zp->zp_checksum].ci_flags & 2662290757Smav ZCHECKSUM_FLAG_DEDUP)) { 2663219089Spjd zp->zp_checksum = spa_dedup_checksum(spa); 2664219089Spjd zio_pop_transforms(zio); 2665219089Spjd zio->io_stage = ZIO_STAGE_OPEN; 2666219089Spjd BP_ZERO(bp); 2667219089Spjd } else { 2668243524Smm zp->zp_dedup = B_FALSE; 2669219089Spjd } 2670219089Spjd zio->io_pipeline = ZIO_WRITE_PIPELINE; 2671219089Spjd ddt_exit(ddt); 2672219089Spjd return (ZIO_PIPELINE_CONTINUE); 2673219089Spjd } 2674219089Spjd 2675219089Spjd ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2676219089Spjd ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2677219089Spjd 2678219089Spjd if (ditto_copies > ddt_ditto_copies_present(dde) && 2679219089Spjd dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2680219089Spjd zio_prop_t czp = *zp; 2681219089Spjd 2682219089Spjd czp.zp_copies = ditto_copies; 2683219089Spjd 2684219089Spjd /* 2685219089Spjd * If we arrived here with an override bp, we won't have run 2686219089Spjd * the transform stack, so we won't have the data we need to 2687219089Spjd * generate a child i/o. So, toss the override bp and restart. 2688219089Spjd * This is safe, because using the override bp is just an 2689219089Spjd * optimization; and it's rare, so the cost doesn't matter. 2690219089Spjd */ 2691219089Spjd if (zio->io_bp_override) { 2692219089Spjd zio_pop_transforms(zio); 2693219089Spjd zio->io_stage = ZIO_STAGE_OPEN; 2694219089Spjd zio->io_pipeline = ZIO_WRITE_PIPELINE; 2695219089Spjd zio->io_bp_override = NULL; 2696219089Spjd BP_ZERO(bp); 2697219089Spjd ddt_exit(ddt); 2698219089Spjd return (ZIO_PIPELINE_CONTINUE); 2699219089Spjd } 2700219089Spjd 2701219089Spjd dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2702260763Savg zio->io_orig_size, &czp, NULL, NULL, 2703304139Savg NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, 2704219089Spjd ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2705219089Spjd 2706219089Spjd zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2707219089Spjd dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2708219089Spjd } 2709219089Spjd 2710219089Spjd if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2711219089Spjd if (ddp->ddp_phys_birth != 0) 2712219089Spjd ddt_bp_fill(ddp, bp, txg); 2713219089Spjd if (dde->dde_lead_zio[p] != NULL) 2714219089Spjd zio_add_child(zio, dde->dde_lead_zio[p]); 2715219089Spjd else 2716219089Spjd ddt_phys_addref(ddp); 2717219089Spjd } else if (zio->io_bp_override) { 2718219089Spjd ASSERT(bp->blk_birth == txg); 2719219089Spjd ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2720219089Spjd ddt_phys_fill(ddp, bp); 2721219089Spjd ddt_phys_addref(ddp); 2722219089Spjd } else { 2723219089Spjd cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2724304139Savg zio->io_orig_size, zp, 2725304139Savg zio_ddt_child_write_ready, NULL, NULL, 2726219089Spjd zio_ddt_child_write_done, dde, zio->io_priority, 2727219089Spjd ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2728219089Spjd 2729219089Spjd zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2730219089Spjd dde->dde_lead_zio[p] = cio; 2731219089Spjd } 2732219089Spjd 2733219089Spjd ddt_exit(ddt); 2734219089Spjd 2735219089Spjd if (cio) 2736219089Spjd zio_nowait(cio); 2737219089Spjd if (dio) 2738219089Spjd zio_nowait(dio); 2739219089Spjd 2740219089Spjd return (ZIO_PIPELINE_CONTINUE); 2741219089Spjd} 2742219089Spjd 2743219089Spjdddt_entry_t *freedde; /* for debugging */ 2744219089Spjd 2745219089Spjdstatic int 2746270312Ssmhzio_ddt_free(zio_t *zio) 2747219089Spjd{ 2748219089Spjd spa_t *spa = zio->io_spa; 2749219089Spjd blkptr_t *bp = zio->io_bp; 2750219089Spjd ddt_t *ddt = ddt_select(spa, bp); 2751219089Spjd ddt_entry_t *dde; 2752219089Spjd ddt_phys_t *ddp; 2753219089Spjd 2754219089Spjd ASSERT(BP_GET_DEDUP(bp)); 2755219089Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2756219089Spjd 2757219089Spjd ddt_enter(ddt); 2758219089Spjd freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2759219089Spjd ddp = ddt_phys_select(dde, bp); 2760219089Spjd ddt_phys_decref(ddp); 2761219089Spjd ddt_exit(ddt); 2762219089Spjd 2763219089Spjd return (ZIO_PIPELINE_CONTINUE); 2764219089Spjd} 2765219089Spjd 2766219089Spjd/* 2767219089Spjd * ========================================================================== 2768219089Spjd * Allocate and free blocks 2769219089Spjd * ========================================================================== 2770219089Spjd */ 2771307279Smav 2772307279Smavstatic zio_t * 2773307279Smavzio_io_to_allocate(spa_t *spa) 2774307279Smav{ 2775307279Smav zio_t *zio; 2776307279Smav 2777307279Smav ASSERT(MUTEX_HELD(&spa->spa_alloc_lock)); 2778307279Smav 2779307279Smav zio = avl_first(&spa->spa_alloc_tree); 2780307279Smav if (zio == NULL) 2781307279Smav return (NULL); 2782307279Smav 2783307279Smav ASSERT(IO_IS_ALLOCATING(zio)); 2784307279Smav 2785307279Smav /* 2786307279Smav * Try to place a reservation for this zio. If we're unable to 2787307279Smav * reserve then we throttle. 2788307279Smav */ 2789307279Smav if (!metaslab_class_throttle_reserve(spa_normal_class(spa), 2790307279Smav zio->io_prop.zp_copies, zio, 0)) { 2791307279Smav return (NULL); 2792307279Smav } 2793307279Smav 2794307279Smav avl_remove(&spa->spa_alloc_tree, zio); 2795307279Smav ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); 2796307279Smav 2797307279Smav return (zio); 2798307279Smav} 2799307279Smav 2800219089Spjdstatic int 2801307279Smavzio_dva_throttle(zio_t *zio) 2802307279Smav{ 2803307279Smav spa_t *spa = zio->io_spa; 2804307279Smav zio_t *nio; 2805307279Smav 2806307279Smav if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || 2807307279Smav !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled || 2808307279Smav zio->io_child_type == ZIO_CHILD_GANG || 2809307279Smav zio->io_flags & ZIO_FLAG_NODATA) { 2810307279Smav return (ZIO_PIPELINE_CONTINUE); 2811307279Smav } 2812307279Smav 2813307279Smav ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2814307279Smav 2815307279Smav ASSERT3U(zio->io_queued_timestamp, >, 0); 2816307279Smav ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); 2817307279Smav 2818307279Smav mutex_enter(&spa->spa_alloc_lock); 2819307279Smav 2820307279Smav ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2821307279Smav avl_add(&spa->spa_alloc_tree, zio); 2822307279Smav 2823307279Smav nio = zio_io_to_allocate(zio->io_spa); 2824307279Smav mutex_exit(&spa->spa_alloc_lock); 2825307279Smav 2826307279Smav if (nio == zio) 2827307279Smav return (ZIO_PIPELINE_CONTINUE); 2828307279Smav 2829307279Smav if (nio != NULL) { 2830307279Smav ASSERT3U(nio->io_queued_timestamp, <=, 2831307279Smav zio->io_queued_timestamp); 2832307279Smav ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE); 2833307279Smav /* 2834307279Smav * We are passing control to a new zio so make sure that 2835307279Smav * it is processed by a different thread. We do this to 2836307279Smav * avoid stack overflows that can occur when parents are 2837307279Smav * throttled and children are making progress. We allow 2838307279Smav * it to go to the head of the taskq since it's already 2839307279Smav * been waiting. 2840307279Smav */ 2841307279Smav zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE); 2842307279Smav } 2843307279Smav return (ZIO_PIPELINE_STOP); 2844307279Smav} 2845307279Smav 2846307279Smavvoid 2847307279Smavzio_allocate_dispatch(spa_t *spa) 2848307279Smav{ 2849307279Smav zio_t *zio; 2850307279Smav 2851307279Smav mutex_enter(&spa->spa_alloc_lock); 2852307279Smav zio = zio_io_to_allocate(spa); 2853307279Smav mutex_exit(&spa->spa_alloc_lock); 2854307279Smav if (zio == NULL) 2855307279Smav return; 2856307279Smav 2857307279Smav ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); 2858307279Smav ASSERT0(zio->io_error); 2859307279Smav zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); 2860307279Smav} 2861307279Smav 2862307279Smavstatic int 2863270312Ssmhzio_dva_allocate(zio_t *zio) 2864168404Spjd{ 2865185029Spjd spa_t *spa = zio->io_spa; 2866219089Spjd metaslab_class_t *mc = spa_normal_class(spa); 2867168404Spjd blkptr_t *bp = zio->io_bp; 2868168404Spjd int error; 2869224177Smm int flags = 0; 2870168404Spjd 2871209962Smm if (zio->io_gang_leader == NULL) { 2872209962Smm ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2873209962Smm zio->io_gang_leader = zio; 2874209962Smm } 2875209962Smm 2876168404Spjd ASSERT(BP_IS_HOLE(bp)); 2877240415Smm ASSERT0(BP_GET_NDVAS(bp)); 2878219089Spjd ASSERT3U(zio->io_prop.zp_copies, >, 0); 2879219089Spjd ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2880168404Spjd ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2881168404Spjd 2882307279Smav if (zio->io_flags & ZIO_FLAG_NODATA) { 2883307279Smav flags |= METASLAB_DONT_THROTTLE; 2884307279Smav } 2885307279Smav if (zio->io_flags & ZIO_FLAG_GANG_CHILD) { 2886307279Smav flags |= METASLAB_GANG_CHILD; 2887307279Smav } 2888307279Smav if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) { 2889307279Smav flags |= METASLAB_ASYNC_ALLOC; 2890307279Smav } 2891307279Smav 2892185029Spjd error = metaslab_alloc(spa, mc, zio->io_size, bp, 2893307279Smav zio->io_prop.zp_copies, zio->io_txg, NULL, flags, zio); 2894168404Spjd 2895307279Smav if (error != 0) { 2896224177Smm spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2897224177Smm "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2898224177Smm error); 2899185029Spjd if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2900185029Spjd return (zio_write_gang_block(zio)); 2901168404Spjd zio->io_error = error; 2902168404Spjd } 2903185029Spjd 2904185029Spjd return (ZIO_PIPELINE_CONTINUE); 2905168404Spjd} 2906168404Spjd 2907185029Spjdstatic int 2908270312Ssmhzio_dva_free(zio_t *zio) 2909168404Spjd{ 2910185029Spjd metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2911168404Spjd 2912185029Spjd return (ZIO_PIPELINE_CONTINUE); 2913185029Spjd} 2914168404Spjd 2915185029Spjdstatic int 2916270312Ssmhzio_dva_claim(zio_t *zio) 2917185029Spjd{ 2918185029Spjd int error; 2919168404Spjd 2920185029Spjd error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2921185029Spjd if (error) 2922185029Spjd zio->io_error = error; 2923185029Spjd 2924185029Spjd return (ZIO_PIPELINE_CONTINUE); 2925168404Spjd} 2926168404Spjd 2927185029Spjd/* 2928185029Spjd * Undo an allocation. This is used by zio_done() when an I/O fails 2929185029Spjd * and we want to give back the block we just allocated. 2930185029Spjd * This handles both normal blocks and gang blocks. 2931185029Spjd */ 2932168404Spjdstatic void 2933185029Spjdzio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2934168404Spjd{ 2935185029Spjd ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2936219089Spjd ASSERT(zio->io_bp_override == NULL); 2937185029Spjd 2938185029Spjd if (!BP_IS_HOLE(bp)) 2939219089Spjd metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2940185029Spjd 2941185029Spjd if (gn != NULL) { 2942185029Spjd for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2943185029Spjd zio_dva_unallocate(zio, gn->gn_child[g], 2944185029Spjd &gn->gn_gbh->zg_blkptr[g]); 2945185029Spjd } 2946185029Spjd } 2947168404Spjd} 2948168404Spjd 2949168404Spjd/* 2950185029Spjd * Try to allocate an intent log block. Return 0 on success, errno on failure. 2951185029Spjd */ 2952185029Spjdint 2953219089Spjdzio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2954320496Savg uint64_t size, boolean_t *slog) 2955185029Spjd{ 2956219089Spjd int error = 1; 2957185029Spjd 2958219089Spjd ASSERT(txg > spa_syncing_txg(spa)); 2959185029Spjd 2960320496Savg error = metaslab_alloc(spa, spa_log_class(spa), size, 2961320496Savg new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL); 2962320496Savg if (error == 0) { 2963320496Savg *slog = TRUE; 2964320496Savg } else { 2965219089Spjd error = metaslab_alloc(spa, spa_normal_class(spa), size, 2966307279Smav new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL); 2967320496Savg if (error == 0) 2968320496Savg *slog = FALSE; 2969230514Smm } 2970185029Spjd 2971185029Spjd if (error == 0) { 2972185029Spjd BP_SET_LSIZE(new_bp, size); 2973185029Spjd BP_SET_PSIZE(new_bp, size); 2974185029Spjd BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2975219089Spjd BP_SET_CHECKSUM(new_bp, 2976219089Spjd spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2977219089Spjd ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2978185029Spjd BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2979185029Spjd BP_SET_LEVEL(new_bp, 0); 2980219089Spjd BP_SET_DEDUP(new_bp, 0); 2981185029Spjd BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2982185029Spjd } 2983185029Spjd 2984185029Spjd return (error); 2985185029Spjd} 2986185029Spjd 2987185029Spjd/* 2988219089Spjd * Free an intent log block. 2989185029Spjd */ 2990185029Spjdvoid 2991219089Spjdzio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2992185029Spjd{ 2993219089Spjd ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2994185029Spjd ASSERT(!BP_IS_GANG(bp)); 2995185029Spjd 2996219089Spjd zio_free(spa, txg, bp); 2997185029Spjd} 2998185029Spjd 2999185029Spjd/* 3000168404Spjd * ========================================================================== 3001244187Ssmh * Read, write and delete to physical devices 3002168404Spjd * ========================================================================== 3003168404Spjd */ 3004297078Smav 3005297078Smav 3006297078Smav/* 3007297078Smav * Issue an I/O to the underlying vdev. Typically the issue pipeline 3008297078Smav * stops after this stage and will resume upon I/O completion. 3009297078Smav * However, there are instances where the vdev layer may need to 3010297078Smav * continue the pipeline when an I/O was not issued. Since the I/O 3011297078Smav * that was sent to the vdev layer might be different than the one 3012297078Smav * currently active in the pipeline (see vdev_queue_io()), we explicitly 3013297078Smav * force the underlying vdev layers to call either zio_execute() or 3014297078Smav * zio_interrupt() to ensure that the pipeline continues with the correct I/O. 3015297078Smav */ 3016185029Spjdstatic int 3017270312Ssmhzio_vdev_io_start(zio_t *zio) 3018168404Spjd{ 3019168404Spjd vdev_t *vd = zio->io_vd; 3020168404Spjd uint64_t align; 3021185029Spjd spa_t *spa = zio->io_spa; 3022270312Ssmh int ret; 3023168404Spjd 3024185029Spjd ASSERT(zio->io_error == 0); 3025185029Spjd ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 3026185029Spjd 3027168404Spjd if (vd == NULL) { 3028185029Spjd if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 3029185029Spjd spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 3030185029Spjd 3031185029Spjd /* 3032185029Spjd * The mirror_ops handle multiple DVAs in a single BP. 3033185029Spjd */ 3034297078Smav vdev_mirror_ops.vdev_op_io_start(zio); 3035297078Smav return (ZIO_PIPELINE_STOP); 3036168404Spjd } 3037168404Spjd 3038270312Ssmh if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && 3039270312Ssmh zio->io_priority == ZIO_PRIORITY_NOW) { 3040248574Ssmh trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); 3041240868Spjd return (ZIO_PIPELINE_CONTINUE); 3042240868Spjd } 3043240868Spjd 3044307279Smav ASSERT3P(zio->io_logical, !=, zio); 3045307279Smav 3046219089Spjd /* 3047219089Spjd * We keep track of time-sensitive I/Os so that the scan thread 3048219089Spjd * can quickly react to certain workloads. In particular, we care 3049219089Spjd * about non-scrubbing, top-level reads and writes with the following 3050219089Spjd * characteristics: 3051297078Smav * - synchronous writes of user data to non-slog devices 3052219089Spjd * - any reads of user data 3053219089Spjd * When these conditions are met, adjust the timestamp of spa_last_io 3054219089Spjd * which allows the scan thread to adjust its workload accordingly. 3055219089Spjd */ 3056219089Spjd if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 3057219089Spjd vd == vd->vdev_top && !vd->vdev_islog && 3058219089Spjd zio->io_bookmark.zb_objset != DMU_META_OBJSET && 3059219089Spjd zio->io_txg != spa_syncing_txg(spa)) { 3060219089Spjd uint64_t old = spa->spa_last_io; 3061219089Spjd uint64_t new = ddi_get_lbolt64(); 3062219089Spjd if (old != new) 3063219089Spjd (void) atomic_cas_64(&spa->spa_last_io, old, new); 3064219089Spjd } 3065219089Spjd 3066185029Spjd align = 1ULL << vd->vdev_top->vdev_ashift; 3067168404Spjd 3068297085Smav if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 3069269416Sdelphij P2PHASE(zio->io_size, align) != 0) { 3070269416Sdelphij /* Transform logical writes to be a full physical block size. */ 3071168404Spjd uint64_t asize = P2ROUNDUP(zio->io_size, align); 3072240868Spjd char *abuf = NULL; 3073240868Spjd if (zio->io_type == ZIO_TYPE_READ || 3074240868Spjd zio->io_type == ZIO_TYPE_WRITE) 3075240868Spjd abuf = zio_buf_alloc(asize); 3076185029Spjd ASSERT(vd == vd->vdev_top); 3077168404Spjd if (zio->io_type == ZIO_TYPE_WRITE) { 3078168404Spjd bcopy(zio->io_data, abuf, zio->io_size); 3079168404Spjd bzero(abuf + zio->io_size, asize - zio->io_size); 3080168404Spjd } 3081240868Spjd zio_push_transform(zio, abuf, asize, abuf ? asize : 0, 3082240868Spjd zio_subblock); 3083168404Spjd } 3084168404Spjd 3085269416Sdelphij /* 3086269416Sdelphij * If this is not a physical io, make sure that it is properly aligned 3087269416Sdelphij * before proceeding. 3088269416Sdelphij */ 3089269416Sdelphij if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { 3090269416Sdelphij ASSERT0(P2PHASE(zio->io_offset, align)); 3091269416Sdelphij ASSERT0(P2PHASE(zio->io_size, align)); 3092269416Sdelphij } else { 3093269416Sdelphij /* 3094300039Savg * For the physical io we allow alignment 3095300039Savg * to a logical block size. 3096269416Sdelphij */ 3097300039Savg uint64_t log_align = 3098300039Savg 1ULL << vd->vdev_top->vdev_logical_ashift; 3099300039Savg ASSERT0(P2PHASE(zio->io_offset, log_align)); 3100300039Savg ASSERT0(P2PHASE(zio->io_size, log_align)); 3101269416Sdelphij } 3102269416Sdelphij 3103240868Spjd VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); 3104168404Spjd 3105209962Smm /* 3106209962Smm * If this is a repair I/O, and there's no self-healing involved -- 3107209962Smm * that is, we're just resilvering what we expect to resilver -- 3108209962Smm * then don't do the I/O unless zio's txg is actually in vd's DTL. 3109209962Smm * This prevents spurious resilvering with nested replication. 3110209962Smm * For example, given a mirror of mirrors, (A+B)+(C+D), if only 3111209962Smm * A is out of date, we'll read from C+D, then use the data to 3112209962Smm * resilver A+B -- but we don't actually want to resilver B, just A. 3113209962Smm * The top-level mirror has no way to know this, so instead we just 3114209962Smm * discard unnecessary repairs as we work our way down the vdev tree. 3115209962Smm * The same logic applies to any form of nested replication: 3116209962Smm * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 3117209962Smm */ 3118209962Smm if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 3119209962Smm !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 3120209962Smm zio->io_txg != 0 && /* not a delegated i/o */ 3121209962Smm !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 3122209962Smm ASSERT(zio->io_type == ZIO_TYPE_WRITE); 3123209962Smm zio_vdev_io_bypass(zio); 3124209962Smm return (ZIO_PIPELINE_CONTINUE); 3125209962Smm } 3126209962Smm 3127270312Ssmh if (vd->vdev_ops->vdev_op_leaf) { 3128270312Ssmh switch (zio->io_type) { 3129270312Ssmh case ZIO_TYPE_READ: 3130270312Ssmh if (vdev_cache_read(zio)) 3131270312Ssmh return (ZIO_PIPELINE_CONTINUE); 3132270312Ssmh /* FALLTHROUGH */ 3133270312Ssmh case ZIO_TYPE_WRITE: 3134270312Ssmh case ZIO_TYPE_FREE: 3135270312Ssmh if ((zio = vdev_queue_io(zio)) == NULL) 3136270312Ssmh return (ZIO_PIPELINE_STOP); 3137168404Spjd 3138270312Ssmh if (!vdev_accessible(vd, zio)) { 3139270312Ssmh zio->io_error = SET_ERROR(ENXIO); 3140270312Ssmh zio_interrupt(zio); 3141270312Ssmh return (ZIO_PIPELINE_STOP); 3142270312Ssmh } 3143270312Ssmh break; 3144185029Spjd } 3145270312Ssmh /* 3146270312Ssmh * Note that we ignore repair writes for TRIM because they can 3147270312Ssmh * conflict with normal writes. This isn't an issue because, by 3148270312Ssmh * definition, we only repair blocks that aren't freed. 3149270312Ssmh */ 3150270312Ssmh if (zio->io_type == ZIO_TYPE_WRITE && 3151270312Ssmh !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && 3152270312Ssmh !trim_map_write_start(zio)) 3153240868Spjd return (ZIO_PIPELINE_STOP); 3154240868Spjd } 3155240868Spjd 3156297078Smav vd->vdev_ops->vdev_op_io_start(zio); 3157297078Smav return (ZIO_PIPELINE_STOP); 3158168404Spjd} 3159168404Spjd 3160185029Spjdstatic int 3161270312Ssmhzio_vdev_io_done(zio_t *zio) 3162168404Spjd{ 3163168404Spjd vdev_t *vd = zio->io_vd; 3164185029Spjd vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 3165185029Spjd boolean_t unexpected_error = B_FALSE; 3166168404Spjd 3167330238Savg if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { 3168185029Spjd return (ZIO_PIPELINE_STOP); 3169330238Savg } 3170168404Spjd 3171240868Spjd ASSERT(zio->io_type == ZIO_TYPE_READ || 3172240868Spjd zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); 3173185029Spjd 3174240868Spjd if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 3175270312Ssmh (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || 3176270312Ssmh zio->io_type == ZIO_TYPE_FREE)) { 3177240868Spjd 3178248573Ssmh if (zio->io_type == ZIO_TYPE_WRITE && 3179248573Ssmh !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 3180248573Ssmh trim_map_write_done(zio); 3181248573Ssmh 3182185029Spjd vdev_queue_io_done(zio); 3183185029Spjd 3184185029Spjd if (zio->io_type == ZIO_TYPE_WRITE) 3185185029Spjd vdev_cache_write(zio); 3186185029Spjd 3187185029Spjd if (zio_injection_enabled && zio->io_error == 0) 3188213198Smm zio->io_error = zio_handle_device_injection(vd, 3189213198Smm zio, EIO); 3190185029Spjd 3191185029Spjd if (zio_injection_enabled && zio->io_error == 0) 3192185029Spjd zio->io_error = zio_handle_label_injection(zio, EIO); 3193185029Spjd 3194185029Spjd if (zio->io_error) { 3195271683Ssmh if (zio->io_error == ENOTSUP && 3196271683Ssmh zio->io_type == ZIO_TYPE_FREE) { 3197271683Ssmh /* Not all devices support TRIM. */ 3198271683Ssmh } else if (!vdev_accessible(vd, zio)) { 3199249195Smm zio->io_error = SET_ERROR(ENXIO); 3200185029Spjd } else { 3201185029Spjd unexpected_error = B_TRUE; 3202185029Spjd } 3203185029Spjd } 3204185029Spjd } 3205185029Spjd 3206185029Spjd ops->vdev_op_io_done(zio); 3207185029Spjd 3208185029Spjd if (unexpected_error) 3209209962Smm VERIFY(vdev_probe(vd, zio) == NULL); 3210185029Spjd 3211185029Spjd return (ZIO_PIPELINE_CONTINUE); 3212168404Spjd} 3213168404Spjd 3214219089Spjd/* 3215219089Spjd * For non-raidz ZIOs, we can just copy aside the bad data read from the 3216219089Spjd * disk, and use that to finish the checksum ereport later. 3217219089Spjd */ 3218219089Spjdstatic void 3219219089Spjdzio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 3220219089Spjd const void *good_buf) 3221219089Spjd{ 3222219089Spjd /* no processing needed */ 3223219089Spjd zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 3224219089Spjd} 3225219089Spjd 3226219089Spjd/*ARGSUSED*/ 3227219089Spjdvoid 3228219089Spjdzio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 3229219089Spjd{ 3230219089Spjd void *buf = zio_buf_alloc(zio->io_size); 3231219089Spjd 3232219089Spjd bcopy(zio->io_data, buf, zio->io_size); 3233219089Spjd 3234219089Spjd zcr->zcr_cbinfo = zio->io_size; 3235219089Spjd zcr->zcr_cbdata = buf; 3236219089Spjd zcr->zcr_finish = zio_vsd_default_cksum_finish; 3237219089Spjd zcr->zcr_free = zio_buf_free; 3238219089Spjd} 3239219089Spjd 3240185029Spjdstatic int 3241270312Ssmhzio_vdev_io_assess(zio_t *zio) 3242168404Spjd{ 3243168404Spjd vdev_t *vd = zio->io_vd; 3244168404Spjd 3245330238Savg if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { 3246185029Spjd return (ZIO_PIPELINE_STOP); 3247330238Savg } 3248168404Spjd 3249185029Spjd if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 3250185029Spjd spa_config_exit(zio->io_spa, SCL_ZIO, zio); 3251185029Spjd 3252185029Spjd if (zio->io_vsd != NULL) { 3253219089Spjd zio->io_vsd_ops->vsd_free(zio); 3254185029Spjd zio->io_vsd = NULL; 3255168404Spjd } 3256168404Spjd 3257185029Spjd if (zio_injection_enabled && zio->io_error == 0) 3258168404Spjd zio->io_error = zio_handle_fault_injection(zio, EIO); 3259168404Spjd 3260270312Ssmh if (zio->io_type == ZIO_TYPE_FREE && 3261270312Ssmh zio->io_priority != ZIO_PRIORITY_NOW) { 3262240868Spjd switch (zio->io_error) { 3263240868Spjd case 0: 3264244155Ssmh ZIO_TRIM_STAT_INCR(bytes, zio->io_size); 3265244155Ssmh ZIO_TRIM_STAT_BUMP(success); 3266240868Spjd break; 3267240868Spjd case EOPNOTSUPP: 3268244155Ssmh ZIO_TRIM_STAT_BUMP(unsupported); 3269240868Spjd break; 3270240868Spjd default: 3271244155Ssmh ZIO_TRIM_STAT_BUMP(failed); 3272240868Spjd break; 3273240868Spjd } 3274270312Ssmh } 3275240868Spjd 3276168404Spjd /* 3277168404Spjd * If the I/O failed, determine whether we should attempt to retry it. 3278219089Spjd * 3279219089Spjd * On retry, we cut in line in the issue queue, since we don't want 3280219089Spjd * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 3281168404Spjd */ 3282185029Spjd if (zio->io_error && vd == NULL && 3283185029Spjd !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 3284185029Spjd ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 3285185029Spjd ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 3286168404Spjd zio->io_error = 0; 3287185029Spjd zio->io_flags |= ZIO_FLAG_IO_RETRY | 3288185029Spjd ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 3289219089Spjd zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 3290219089Spjd zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 3291219089Spjd zio_requeue_io_start_cut_in_line); 3292185029Spjd return (ZIO_PIPELINE_STOP); 3293185029Spjd } 3294168404Spjd 3295185029Spjd /* 3296185029Spjd * If we got an error on a leaf device, convert it to ENXIO 3297185029Spjd * if the device is not accessible at all. 3298185029Spjd */ 3299185029Spjd if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 3300185029Spjd !vdev_accessible(vd, zio)) 3301249195Smm zio->io_error = SET_ERROR(ENXIO); 3302168404Spjd 3303185029Spjd /* 3304185029Spjd * If we can't write to an interior vdev (mirror or RAID-Z), 3305185029Spjd * set vdev_cant_write so that we stop trying to allocate from it. 3306185029Spjd */ 3307185029Spjd if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 3308248571Smm vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 3309185029Spjd vd->vdev_cant_write = B_TRUE; 3310248571Smm } 3311168404Spjd 3312185029Spjd if (zio->io_error) 3313185029Spjd zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3314168404Spjd 3315260763Savg if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 3316260763Savg zio->io_physdone != NULL) { 3317260763Savg ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 3318260763Savg ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 3319260763Savg zio->io_physdone(zio->io_logical); 3320260763Savg } 3321260763Savg 3322185029Spjd return (ZIO_PIPELINE_CONTINUE); 3323168404Spjd} 3324168404Spjd 3325168404Spjdvoid 3326168404Spjdzio_vdev_io_reissue(zio_t *zio) 3327168404Spjd{ 3328168404Spjd ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 3329168404Spjd ASSERT(zio->io_error == 0); 3330168404Spjd 3331219089Spjd zio->io_stage >>= 1; 3332168404Spjd} 3333168404Spjd 3334168404Spjdvoid 3335168404Spjdzio_vdev_io_redone(zio_t *zio) 3336168404Spjd{ 3337168404Spjd ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 3338168404Spjd 3339219089Spjd zio->io_stage >>= 1; 3340168404Spjd} 3341168404Spjd 3342168404Spjdvoid 3343168404Spjdzio_vdev_io_bypass(zio_t *zio) 3344168404Spjd{ 3345168404Spjd ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 3346168404Spjd ASSERT(zio->io_error == 0); 3347168404Spjd 3348168404Spjd zio->io_flags |= ZIO_FLAG_IO_BYPASS; 3349219089Spjd zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 3350168404Spjd} 3351168404Spjd 3352168404Spjd/* 3353168404Spjd * ========================================================================== 3354168404Spjd * Generate and verify checksums 3355168404Spjd * ========================================================================== 3356168404Spjd */ 3357185029Spjdstatic int 3358270312Ssmhzio_checksum_generate(zio_t *zio) 3359168404Spjd{ 3360168404Spjd blkptr_t *bp = zio->io_bp; 3361185029Spjd enum zio_checksum checksum; 3362168404Spjd 3363185029Spjd if (bp == NULL) { 3364185029Spjd /* 3365185029Spjd * This is zio_write_phys(). 3366185029Spjd * We're either generating a label checksum, or none at all. 3367185029Spjd */ 3368185029Spjd checksum = zio->io_prop.zp_checksum; 3369168404Spjd 3370185029Spjd if (checksum == ZIO_CHECKSUM_OFF) 3371185029Spjd return (ZIO_PIPELINE_CONTINUE); 3372168404Spjd 3373185029Spjd ASSERT(checksum == ZIO_CHECKSUM_LABEL); 3374185029Spjd } else { 3375185029Spjd if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 3376185029Spjd ASSERT(!IO_IS_ALLOCATING(zio)); 3377185029Spjd checksum = ZIO_CHECKSUM_GANG_HEADER; 3378185029Spjd } else { 3379185029Spjd checksum = BP_GET_CHECKSUM(bp); 3380185029Spjd } 3381185029Spjd } 3382168404Spjd 3383185029Spjd zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 3384185029Spjd 3385185029Spjd return (ZIO_PIPELINE_CONTINUE); 3386168404Spjd} 3387168404Spjd 3388185029Spjdstatic int 3389270312Ssmhzio_checksum_verify(zio_t *zio) 3390168404Spjd{ 3391219089Spjd zio_bad_cksum_t info; 3392185029Spjd blkptr_t *bp = zio->io_bp; 3393185029Spjd int error; 3394168404Spjd 3395219089Spjd ASSERT(zio->io_vd != NULL); 3396219089Spjd 3397185029Spjd if (bp == NULL) { 3398185029Spjd /* 3399185029Spjd * This is zio_read_phys(). 3400185029Spjd * We're either verifying a label checksum, or nothing at all. 3401185029Spjd */ 3402185029Spjd if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 3403185029Spjd return (ZIO_PIPELINE_CONTINUE); 3404168404Spjd 3405185029Spjd ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 3406185029Spjd } 3407168404Spjd 3408219089Spjd if ((error = zio_checksum_error(zio, &info)) != 0) { 3409185029Spjd zio->io_error = error; 3410277575Sdelphij if (error == ECKSUM && 3411277575Sdelphij !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3412219089Spjd zfs_ereport_start_checksum(zio->io_spa, 3413219089Spjd zio->io_vd, zio, zio->io_offset, 3414219089Spjd zio->io_size, NULL, &info); 3415185029Spjd } 3416168404Spjd } 3417168404Spjd 3418185029Spjd return (ZIO_PIPELINE_CONTINUE); 3419168404Spjd} 3420168404Spjd 3421168404Spjd/* 3422168404Spjd * Called by RAID-Z to ensure we don't compute the checksum twice. 3423168404Spjd */ 3424168404Spjdvoid 3425168404Spjdzio_checksum_verified(zio_t *zio) 3426168404Spjd{ 3427219089Spjd zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 3428168404Spjd} 3429168404Spjd 3430168404Spjd/* 3431185029Spjd * ========================================================================== 3432185029Spjd * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 3433268649Sdelphij * An error of 0 indicates success. ENXIO indicates whole-device failure, 3434185029Spjd * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO 3435185029Spjd * indicate errors that are specific to one I/O, and most likely permanent. 3436185029Spjd * Any other error is presumed to be worse because we weren't expecting it. 3437185029Spjd * ========================================================================== 3438168404Spjd */ 3439185029Spjdint 3440185029Spjdzio_worst_error(int e1, int e2) 3441168404Spjd{ 3442185029Spjd static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 3443185029Spjd int r1, r2; 3444168404Spjd 3445185029Spjd for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 3446185029Spjd if (e1 == zio_error_rank[r1]) 3447185029Spjd break; 3448185029Spjd 3449185029Spjd for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 3450185029Spjd if (e2 == zio_error_rank[r2]) 3451185029Spjd break; 3452185029Spjd 3453185029Spjd return (r1 > r2 ? e1 : e2); 3454168404Spjd} 3455168404Spjd 3456168404Spjd/* 3457168404Spjd * ========================================================================== 3458185029Spjd * I/O completion 3459168404Spjd * ========================================================================== 3460168404Spjd */ 3461185029Spjdstatic int 3462270312Ssmhzio_ready(zio_t *zio) 3463168404Spjd{ 3464185029Spjd blkptr_t *bp = zio->io_bp; 3465209962Smm zio_t *pio, *pio_next; 3466307279Smav zio_link_t *zl = NULL; 3467168404Spjd 3468330238Savg if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, 3469330238Savg ZIO_WAIT_READY)) { 3470209962Smm return (ZIO_PIPELINE_STOP); 3471330238Savg } 3472209962Smm 3473185029Spjd if (zio->io_ready) { 3474185029Spjd ASSERT(IO_IS_ALLOCATING(zio)); 3475243524Smm ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 3476243524Smm (zio->io_flags & ZIO_FLAG_NOPWRITE)); 3477185029Spjd ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 3478168404Spjd 3479185029Spjd zio->io_ready(zio); 3480168404Spjd } 3481168404Spjd 3482185029Spjd if (bp != NULL && bp != &zio->io_bp_copy) 3483185029Spjd zio->io_bp_copy = *bp; 3484168404Spjd 3485307279Smav if (zio->io_error != 0) { 3486185029Spjd zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3487168404Spjd 3488307279Smav if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 3489307279Smav ASSERT(IO_IS_ALLOCATING(zio)); 3490307279Smav ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 3491307279Smav /* 3492307279Smav * We were unable to allocate anything, unreserve and 3493307279Smav * issue the next I/O to allocate. 3494307279Smav */ 3495307279Smav metaslab_class_throttle_unreserve( 3496307279Smav spa_normal_class(zio->io_spa), 3497307279Smav zio->io_prop.zp_copies, zio); 3498307279Smav zio_allocate_dispatch(zio->io_spa); 3499307279Smav } 3500307279Smav } 3501307279Smav 3502209962Smm mutex_enter(&zio->io_lock); 3503209962Smm zio->io_state[ZIO_WAIT_READY] = 1; 3504307279Smav pio = zio_walk_parents(zio, &zl); 3505209962Smm mutex_exit(&zio->io_lock); 3506209962Smm 3507209962Smm /* 3508209962Smm * As we notify zio's parents, new parents could be added. 3509209962Smm * New parents go to the head of zio's io_parent_list, however, 3510209962Smm * so we will (correctly) not notify them. The remainder of zio's 3511209962Smm * io_parent_list, from 'pio_next' onward, cannot change because 3512209962Smm * all parents must wait for us to be done before they can be done. 3513209962Smm */ 3514209962Smm for (; pio != NULL; pio = pio_next) { 3515307279Smav pio_next = zio_walk_parents(zio, &zl); 3516185029Spjd zio_notify_parent(pio, zio, ZIO_WAIT_READY); 3517209962Smm } 3518185029Spjd 3519219089Spjd if (zio->io_flags & ZIO_FLAG_NODATA) { 3520219089Spjd if (BP_IS_GANG(bp)) { 3521219089Spjd zio->io_flags &= ~ZIO_FLAG_NODATA; 3522219089Spjd } else { 3523219089Spjd ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 3524219089Spjd zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 3525219089Spjd } 3526219089Spjd } 3527219089Spjd 3528219089Spjd if (zio_injection_enabled && 3529219089Spjd zio->io_spa->spa_syncing_txg == zio->io_txg) 3530219089Spjd zio_handle_ignored_writes(zio); 3531219089Spjd 3532185029Spjd return (ZIO_PIPELINE_CONTINUE); 3533185029Spjd} 3534185029Spjd 3535307279Smav/* 3536307279Smav * Update the allocation throttle accounting. 3537307279Smav */ 3538307279Smavstatic void 3539307279Smavzio_dva_throttle_done(zio_t *zio) 3540307279Smav{ 3541307279Smav zio_t *lio = zio->io_logical; 3542307279Smav zio_t *pio = zio_unique_parent(zio); 3543307279Smav vdev_t *vd = zio->io_vd; 3544307279Smav int flags = METASLAB_ASYNC_ALLOC; 3545307279Smav 3546307279Smav ASSERT3P(zio->io_bp, !=, NULL); 3547307279Smav ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 3548307279Smav ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); 3549307279Smav ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); 3550307279Smav ASSERT(vd != NULL); 3551307279Smav ASSERT3P(vd, ==, vd->vdev_top); 3552307279Smav ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY))); 3553307279Smav ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); 3554307279Smav ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); 3555307279Smav ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); 3556307279Smav 3557307279Smav /* 3558307279Smav * Parents of gang children can have two flavors -- ones that 3559307279Smav * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) 3560307279Smav * and ones that allocated the constituent blocks. The allocation 3561307279Smav * throttle needs to know the allocating parent zio so we must find 3562307279Smav * it here. 3563307279Smav */ 3564307279Smav if (pio->io_child_type == ZIO_CHILD_GANG) { 3565307279Smav /* 3566307279Smav * If our parent is a rewrite gang child then our grandparent 3567307279Smav * would have been the one that performed the allocation. 3568307279Smav */ 3569307279Smav if (pio->io_flags & ZIO_FLAG_IO_REWRITE) 3570307279Smav pio = zio_unique_parent(pio); 3571307279Smav flags |= METASLAB_GANG_CHILD; 3572307279Smav } 3573307279Smav 3574307279Smav ASSERT(IO_IS_ALLOCATING(pio)); 3575307279Smav ASSERT3P(zio, !=, zio->io_logical); 3576307279Smav ASSERT(zio->io_logical != NULL); 3577307279Smav ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); 3578307279Smav ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); 3579307279Smav 3580307279Smav mutex_enter(&pio->io_lock); 3581307279Smav metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags); 3582307279Smav mutex_exit(&pio->io_lock); 3583307279Smav 3584307279Smav metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa), 3585307279Smav 1, pio); 3586307279Smav 3587307279Smav /* 3588307279Smav * Call into the pipeline to see if there is more work that 3589307279Smav * needs to be done. If there is work to be done it will be 3590307279Smav * dispatched to another taskq thread. 3591307279Smav */ 3592307279Smav zio_allocate_dispatch(zio->io_spa); 3593307279Smav} 3594307279Smav 3595185029Spjdstatic int 3596270312Ssmhzio_done(zio_t *zio) 3597185029Spjd{ 3598185029Spjd spa_t *spa = zio->io_spa; 3599185029Spjd zio_t *lio = zio->io_logical; 3600185029Spjd blkptr_t *bp = zio->io_bp; 3601185029Spjd vdev_t *vd = zio->io_vd; 3602185029Spjd uint64_t psize = zio->io_size; 3603209962Smm zio_t *pio, *pio_next; 3604307279Smav metaslab_class_t *mc = spa_normal_class(spa); 3605307279Smav zio_link_t *zl = NULL; 3606185029Spjd 3607168404Spjd /* 3608209962Smm * If our children haven't all completed, 3609185029Spjd * wait for them and then repeat this pipeline stage. 3610168404Spjd */ 3611330238Savg if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { 3612185029Spjd return (ZIO_PIPELINE_STOP); 3613330238Savg } 3614185029Spjd 3615307279Smav /* 3616307279Smav * If the allocation throttle is enabled, then update the accounting. 3617307279Smav * We only track child I/Os that are part of an allocating async 3618307279Smav * write. We must do this since the allocation is performed 3619307279Smav * by the logical I/O but the actual write is done by child I/Os. 3620307279Smav */ 3621307279Smav if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && 3622307279Smav zio->io_child_type == ZIO_CHILD_VDEV) { 3623307279Smav ASSERT(mc->mc_alloc_throttle_enabled); 3624307279Smav zio_dva_throttle_done(zio); 3625307279Smav } 3626307279Smav 3627307279Smav /* 3628307279Smav * If the allocation throttle is enabled, verify that 3629307279Smav * we have decremented the refcounts for every I/O that was throttled. 3630307279Smav */ 3631307279Smav if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 3632307279Smav ASSERT(zio->io_type == ZIO_TYPE_WRITE); 3633307279Smav ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 3634307279Smav ASSERT(bp != NULL); 3635307279Smav metaslab_group_alloc_verify(spa, zio->io_bp, zio); 3636307279Smav VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio)); 3637307279Smav } 3638307279Smav 3639185029Spjd for (int c = 0; c < ZIO_CHILD_TYPES; c++) 3640185029Spjd for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3641185029Spjd ASSERT(zio->io_children[c][w] == 0); 3642185029Spjd 3643268649Sdelphij if (bp != NULL && !BP_IS_EMBEDDED(bp)) { 3644185029Spjd ASSERT(bp->blk_pad[0] == 0); 3645185029Spjd ASSERT(bp->blk_pad[1] == 0); 3646185029Spjd ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 3647209962Smm (bp == zio_unique_parent(zio)->io_bp)); 3648185029Spjd if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 3649219089Spjd zio->io_bp_override == NULL && 3650185029Spjd !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 3651185029Spjd ASSERT(!BP_SHOULD_BYTESWAP(bp)); 3652219089Spjd ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 3653185029Spjd ASSERT(BP_COUNT_GANG(bp) == 0 || 3654185029Spjd (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 3655185029Spjd } 3656243524Smm if (zio->io_flags & ZIO_FLAG_NOPWRITE) 3657243524Smm VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 3658168404Spjd } 3659168404Spjd 3660185029Spjd /* 3661219089Spjd * If there were child vdev/gang/ddt errors, they apply to us now. 3662185029Spjd */ 3663185029Spjd zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 3664185029Spjd zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 3665219089Spjd zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 3666168404Spjd 3667219089Spjd /* 3668219089Spjd * If the I/O on the transformed data was successful, generate any 3669219089Spjd * checksum reports now while we still have the transformed data. 3670219089Spjd */ 3671219089Spjd if (zio->io_error == 0) { 3672219089Spjd while (zio->io_cksum_report != NULL) { 3673219089Spjd zio_cksum_report_t *zcr = zio->io_cksum_report; 3674219089Spjd uint64_t align = zcr->zcr_align; 3675219089Spjd uint64_t asize = P2ROUNDUP(psize, align); 3676219089Spjd char *abuf = zio->io_data; 3677219089Spjd 3678219089Spjd if (asize != psize) { 3679219089Spjd abuf = zio_buf_alloc(asize); 3680219089Spjd bcopy(zio->io_data, abuf, psize); 3681219089Spjd bzero(abuf + psize, asize - psize); 3682219089Spjd } 3683219089Spjd 3684219089Spjd zio->io_cksum_report = zcr->zcr_next; 3685219089Spjd zcr->zcr_next = NULL; 3686219089Spjd zcr->zcr_finish(zcr, abuf); 3687219089Spjd zfs_ereport_free_checksum(zcr); 3688219089Spjd 3689219089Spjd if (asize != psize) 3690219089Spjd zio_buf_free(abuf, asize); 3691219089Spjd } 3692219089Spjd } 3693219089Spjd 3694185029Spjd zio_pop_transforms(zio); /* note: may set zio->io_error */ 3695168404Spjd 3696185029Spjd vdev_stat_update(zio, psize); 3697185029Spjd 3698168404Spjd if (zio->io_error) { 3699185029Spjd /* 3700185029Spjd * If this I/O is attached to a particular vdev, 3701185029Spjd * generate an error message describing the I/O failure 3702185029Spjd * at the block level. We ignore these errors if the 3703185029Spjd * device is currently unavailable. 3704185029Spjd */ 3705185029Spjd if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3706185029Spjd zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3707185029Spjd 3708219089Spjd if ((zio->io_error == EIO || !(zio->io_flags & 3709219089Spjd (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3710219089Spjd zio == lio) { 3711185029Spjd /* 3712185029Spjd * For logical I/O requests, tell the SPA to log the 3713185029Spjd * error and generate a logical data ereport. 3714185029Spjd */ 3715185029Spjd spa_log_error(spa, zio); 3716185029Spjd zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3717185029Spjd 0, 0); 3718185029Spjd } 3719168404Spjd } 3720168404Spjd 3721185029Spjd if (zio->io_error && zio == lio) { 3722185029Spjd /* 3723185029Spjd * Determine whether zio should be reexecuted. This will 3724185029Spjd * propagate all the way to the root via zio_notify_parent(). 3725185029Spjd */ 3726185029Spjd ASSERT(vd == NULL && bp != NULL); 3727219089Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3728168404Spjd 3729219089Spjd if (IO_IS_ALLOCATING(zio) && 3730219089Spjd !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3731185029Spjd if (zio->io_error != ENOSPC) 3732185029Spjd zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3733185029Spjd else 3734185029Spjd zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3735219089Spjd } 3736168404Spjd 3737185029Spjd if ((zio->io_type == ZIO_TYPE_READ || 3738185029Spjd zio->io_type == ZIO_TYPE_FREE) && 3739219089Spjd !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3740185029Spjd zio->io_error == ENXIO && 3741219089Spjd spa_load_state(spa) == SPA_LOAD_NONE && 3742185029Spjd spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3743185029Spjd zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3744185029Spjd 3745185029Spjd if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3746185029Spjd zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3747219089Spjd 3748219089Spjd /* 3749219089Spjd * Here is a possibly good place to attempt to do 3750219089Spjd * either combinatorial reconstruction or error correction 3751219089Spjd * based on checksums. It also might be a good place 3752219089Spjd * to send out preliminary ereports before we suspend 3753219089Spjd * processing. 3754219089Spjd */ 3755185029Spjd } 3756185029Spjd 3757168404Spjd /* 3758185029Spjd * If there were logical child errors, they apply to us now. 3759185029Spjd * We defer this until now to avoid conflating logical child 3760185029Spjd * errors with errors that happened to the zio itself when 3761185029Spjd * updating vdev stats and reporting FMA events above. 3762168404Spjd */ 3763185029Spjd zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3764185029Spjd 3765219089Spjd if ((zio->io_error || zio->io_reexecute) && 3766219089Spjd IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3767243524Smm !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3768209962Smm zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3769209962Smm 3770209962Smm zio_gang_tree_free(&zio->io_gang_tree); 3771209962Smm 3772209962Smm /* 3773209962Smm * Godfather I/Os should never suspend. 3774209962Smm */ 3775209962Smm if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3776209962Smm (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3777209962Smm zio->io_reexecute = 0; 3778209962Smm 3779185029Spjd if (zio->io_reexecute) { 3780185029Spjd /* 3781185029Spjd * This is a logical I/O that wants to reexecute. 3782185029Spjd * 3783185029Spjd * Reexecute is top-down. When an i/o fails, if it's not 3784185029Spjd * the root, it simply notifies its parent and sticks around. 3785185029Spjd * The parent, seeing that it still has children in zio_done(), 3786185029Spjd * does the same. This percolates all the way up to the root. 3787185029Spjd * The root i/o will reexecute or suspend the entire tree. 3788185029Spjd * 3789185029Spjd * This approach ensures that zio_reexecute() honors 3790185029Spjd * all the original i/o dependency relationships, e.g. 3791185029Spjd * parents not executing until children are ready. 3792185029Spjd */ 3793185029Spjd ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3794185029Spjd 3795209962Smm zio->io_gang_leader = NULL; 3796185029Spjd 3797209962Smm mutex_enter(&zio->io_lock); 3798209962Smm zio->io_state[ZIO_WAIT_DONE] = 1; 3799209962Smm mutex_exit(&zio->io_lock); 3800185029Spjd 3801209962Smm /* 3802209962Smm * "The Godfather" I/O monitors its children but is 3803209962Smm * not a true parent to them. It will track them through 3804209962Smm * the pipeline but severs its ties whenever they get into 3805209962Smm * trouble (e.g. suspended). This allows "The Godfather" 3806209962Smm * I/O to return status without blocking. 3807209962Smm */ 3808307279Smav zl = NULL; 3809307279Smav for (pio = zio_walk_parents(zio, &zl); pio != NULL; 3810307279Smav pio = pio_next) { 3811307279Smav zio_link_t *remove_zl = zl; 3812307279Smav pio_next = zio_walk_parents(zio, &zl); 3813209962Smm 3814209962Smm if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3815209962Smm (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3816307279Smav zio_remove_child(pio, zio, remove_zl); 3817209962Smm zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3818209962Smm } 3819209962Smm } 3820209962Smm 3821209962Smm if ((pio = zio_unique_parent(zio)) != NULL) { 3822185029Spjd /* 3823185029Spjd * We're not a root i/o, so there's nothing to do 3824185029Spjd * but notify our parent. Don't propagate errors 3825185029Spjd * upward since we haven't permanently failed yet. 3826185029Spjd */ 3827209962Smm ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3828185029Spjd zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3829185029Spjd zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3830185029Spjd } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3831185029Spjd /* 3832185029Spjd * We'd fail again if we reexecuted now, so suspend 3833185029Spjd * until conditions improve (e.g. device comes online). 3834185029Spjd */ 3835185029Spjd zio_suspend(spa, zio); 3836185029Spjd } else { 3837185029Spjd /* 3838185029Spjd * Reexecution is potentially a huge amount of work. 3839185029Spjd * Hand it off to the otherwise-unused claim taskq. 3840185029Spjd */ 3841260742Savg#if defined(illumos) || !defined(_KERNEL) 3842260742Savg ASSERT(zio->io_tqent.tqent_next == NULL); 3843216919Smm#else 3844260742Savg ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); 3845260742Savg#endif 3846260750Savg spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3847260750Savg ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3848260750Savg 0, &zio->io_tqent); 3849185029Spjd } 3850185029Spjd return (ZIO_PIPELINE_STOP); 3851168404Spjd } 3852168404Spjd 3853219089Spjd ASSERT(zio->io_child_count == 0); 3854185029Spjd ASSERT(zio->io_reexecute == 0); 3855185029Spjd ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3856168404Spjd 3857209962Smm /* 3858219089Spjd * Report any checksum errors, since the I/O is complete. 3859219089Spjd */ 3860219089Spjd while (zio->io_cksum_report != NULL) { 3861219089Spjd zio_cksum_report_t *zcr = zio->io_cksum_report; 3862219089Spjd zio->io_cksum_report = zcr->zcr_next; 3863219089Spjd zcr->zcr_next = NULL; 3864219089Spjd zcr->zcr_finish(zcr, NULL); 3865219089Spjd zfs_ereport_free_checksum(zcr); 3866219089Spjd } 3867219089Spjd 3868219089Spjd /* 3869209962Smm * It is the responsibility of the done callback to ensure that this 3870209962Smm * particular zio is no longer discoverable for adoption, and as 3871209962Smm * such, cannot acquire any new parents. 3872209962Smm */ 3873185029Spjd if (zio->io_done) 3874185029Spjd zio->io_done(zio); 3875168404Spjd 3876209962Smm mutex_enter(&zio->io_lock); 3877209962Smm zio->io_state[ZIO_WAIT_DONE] = 1; 3878209962Smm mutex_exit(&zio->io_lock); 3879168404Spjd 3880307279Smav zl = NULL; 3881307279Smav for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { 3882307279Smav zio_link_t *remove_zl = zl; 3883307279Smav pio_next = zio_walk_parents(zio, &zl); 3884307279Smav zio_remove_child(pio, zio, remove_zl); 3885185029Spjd zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3886168404Spjd } 3887168404Spjd 3888185029Spjd if (zio->io_waiter != NULL) { 3889185029Spjd mutex_enter(&zio->io_lock); 3890185029Spjd zio->io_executor = NULL; 3891185029Spjd cv_broadcast(&zio->io_cv); 3892185029Spjd mutex_exit(&zio->io_lock); 3893185029Spjd } else { 3894185029Spjd zio_destroy(zio); 3895168404Spjd } 3896168404Spjd 3897185029Spjd return (ZIO_PIPELINE_STOP); 3898168404Spjd} 3899168404Spjd 3900168404Spjd/* 3901185029Spjd * ========================================================================== 3902185029Spjd * I/O pipeline definition 3903185029Spjd * ========================================================================== 3904168404Spjd */ 3905219089Spjdstatic zio_pipe_stage_t *zio_pipeline[] = { 3906185029Spjd NULL, 3907219089Spjd zio_read_bp_init, 3908307279Smav zio_write_bp_init, 3909219089Spjd zio_free_bp_init, 3910185029Spjd zio_issue_async, 3911307279Smav zio_write_compress, 3912185029Spjd zio_checksum_generate, 3913243524Smm zio_nop_write, 3914219089Spjd zio_ddt_read_start, 3915219089Spjd zio_ddt_read_done, 3916219089Spjd zio_ddt_write, 3917219089Spjd zio_ddt_free, 3918185029Spjd zio_gang_assemble, 3919185029Spjd zio_gang_issue, 3920307279Smav zio_dva_throttle, 3921185029Spjd zio_dva_allocate, 3922185029Spjd zio_dva_free, 3923185029Spjd zio_dva_claim, 3924185029Spjd zio_ready, 3925185029Spjd zio_vdev_io_start, 3926185029Spjd zio_vdev_io_done, 3927185029Spjd zio_vdev_io_assess, 3928185029Spjd zio_checksum_verify, 3929185029Spjd zio_done 3930185029Spjd}; 3931236884Smm 3932236884Smm 3933236884Smm 3934236884Smm 3935288571Smav/* 3936288571Smav * Compare two zbookmark_phys_t's to see which we would reach first in a 3937288571Smav * pre-order traversal of the object tree. 3938288571Smav * 3939288571Smav * This is simple in every case aside from the meta-dnode object. For all other 3940288571Smav * objects, we traverse them in order (object 1 before object 2, and so on). 3941288571Smav * However, all of these objects are traversed while traversing object 0, since 3942288571Smav * the data it points to is the list of objects. Thus, we need to convert to a 3943288571Smav * canonical representation so we can compare meta-dnode bookmarks to 3944288571Smav * non-meta-dnode bookmarks. 3945288571Smav * 3946288571Smav * We do this by calculating "equivalents" for each field of the zbookmark. 3947288571Smav * zbookmarks outside of the meta-dnode use their own object and level, and 3948288571Smav * calculate the level 0 equivalent (the first L0 blkid that is contained in the 3949288571Smav * blocks this bookmark refers to) by multiplying their blkid by their span 3950288571Smav * (the number of L0 blocks contained within one block at their level). 3951288571Smav * zbookmarks inside the meta-dnode calculate their object equivalent 3952288571Smav * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use 3953288571Smav * level + 1<<31 (any value larger than a level could ever be) for their level. 3954288571Smav * This causes them to always compare before a bookmark in their object 3955288571Smav * equivalent, compare appropriately to bookmarks in other objects, and to 3956288571Smav * compare appropriately to other bookmarks in the meta-dnode. 3957288571Smav */ 3958288571Smavint 3959288571Smavzbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, 3960288571Smav const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) 3961288571Smav{ 3962288571Smav /* 3963288571Smav * These variables represent the "equivalent" values for the zbookmark, 3964288571Smav * after converting zbookmarks inside the meta dnode to their 3965288571Smav * normal-object equivalents. 3966288571Smav */ 3967288571Smav uint64_t zb1obj, zb2obj; 3968288571Smav uint64_t zb1L0, zb2L0; 3969288571Smav uint64_t zb1level, zb2level; 3970236884Smm 3971288571Smav if (zb1->zb_object == zb2->zb_object && 3972288571Smav zb1->zb_level == zb2->zb_level && 3973288571Smav zb1->zb_blkid == zb2->zb_blkid) 3974288571Smav return (0); 3975236884Smm 3976288571Smav /* 3977288571Smav * BP_SPANB calculates the span in blocks. 3978288571Smav */ 3979288571Smav zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); 3980288571Smav zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); 3981288571Smav 3982236884Smm if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3983288571Smav zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); 3984288571Smav zb1L0 = 0; 3985288571Smav zb1level = zb1->zb_level + COMPARE_META_LEVEL; 3986288571Smav } else { 3987288571Smav zb1obj = zb1->zb_object; 3988288571Smav zb1level = zb1->zb_level; 3989236884Smm } 3990236884Smm 3991288571Smav if (zb2->zb_object == DMU_META_DNODE_OBJECT) { 3992288571Smav zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); 3993288571Smav zb2L0 = 0; 3994288571Smav zb2level = zb2->zb_level + COMPARE_META_LEVEL; 3995288571Smav } else { 3996288571Smav zb2obj = zb2->zb_object; 3997288571Smav zb2level = zb2->zb_level; 3998288571Smav } 3999288571Smav 4000288571Smav /* Now that we have a canonical representation, do the comparison. */ 4001288571Smav if (zb1obj != zb2obj) 4002288571Smav return (zb1obj < zb2obj ? -1 : 1); 4003288571Smav else if (zb1L0 != zb2L0) 4004288571Smav return (zb1L0 < zb2L0 ? -1 : 1); 4005288571Smav else if (zb1level != zb2level) 4006288571Smav return (zb1level > zb2level ? -1 : 1); 4007288571Smav /* 4008288571Smav * This can (theoretically) happen if the bookmarks have the same object 4009288571Smav * and level, but different blkids, if the block sizes are not the same. 4010288571Smav * There is presently no way to change the indirect block sizes 4011288571Smav */ 4012288571Smav return (0); 4013288571Smav} 4014288571Smav 4015288571Smav/* 4016288571Smav * This function checks the following: given that last_block is the place that 4017288571Smav * our traversal stopped last time, does that guarantee that we've visited 4018288571Smav * every node under subtree_root? Therefore, we can't just use the raw output 4019288571Smav * of zbookmark_compare. We have to pass in a modified version of 4020288571Smav * subtree_root; by incrementing the block id, and then checking whether 4021288571Smav * last_block is before or equal to that, we can tell whether or not having 4022288571Smav * visited last_block implies that all of subtree_root's children have been 4023288571Smav * visited. 4024288571Smav */ 4025288571Smavboolean_t 4026288571Smavzbookmark_subtree_completed(const dnode_phys_t *dnp, 4027288571Smav const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) 4028288571Smav{ 4029288571Smav zbookmark_phys_t mod_zb = *subtree_root; 4030288571Smav mod_zb.zb_blkid++; 4031288571Smav ASSERT(last_block->zb_level == 0); 4032288571Smav 4033288571Smav /* The objset_phys_t isn't before anything. */ 4034288571Smav if (dnp == NULL) 4035236884Smm return (B_FALSE); 4036288571Smav 4037288571Smav /* 4038288571Smav * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the 4039288571Smav * data block size in sectors, because that variable is only used if 4040288571Smav * the bookmark refers to a block in the meta-dnode. Since we don't 4041288571Smav * know without examining it what object it refers to, and there's no 4042288571Smav * harm in passing in this value in other cases, we always pass it in. 4043288571Smav * 4044288571Smav * We pass in 0 for the indirect block size shift because zb2 must be 4045288571Smav * level 0. The indirect block size is only used to calculate the span 4046288571Smav * of the bookmark, but since the bookmark must be level 0, the span is 4047288571Smav * always 1, so the math works out. 4048288571Smav * 4049288571Smav * If you make changes to how the zbookmark_compare code works, be sure 4050288571Smav * to make sure that this code still works afterwards. 4051288571Smav */ 4052288571Smav return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 4053288571Smav 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, 4054288571Smav last_block) <= 0); 4055236884Smm} 4056