1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23268657Sdelphij * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 24168404Spjd */ 25251478Sdelphij/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ 26255750Sdelphij/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */ 27268658Sdelphij/* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. */ 28251478Sdelphij 29168404Spjd#include <sys/dmu.h> 30168404Spjd#include <sys/dmu_impl.h> 31168404Spjd#include <sys/dmu_tx.h> 32168404Spjd#include <sys/dbuf.h> 33168404Spjd#include <sys/dnode.h> 34168404Spjd#include <sys/zfs_context.h> 35168404Spjd#include <sys/dmu_objset.h> 36168404Spjd#include <sys/dmu_traverse.h> 37168404Spjd#include <sys/dsl_dataset.h> 38168404Spjd#include <sys/dsl_dir.h> 39168404Spjd#include <sys/dsl_pool.h> 40168404Spjd#include <sys/dsl_synctask.h> 41168404Spjd#include <sys/dsl_prop.h> 42168404Spjd#include <sys/dmu_zfetch.h> 43168404Spjd#include <sys/zfs_ioctl.h> 44168404Spjd#include <sys/zap.h> 45168404Spjd#include <sys/zio_checksum.h> 46243524Smm#include <sys/zio_compress.h> 47219089Spjd#include <sys/sa.h> 48268658Sdelphij#include <sys/zfeature.h> 49219089Spjd#ifdef _KERNEL 50260786Savg#include <sys/vm.h> 51185029Spjd#include <sys/zfs_znode.h> 52219089Spjd#endif 53168404Spjd 54243524Smm/* 55243524Smm * Enable/disable nopwrite feature. 56243524Smm */ 57243524Smmint zfs_nopwrite_enabled = 1; 58243525SmmSYSCTL_DECL(_vfs_zfs); 59243525SmmTUNABLE_INT("vfs.zfs.nopwrite_enabled", &zfs_nopwrite_enabled); 60243525SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN, 61243525Smm &zfs_nopwrite_enabled, 0, "Enable nopwrite feature"); 62243524Smm 63168404Spjdconst dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { 64236884Smm { DMU_BSWAP_UINT8, TRUE, "unallocated" }, 65236884Smm { DMU_BSWAP_ZAP, TRUE, "object directory" }, 66236884Smm { DMU_BSWAP_UINT64, TRUE, "object array" }, 67236884Smm { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, 68236884Smm { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, 69236884Smm { DMU_BSWAP_UINT64, TRUE, "bpobj" }, 70236884Smm { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, 71236884Smm { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, 72236884Smm { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, 73236884Smm { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, 74236884Smm { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, 75236884Smm { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, 76236884Smm { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, 77236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, 78236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, 79236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL props" }, 80236884Smm { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, 81236884Smm { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" }, 82236884Smm { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, 83236884Smm { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, 84236884Smm { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, 85236884Smm { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, 86236884Smm { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, 87236884Smm { DMU_BSWAP_UINT8, FALSE, "zvol object" }, 88236884Smm { DMU_BSWAP_ZAP, TRUE, "zvol prop" }, 89236884Smm { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, 90236884Smm { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, 91236884Smm { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, 92236884Smm { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, 93236884Smm { DMU_BSWAP_UINT8, TRUE, "SPA history" }, 94236884Smm { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, 95236884Smm { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, 96236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, 97236884Smm { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, 98236884Smm { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, 99236884Smm { DMU_BSWAP_UINT8, TRUE, "FUID table" }, 100236884Smm { DMU_BSWAP_UINT64, TRUE, "FUID table size" }, 101236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, 102236884Smm { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, 103236884Smm { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, 104236884Smm { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, 105236884Smm { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, 106236884Smm { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, 107236884Smm { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, 108236884Smm { DMU_BSWAP_UINT8, TRUE, "System attributes" }, 109236884Smm { DMU_BSWAP_ZAP, TRUE, "SA master node" }, 110236884Smm { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, 111236884Smm { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, 112236884Smm { DMU_BSWAP_ZAP, TRUE, "scan translations" }, 113236884Smm { DMU_BSWAP_UINT8, FALSE, "deduplicated block" }, 114236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, 115236884Smm { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, 116236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, 117236884Smm { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } 118168404Spjd}; 119168404Spjd 120236884Smmconst dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { 121236884Smm { byteswap_uint8_array, "uint8" }, 122236884Smm { byteswap_uint16_array, "uint16" }, 123236884Smm { byteswap_uint32_array, "uint32" }, 124236884Smm { byteswap_uint64_array, "uint64" }, 125236884Smm { zap_byteswap, "zap" }, 126236884Smm { dnode_buf_byteswap, "dnode" }, 127236884Smm { dmu_objset_byteswap, "objset" }, 128236884Smm { zfs_znode_byteswap, "znode" }, 129236884Smm { zfs_oldacl_byteswap, "oldacl" }, 130236884Smm { zfs_acl_byteswap, "acl" } 131236884Smm}; 132236884Smm 133168404Spjdint 134268649Sdelphijdmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, 135268649Sdelphij void *tag, dmu_buf_t **dbp) 136168404Spjd{ 137168404Spjd dnode_t *dn; 138168404Spjd uint64_t blkid; 139168404Spjd dmu_buf_impl_t *db; 140168404Spjd int err; 141168404Spjd 142219089Spjd err = dnode_hold(os, object, FTAG, &dn); 143168404Spjd if (err) 144168404Spjd return (err); 145168404Spjd blkid = dbuf_whichblock(dn, offset); 146168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 147168404Spjd db = dbuf_hold(dn, blkid, tag); 148168404Spjd rw_exit(&dn->dn_struct_rwlock); 149268649Sdelphij dnode_rele(dn, FTAG); 150268649Sdelphij 151168404Spjd if (db == NULL) { 152268649Sdelphij *dbp = NULL; 153268649Sdelphij return (SET_ERROR(EIO)); 154268649Sdelphij } 155268649Sdelphij 156268649Sdelphij *dbp = &db->db; 157268649Sdelphij return (err); 158268649Sdelphij} 159268649Sdelphij 160268649Sdelphijint 161268649Sdelphijdmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, 162268649Sdelphij void *tag, dmu_buf_t **dbp, int flags) 163268649Sdelphij{ 164268649Sdelphij int err; 165268649Sdelphij int db_flags = DB_RF_CANFAIL; 166268649Sdelphij 167268649Sdelphij if (flags & DMU_READ_NO_PREFETCH) 168268649Sdelphij db_flags |= DB_RF_NOPREFETCH; 169268649Sdelphij 170268649Sdelphij err = dmu_buf_hold_noread(os, object, offset, tag, dbp); 171268649Sdelphij if (err == 0) { 172268649Sdelphij dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); 173219089Spjd err = dbuf_read(db, NULL, db_flags); 174268649Sdelphij if (err != 0) { 175168404Spjd dbuf_rele(db, tag); 176268649Sdelphij *dbp = NULL; 177168404Spjd } 178168404Spjd } 179168404Spjd 180168404Spjd return (err); 181168404Spjd} 182168404Spjd 183168404Spjdint 184168404Spjddmu_bonus_max(void) 185168404Spjd{ 186168404Spjd return (DN_MAX_BONUSLEN); 187168404Spjd} 188168404Spjd 189185029Spjdint 190219089Spjddmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) 191185029Spjd{ 192219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 193219089Spjd dnode_t *dn; 194219089Spjd int error; 195185029Spjd 196219089Spjd DB_DNODE_ENTER(db); 197219089Spjd dn = DB_DNODE(db); 198219089Spjd 199219089Spjd if (dn->dn_bonus != db) { 200249195Smm error = SET_ERROR(EINVAL); 201219089Spjd } else if (newsize < 0 || newsize > db_fake->db_size) { 202249195Smm error = SET_ERROR(EINVAL); 203219089Spjd } else { 204219089Spjd dnode_setbonuslen(dn, newsize, tx); 205219089Spjd error = 0; 206219089Spjd } 207219089Spjd 208219089Spjd DB_DNODE_EXIT(db); 209219089Spjd return (error); 210185029Spjd} 211185029Spjd 212219089Spjdint 213219089Spjddmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) 214219089Spjd{ 215219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 216219089Spjd dnode_t *dn; 217219089Spjd int error; 218219089Spjd 219219089Spjd DB_DNODE_ENTER(db); 220219089Spjd dn = DB_DNODE(db); 221219089Spjd 222236884Smm if (!DMU_OT_IS_VALID(type)) { 223249195Smm error = SET_ERROR(EINVAL); 224219089Spjd } else if (dn->dn_bonus != db) { 225249195Smm error = SET_ERROR(EINVAL); 226219089Spjd } else { 227219089Spjd dnode_setbonus_type(dn, type, tx); 228219089Spjd error = 0; 229219089Spjd } 230219089Spjd 231219089Spjd DB_DNODE_EXIT(db); 232219089Spjd return (error); 233219089Spjd} 234219089Spjd 235219089Spjddmu_object_type_t 236219089Spjddmu_get_bonustype(dmu_buf_t *db_fake) 237219089Spjd{ 238219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 239219089Spjd dnode_t *dn; 240219089Spjd dmu_object_type_t type; 241219089Spjd 242219089Spjd DB_DNODE_ENTER(db); 243219089Spjd dn = DB_DNODE(db); 244219089Spjd type = dn->dn_bonustype; 245219089Spjd DB_DNODE_EXIT(db); 246219089Spjd 247219089Spjd return (type); 248219089Spjd} 249219089Spjd 250219089Spjdint 251219089Spjddmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) 252219089Spjd{ 253219089Spjd dnode_t *dn; 254219089Spjd int error; 255219089Spjd 256219089Spjd error = dnode_hold(os, object, FTAG, &dn); 257219089Spjd dbuf_rm_spill(dn, tx); 258219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 259219089Spjd dnode_rm_spill(dn, tx); 260219089Spjd rw_exit(&dn->dn_struct_rwlock); 261219089Spjd dnode_rele(dn, FTAG); 262219089Spjd return (error); 263219089Spjd} 264219089Spjd 265168404Spjd/* 266168404Spjd * returns ENOENT, EIO, or 0. 267168404Spjd */ 268168404Spjdint 269168404Spjddmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) 270168404Spjd{ 271168404Spjd dnode_t *dn; 272168404Spjd dmu_buf_impl_t *db; 273185029Spjd int error; 274168404Spjd 275219089Spjd error = dnode_hold(os, object, FTAG, &dn); 276185029Spjd if (error) 277185029Spjd return (error); 278168404Spjd 279168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 280168404Spjd if (dn->dn_bonus == NULL) { 281168404Spjd rw_exit(&dn->dn_struct_rwlock); 282168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 283168404Spjd if (dn->dn_bonus == NULL) 284185029Spjd dbuf_create_bonus(dn); 285168404Spjd } 286168404Spjd db = dn->dn_bonus; 287185029Spjd 288185029Spjd /* as long as the bonus buf is held, the dnode will be held */ 289219089Spjd if (refcount_add(&db->db_holds, tag) == 1) { 290185029Spjd VERIFY(dnode_add_ref(dn, db)); 291271002Sdelphij atomic_inc_32(&dn->dn_dbufs_count); 292219089Spjd } 293185029Spjd 294219089Spjd /* 295219089Spjd * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's 296219089Spjd * hold and incrementing the dbuf count to ensure that dnode_move() sees 297219089Spjd * a dnode hold for every dbuf. 298219089Spjd */ 299219089Spjd rw_exit(&dn->dn_struct_rwlock); 300219089Spjd 301168404Spjd dnode_rele(dn, FTAG); 302168404Spjd 303219089Spjd VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); 304168404Spjd 305168404Spjd *dbp = &db->db; 306168404Spjd return (0); 307168404Spjd} 308168404Spjd 309168404Spjd/* 310219089Spjd * returns ENOENT, EIO, or 0. 311219089Spjd * 312219089Spjd * This interface will allocate a blank spill dbuf when a spill blk 313219089Spjd * doesn't already exist on the dnode. 314219089Spjd * 315219089Spjd * if you only want to find an already existing spill db, then 316219089Spjd * dmu_spill_hold_existing() should be used. 317219089Spjd */ 318219089Spjdint 319219089Spjddmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) 320219089Spjd{ 321219089Spjd dmu_buf_impl_t *db = NULL; 322219089Spjd int err; 323219089Spjd 324219089Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 325219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 326219089Spjd 327219089Spjd db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); 328219089Spjd 329219089Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 330219089Spjd rw_exit(&dn->dn_struct_rwlock); 331219089Spjd 332219089Spjd ASSERT(db != NULL); 333219089Spjd err = dbuf_read(db, NULL, flags); 334219089Spjd if (err == 0) 335219089Spjd *dbp = &db->db; 336219089Spjd else 337219089Spjd dbuf_rele(db, tag); 338219089Spjd return (err); 339219089Spjd} 340219089Spjd 341219089Spjdint 342219089Spjddmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) 343219089Spjd{ 344219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; 345219089Spjd dnode_t *dn; 346219089Spjd int err; 347219089Spjd 348219089Spjd DB_DNODE_ENTER(db); 349219089Spjd dn = DB_DNODE(db); 350219089Spjd 351219089Spjd if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { 352249195Smm err = SET_ERROR(EINVAL); 353219089Spjd } else { 354219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 355219089Spjd 356219089Spjd if (!dn->dn_have_spill) { 357249195Smm err = SET_ERROR(ENOENT); 358219089Spjd } else { 359219089Spjd err = dmu_spill_hold_by_dnode(dn, 360219089Spjd DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); 361219089Spjd } 362219089Spjd 363219089Spjd rw_exit(&dn->dn_struct_rwlock); 364219089Spjd } 365219089Spjd 366219089Spjd DB_DNODE_EXIT(db); 367219089Spjd return (err); 368219089Spjd} 369219089Spjd 370219089Spjdint 371219089Spjddmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) 372219089Spjd{ 373219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; 374219089Spjd dnode_t *dn; 375219089Spjd int err; 376219089Spjd 377219089Spjd DB_DNODE_ENTER(db); 378219089Spjd dn = DB_DNODE(db); 379219089Spjd err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); 380219089Spjd DB_DNODE_EXIT(db); 381219089Spjd 382219089Spjd return (err); 383219089Spjd} 384219089Spjd 385219089Spjd/* 386168404Spjd * Note: longer-term, we should modify all of the dmu_buf_*() interfaces 387168404Spjd * to take a held dnode rather than <os, object> -- the lookup is wasteful, 388168404Spjd * and can induce severe lock contention when writing to several files 389168404Spjd * whose dnodes are in the same block. 390168404Spjd */ 391168404Spjdstatic int 392209962Smmdmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, 393209962Smm int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) 394168404Spjd{ 395168404Spjd dmu_buf_t **dbp; 396168404Spjd uint64_t blkid, nblks, i; 397209962Smm uint32_t dbuf_flags; 398168404Spjd int err; 399168404Spjd zio_t *zio; 400168404Spjd 401168404Spjd ASSERT(length <= DMU_MAX_ACCESS); 402168404Spjd 403214378Smm dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; 404209962Smm if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) 405209962Smm dbuf_flags |= DB_RF_NOPREFETCH; 406168404Spjd 407168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 408168404Spjd if (dn->dn_datablkshift) { 409168404Spjd int blkshift = dn->dn_datablkshift; 410168404Spjd nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) - 411168404Spjd P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; 412168404Spjd } else { 413168404Spjd if (offset + length > dn->dn_datablksz) { 414168404Spjd zfs_panic_recover("zfs: accessing past end of object " 415168404Spjd "%llx/%llx (size=%u access=%llu+%llu)", 416168404Spjd (longlong_t)dn->dn_objset-> 417168404Spjd os_dsl_dataset->ds_object, 418168404Spjd (longlong_t)dn->dn_object, dn->dn_datablksz, 419168404Spjd (longlong_t)offset, (longlong_t)length); 420214378Smm rw_exit(&dn->dn_struct_rwlock); 421249195Smm return (SET_ERROR(EIO)); 422168404Spjd } 423168404Spjd nblks = 1; 424168404Spjd } 425168404Spjd dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); 426168404Spjd 427185029Spjd zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); 428168404Spjd blkid = dbuf_whichblock(dn, offset); 429168404Spjd for (i = 0; i < nblks; i++) { 430168404Spjd dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); 431168404Spjd if (db == NULL) { 432168404Spjd rw_exit(&dn->dn_struct_rwlock); 433168404Spjd dmu_buf_rele_array(dbp, nblks, tag); 434168404Spjd zio_nowait(zio); 435249195Smm return (SET_ERROR(EIO)); 436168404Spjd } 437168404Spjd /* initiate async i/o */ 438226620Spjd if (read) 439209962Smm (void) dbuf_read(db, zio, dbuf_flags); 440226620Spjd#ifdef _KERNEL 441226620Spjd else 442226620Spjd curthread->td_ru.ru_oublock++; 443226620Spjd#endif 444168404Spjd dbp[i] = &db->db; 445168404Spjd } 446168404Spjd rw_exit(&dn->dn_struct_rwlock); 447168404Spjd 448168404Spjd /* wait for async i/o */ 449168404Spjd err = zio_wait(zio); 450168404Spjd if (err) { 451168404Spjd dmu_buf_rele_array(dbp, nblks, tag); 452168404Spjd return (err); 453168404Spjd } 454168404Spjd 455168404Spjd /* wait for other io to complete */ 456168404Spjd if (read) { 457168404Spjd for (i = 0; i < nblks; i++) { 458168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; 459168404Spjd mutex_enter(&db->db_mtx); 460168404Spjd while (db->db_state == DB_READ || 461168404Spjd db->db_state == DB_FILL) 462168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 463168404Spjd if (db->db_state == DB_UNCACHED) 464249195Smm err = SET_ERROR(EIO); 465168404Spjd mutex_exit(&db->db_mtx); 466168404Spjd if (err) { 467168404Spjd dmu_buf_rele_array(dbp, nblks, tag); 468168404Spjd return (err); 469168404Spjd } 470168404Spjd } 471168404Spjd } 472168404Spjd 473168404Spjd *numbufsp = nblks; 474168404Spjd *dbpp = dbp; 475168404Spjd return (0); 476168404Spjd} 477168404Spjd 478168404Spjdstatic int 479168404Spjddmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, 480168404Spjd uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) 481168404Spjd{ 482168404Spjd dnode_t *dn; 483168404Spjd int err; 484168404Spjd 485219089Spjd err = dnode_hold(os, object, FTAG, &dn); 486168404Spjd if (err) 487168404Spjd return (err); 488168404Spjd 489168404Spjd err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, 490209962Smm numbufsp, dbpp, DMU_READ_PREFETCH); 491168404Spjd 492168404Spjd dnode_rele(dn, FTAG); 493168404Spjd 494168404Spjd return (err); 495168404Spjd} 496168404Spjd 497168404Spjdint 498219089Spjddmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, 499168404Spjd uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) 500168404Spjd{ 501219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 502219089Spjd dnode_t *dn; 503168404Spjd int err; 504168404Spjd 505219089Spjd DB_DNODE_ENTER(db); 506219089Spjd dn = DB_DNODE(db); 507168404Spjd err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, 508209962Smm numbufsp, dbpp, DMU_READ_PREFETCH); 509219089Spjd DB_DNODE_EXIT(db); 510168404Spjd 511168404Spjd return (err); 512168404Spjd} 513168404Spjd 514168404Spjdvoid 515168404Spjddmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) 516168404Spjd{ 517168404Spjd int i; 518168404Spjd dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; 519168404Spjd 520168404Spjd if (numbufs == 0) 521168404Spjd return; 522168404Spjd 523168404Spjd for (i = 0; i < numbufs; i++) { 524168404Spjd if (dbp[i]) 525168404Spjd dbuf_rele(dbp[i], tag); 526168404Spjd } 527168404Spjd 528168404Spjd kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); 529168404Spjd} 530168404Spjd 531260763Savg/* 532260763Savg * Issue prefetch i/os for the given blocks. 533260763Savg * 534260763Savg * Note: The assumption is that we *know* these blocks will be needed 535260763Savg * almost immediately. Therefore, the prefetch i/os will be issued at 536260763Savg * ZIO_PRIORITY_SYNC_READ 537260763Savg * 538260763Savg * Note: indirect blocks and other metadata will be read synchronously, 539260763Savg * causing this function to block if they are not already cached. 540260763Savg */ 541168404Spjdvoid 542168404Spjddmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) 543168404Spjd{ 544168404Spjd dnode_t *dn; 545168404Spjd uint64_t blkid; 546260763Savg int nblks, err; 547168404Spjd 548194043Skmacy if (zfs_prefetch_disable) 549168404Spjd return; 550168404Spjd 551168404Spjd if (len == 0) { /* they're interested in the bonus buffer */ 552219089Spjd dn = DMU_META_DNODE(os); 553168404Spjd 554168404Spjd if (object == 0 || object >= DN_MAX_OBJECT) 555168404Spjd return; 556168404Spjd 557168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 558168404Spjd blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); 559260763Savg dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ); 560168404Spjd rw_exit(&dn->dn_struct_rwlock); 561168404Spjd return; 562168404Spjd } 563168404Spjd 564168404Spjd /* 565168404Spjd * XXX - Note, if the dnode for the requested object is not 566168404Spjd * already cached, we will do a *synchronous* read in the 567168404Spjd * dnode_hold() call. The same is true for any indirects. 568168404Spjd */ 569219089Spjd err = dnode_hold(os, object, FTAG, &dn); 570168404Spjd if (err != 0) 571168404Spjd return; 572168404Spjd 573168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 574168404Spjd if (dn->dn_datablkshift) { 575168404Spjd int blkshift = dn->dn_datablkshift; 576260763Savg nblks = (P2ROUNDUP(offset + len, 1 << blkshift) - 577260763Savg P2ALIGN(offset, 1 << blkshift)) >> blkshift; 578168404Spjd } else { 579168404Spjd nblks = (offset < dn->dn_datablksz); 580168404Spjd } 581168404Spjd 582168404Spjd if (nblks != 0) { 583168404Spjd blkid = dbuf_whichblock(dn, offset); 584260763Savg for (int i = 0; i < nblks; i++) 585260763Savg dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ); 586168404Spjd } 587168404Spjd 588168404Spjd rw_exit(&dn->dn_struct_rwlock); 589168404Spjd 590168404Spjd dnode_rele(dn, FTAG); 591168404Spjd} 592168404Spjd 593208775Smm/* 594208775Smm * Get the next "chunk" of file data to free. We traverse the file from 595208775Smm * the end so that the file gets shorter over time (if we crashes in the 596208775Smm * middle, this will leave us in a better state). We find allocated file 597208775Smm * data by simply searching the allocated level 1 indirects. 598254753Sdelphij * 599254753Sdelphij * On input, *start should be the first offset that does not need to be 600254753Sdelphij * freed (e.g. "offset + length"). On return, *start will be the first 601254753Sdelphij * offset that should be freed. 602208775Smm */ 603185029Spjdstatic int 604254753Sdelphijget_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum) 605185029Spjd{ 606254753Sdelphij uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); 607254753Sdelphij /* bytes of data covered by a level-1 indirect block */ 608208775Smm uint64_t iblkrange = 609185029Spjd dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); 610185029Spjd 611254753Sdelphij ASSERT3U(minimum, <=, *start); 612185029Spjd 613254753Sdelphij if (*start - minimum <= iblkrange * maxblks) { 614254753Sdelphij *start = minimum; 615185029Spjd return (0); 616185029Spjd } 617208775Smm ASSERT(ISP2(iblkrange)); 618185029Spjd 619254753Sdelphij for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) { 620185029Spjd int err; 621185029Spjd 622254753Sdelphij /* 623254753Sdelphij * dnode_next_offset(BACKWARDS) will find an allocated L1 624254753Sdelphij * indirect block at or before the input offset. We must 625254753Sdelphij * decrement *start so that it is at the end of the region 626254753Sdelphij * to search. 627254753Sdelphij */ 628254753Sdelphij (*start)--; 629185029Spjd err = dnode_next_offset(dn, 630208775Smm DNODE_FIND_BACKWARDS, start, 2, 1, 0); 631185029Spjd 632254753Sdelphij /* if there are no indirect blocks before start, we are done */ 633208775Smm if (err == ESRCH) { 634254753Sdelphij *start = minimum; 635254753Sdelphij break; 636254753Sdelphij } else if (err != 0) { 637208775Smm return (err); 638185029Spjd } 639185029Spjd 640254753Sdelphij /* set start to the beginning of this L1 indirect */ 641208775Smm *start = P2ALIGN(*start, iblkrange); 642185029Spjd } 643254753Sdelphij if (*start < minimum) 644254753Sdelphij *start = minimum; 645185029Spjd return (0); 646185029Spjd} 647185029Spjd 648185029Spjdstatic int 649185029Spjddmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, 650254753Sdelphij uint64_t length) 651185029Spjd{ 652254753Sdelphij uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; 653254753Sdelphij int err; 654185029Spjd 655254753Sdelphij if (offset >= object_size) 656185029Spjd return (0); 657185029Spjd 658254753Sdelphij if (length == DMU_OBJECT_END || offset + length > object_size) 659254753Sdelphij length = object_size - offset; 660254753Sdelphij 661254753Sdelphij while (length != 0) { 662254753Sdelphij uint64_t chunk_end, chunk_begin; 663254753Sdelphij 664254753Sdelphij chunk_end = chunk_begin = offset + length; 665254753Sdelphij 666254753Sdelphij /* move chunk_begin backwards to the beginning of this chunk */ 667254753Sdelphij err = get_next_chunk(dn, &chunk_begin, offset); 668185029Spjd if (err) 669185029Spjd return (err); 670254753Sdelphij ASSERT3U(chunk_begin, >=, offset); 671254753Sdelphij ASSERT3U(chunk_begin, <=, chunk_end); 672185029Spjd 673254753Sdelphij dmu_tx_t *tx = dmu_tx_create(os); 674254753Sdelphij dmu_tx_hold_free(tx, dn->dn_object, 675254753Sdelphij chunk_begin, chunk_end - chunk_begin); 676269002Sdelphij 677269002Sdelphij /* 678269002Sdelphij * Mark this transaction as typically resulting in a net 679269002Sdelphij * reduction in space used. 680269002Sdelphij */ 681269002Sdelphij dmu_tx_mark_netfree(tx); 682185029Spjd err = dmu_tx_assign(tx, TXG_WAIT); 683185029Spjd if (err) { 684185029Spjd dmu_tx_abort(tx); 685185029Spjd return (err); 686185029Spjd } 687254753Sdelphij dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx); 688254753Sdelphij dmu_tx_commit(tx); 689185029Spjd 690254753Sdelphij length -= chunk_end - chunk_begin; 691185029Spjd } 692185029Spjd return (0); 693185029Spjd} 694185029Spjd 695168404Spjdint 696185029Spjddmu_free_long_range(objset_t *os, uint64_t object, 697185029Spjd uint64_t offset, uint64_t length) 698185029Spjd{ 699185029Spjd dnode_t *dn; 700185029Spjd int err; 701185029Spjd 702219089Spjd err = dnode_hold(os, object, FTAG, &dn); 703185029Spjd if (err != 0) 704185029Spjd return (err); 705254753Sdelphij err = dmu_free_long_range_impl(os, dn, offset, length); 706256259Savg 707256259Savg /* 708256259Savg * It is important to zero out the maxblkid when freeing the entire 709256259Savg * file, so that (a) subsequent calls to dmu_free_long_range_impl() 710256259Savg * will take the fast path, and (b) dnode_reallocate() can verify 711256259Savg * that the entire file has been freed. 712256259Savg */ 713263397Sdelphij if (err == 0 && offset == 0 && length == DMU_OBJECT_END) 714256259Savg dn->dn_maxblkid = 0; 715256259Savg 716185029Spjd dnode_rele(dn, FTAG); 717185029Spjd return (err); 718185029Spjd} 719185029Spjd 720185029Spjdint 721254753Sdelphijdmu_free_long_object(objset_t *os, uint64_t object) 722185029Spjd{ 723185029Spjd dmu_tx_t *tx; 724185029Spjd int err; 725185029Spjd 726254753Sdelphij err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); 727185029Spjd if (err != 0) 728185029Spjd return (err); 729254753Sdelphij 730254753Sdelphij tx = dmu_tx_create(os); 731254753Sdelphij dmu_tx_hold_bonus(tx, object); 732254753Sdelphij dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 733269002Sdelphij dmu_tx_mark_netfree(tx); 734254753Sdelphij err = dmu_tx_assign(tx, TXG_WAIT); 735254753Sdelphij if (err == 0) { 736254753Sdelphij err = dmu_object_free(os, object, tx); 737254753Sdelphij dmu_tx_commit(tx); 738185029Spjd } else { 739254753Sdelphij dmu_tx_abort(tx); 740185029Spjd } 741254753Sdelphij 742185029Spjd return (err); 743185029Spjd} 744185029Spjd 745185029Spjdint 746168404Spjddmu_free_range(objset_t *os, uint64_t object, uint64_t offset, 747168404Spjd uint64_t size, dmu_tx_t *tx) 748168404Spjd{ 749168404Spjd dnode_t *dn; 750219089Spjd int err = dnode_hold(os, object, FTAG, &dn); 751168404Spjd if (err) 752168404Spjd return (err); 753168404Spjd ASSERT(offset < UINT64_MAX); 754168404Spjd ASSERT(size == -1ULL || size <= UINT64_MAX - offset); 755168404Spjd dnode_free_range(dn, offset, size, tx); 756168404Spjd dnode_rele(dn, FTAG); 757168404Spjd return (0); 758168404Spjd} 759168404Spjd 760168404Spjdint 761168404Spjddmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 762209962Smm void *buf, uint32_t flags) 763168404Spjd{ 764168404Spjd dnode_t *dn; 765168404Spjd dmu_buf_t **dbp; 766214378Smm int numbufs, err; 767168404Spjd 768219089Spjd err = dnode_hold(os, object, FTAG, &dn); 769168404Spjd if (err) 770168404Spjd return (err); 771168404Spjd 772168404Spjd /* 773168404Spjd * Deal with odd block sizes, where there can't be data past the first 774168404Spjd * block. If we ever do the tail block optimization, we will need to 775168404Spjd * handle that here as well. 776168404Spjd */ 777214378Smm if (dn->dn_maxblkid == 0) { 778168404Spjd int newsz = offset > dn->dn_datablksz ? 0 : 779168404Spjd MIN(size, dn->dn_datablksz - offset); 780168404Spjd bzero((char *)buf + newsz, size - newsz); 781168404Spjd size = newsz; 782168404Spjd } 783168404Spjd 784168404Spjd while (size > 0) { 785168404Spjd uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); 786214378Smm int i; 787168404Spjd 788168404Spjd /* 789168404Spjd * NB: we could do this block-at-a-time, but it's nice 790168404Spjd * to be reading in parallel. 791168404Spjd */ 792168404Spjd err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, 793209962Smm TRUE, FTAG, &numbufs, &dbp, flags); 794168404Spjd if (err) 795185029Spjd break; 796168404Spjd 797168404Spjd for (i = 0; i < numbufs; i++) { 798168404Spjd int tocpy; 799168404Spjd int bufoff; 800168404Spjd dmu_buf_t *db = dbp[i]; 801168404Spjd 802168404Spjd ASSERT(size > 0); 803168404Spjd 804168404Spjd bufoff = offset - db->db_offset; 805168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 806168404Spjd 807168404Spjd bcopy((char *)db->db_data + bufoff, buf, tocpy); 808168404Spjd 809168404Spjd offset += tocpy; 810168404Spjd size -= tocpy; 811168404Spjd buf = (char *)buf + tocpy; 812168404Spjd } 813168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 814168404Spjd } 815168404Spjd dnode_rele(dn, FTAG); 816185029Spjd return (err); 817168404Spjd} 818168404Spjd 819168404Spjdvoid 820168404Spjddmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 821168404Spjd const void *buf, dmu_tx_t *tx) 822168404Spjd{ 823168404Spjd dmu_buf_t **dbp; 824168404Spjd int numbufs, i; 825168404Spjd 826168404Spjd if (size == 0) 827168404Spjd return; 828168404Spjd 829168404Spjd VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, 830168404Spjd FALSE, FTAG, &numbufs, &dbp)); 831168404Spjd 832168404Spjd for (i = 0; i < numbufs; i++) { 833168404Spjd int tocpy; 834168404Spjd int bufoff; 835168404Spjd dmu_buf_t *db = dbp[i]; 836168404Spjd 837168404Spjd ASSERT(size > 0); 838168404Spjd 839168404Spjd bufoff = offset - db->db_offset; 840168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 841168404Spjd 842168404Spjd ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 843168404Spjd 844168404Spjd if (tocpy == db->db_size) 845168404Spjd dmu_buf_will_fill(db, tx); 846168404Spjd else 847168404Spjd dmu_buf_will_dirty(db, tx); 848168404Spjd 849168404Spjd bcopy(buf, (char *)db->db_data + bufoff, tocpy); 850168404Spjd 851168404Spjd if (tocpy == db->db_size) 852168404Spjd dmu_buf_fill_done(db, tx); 853168404Spjd 854168404Spjd offset += tocpy; 855168404Spjd size -= tocpy; 856168404Spjd buf = (char *)buf + tocpy; 857168404Spjd } 858168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 859168404Spjd} 860168404Spjd 861219089Spjdvoid 862219089Spjddmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 863219089Spjd dmu_tx_t *tx) 864219089Spjd{ 865219089Spjd dmu_buf_t **dbp; 866219089Spjd int numbufs, i; 867219089Spjd 868219089Spjd if (size == 0) 869219089Spjd return; 870219089Spjd 871219089Spjd VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, 872219089Spjd FALSE, FTAG, &numbufs, &dbp)); 873219089Spjd 874219089Spjd for (i = 0; i < numbufs; i++) { 875219089Spjd dmu_buf_t *db = dbp[i]; 876219089Spjd 877219089Spjd dmu_buf_will_not_fill(db, tx); 878219089Spjd } 879219089Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 880219089Spjd} 881219089Spjd 882268649Sdelphijvoid 883268649Sdelphijdmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, 884268649Sdelphij void *data, uint8_t etype, uint8_t comp, int uncompressed_size, 885268649Sdelphij int compressed_size, int byteorder, dmu_tx_t *tx) 886268649Sdelphij{ 887268649Sdelphij dmu_buf_t *db; 888268649Sdelphij 889268649Sdelphij ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); 890268649Sdelphij ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); 891268649Sdelphij VERIFY0(dmu_buf_hold_noread(os, object, offset, 892268649Sdelphij FTAG, &db)); 893268649Sdelphij 894268649Sdelphij dmu_buf_write_embedded(db, 895268649Sdelphij data, (bp_embedded_type_t)etype, (enum zio_compress)comp, 896268649Sdelphij uncompressed_size, compressed_size, byteorder, tx); 897268649Sdelphij 898268649Sdelphij dmu_buf_rele(db, FTAG); 899268649Sdelphij} 900268649Sdelphij 901219089Spjd/* 902219089Spjd * DMU support for xuio 903219089Spjd */ 904219089Spjdkstat_t *xuio_ksp = NULL; 905219089Spjd 906219089Spjdint 907219089Spjddmu_xuio_init(xuio_t *xuio, int nblk) 908219089Spjd{ 909219089Spjd dmu_xuio_t *priv; 910219089Spjd uio_t *uio = &xuio->xu_uio; 911219089Spjd 912219089Spjd uio->uio_iovcnt = nblk; 913219089Spjd uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); 914219089Spjd 915219089Spjd priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); 916219089Spjd priv->cnt = nblk; 917219089Spjd priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); 918219089Spjd priv->iovp = uio->uio_iov; 919219089Spjd XUIO_XUZC_PRIV(xuio) = priv; 920219089Spjd 921219089Spjd if (XUIO_XUZC_RW(xuio) == UIO_READ) 922219089Spjd XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); 923219089Spjd else 924219089Spjd XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); 925219089Spjd 926219089Spjd return (0); 927219089Spjd} 928219089Spjd 929219089Spjdvoid 930219089Spjddmu_xuio_fini(xuio_t *xuio) 931219089Spjd{ 932219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 933219089Spjd int nblk = priv->cnt; 934219089Spjd 935219089Spjd kmem_free(priv->iovp, nblk * sizeof (iovec_t)); 936219089Spjd kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); 937219089Spjd kmem_free(priv, sizeof (dmu_xuio_t)); 938219089Spjd 939219089Spjd if (XUIO_XUZC_RW(xuio) == UIO_READ) 940219089Spjd XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); 941219089Spjd else 942219089Spjd XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); 943219089Spjd} 944219089Spjd 945219089Spjd/* 946219089Spjd * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } 947219089Spjd * and increase priv->next by 1. 948219089Spjd */ 949219089Spjdint 950219089Spjddmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) 951219089Spjd{ 952219089Spjd struct iovec *iov; 953219089Spjd uio_t *uio = &xuio->xu_uio; 954219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 955219089Spjd int i = priv->next++; 956219089Spjd 957219089Spjd ASSERT(i < priv->cnt); 958219089Spjd ASSERT(off + n <= arc_buf_size(abuf)); 959219089Spjd iov = uio->uio_iov + i; 960219089Spjd iov->iov_base = (char *)abuf->b_data + off; 961219089Spjd iov->iov_len = n; 962219089Spjd priv->bufs[i] = abuf; 963219089Spjd return (0); 964219089Spjd} 965219089Spjd 966219089Spjdint 967219089Spjddmu_xuio_cnt(xuio_t *xuio) 968219089Spjd{ 969219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 970219089Spjd return (priv->cnt); 971219089Spjd} 972219089Spjd 973219089Spjdarc_buf_t * 974219089Spjddmu_xuio_arcbuf(xuio_t *xuio, int i) 975219089Spjd{ 976219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 977219089Spjd 978219089Spjd ASSERT(i < priv->cnt); 979219089Spjd return (priv->bufs[i]); 980219089Spjd} 981219089Spjd 982219089Spjdvoid 983219089Spjddmu_xuio_clear(xuio_t *xuio, int i) 984219089Spjd{ 985219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 986219089Spjd 987219089Spjd ASSERT(i < priv->cnt); 988219089Spjd priv->bufs[i] = NULL; 989219089Spjd} 990219089Spjd 991219089Spjdstatic void 992219089Spjdxuio_stat_init(void) 993219089Spjd{ 994219089Spjd xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", 995219089Spjd KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), 996219089Spjd KSTAT_FLAG_VIRTUAL); 997219089Spjd if (xuio_ksp != NULL) { 998219089Spjd xuio_ksp->ks_data = &xuio_stats; 999219089Spjd kstat_install(xuio_ksp); 1000219089Spjd } 1001219089Spjd} 1002219089Spjd 1003219089Spjdstatic void 1004219089Spjdxuio_stat_fini(void) 1005219089Spjd{ 1006219089Spjd if (xuio_ksp != NULL) { 1007219089Spjd kstat_delete(xuio_ksp); 1008219089Spjd xuio_ksp = NULL; 1009219089Spjd } 1010219089Spjd} 1011219089Spjd 1012219089Spjdvoid 1013219089Spjdxuio_stat_wbuf_copied() 1014219089Spjd{ 1015219089Spjd XUIOSTAT_BUMP(xuiostat_wbuf_copied); 1016219089Spjd} 1017219089Spjd 1018219089Spjdvoid 1019219089Spjdxuio_stat_wbuf_nocopy() 1020219089Spjd{ 1021219089Spjd XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); 1022219089Spjd} 1023219089Spjd 1024168404Spjd#ifdef _KERNEL 1025168404Spjdint 1026168404Spjddmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) 1027168404Spjd{ 1028168404Spjd dmu_buf_t **dbp; 1029168404Spjd int numbufs, i, err; 1030219089Spjd xuio_t *xuio = NULL; 1031168404Spjd 1032168404Spjd /* 1033168404Spjd * NB: we could do this block-at-a-time, but it's nice 1034168404Spjd * to be reading in parallel. 1035168404Spjd */ 1036168404Spjd err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, 1037168404Spjd &numbufs, &dbp); 1038168404Spjd if (err) 1039168404Spjd return (err); 1040168404Spjd 1041219089Spjd#ifdef UIO_XUIO 1042219089Spjd if (uio->uio_extflg == UIO_XUIO) 1043219089Spjd xuio = (xuio_t *)uio; 1044219089Spjd#endif 1045219089Spjd 1046168404Spjd for (i = 0; i < numbufs; i++) { 1047168404Spjd int tocpy; 1048168404Spjd int bufoff; 1049168404Spjd dmu_buf_t *db = dbp[i]; 1050168404Spjd 1051168404Spjd ASSERT(size > 0); 1052168404Spjd 1053168404Spjd bufoff = uio->uio_loffset - db->db_offset; 1054168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 1055168404Spjd 1056219089Spjd if (xuio) { 1057219089Spjd dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 1058219089Spjd arc_buf_t *dbuf_abuf = dbi->db_buf; 1059219089Spjd arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); 1060219089Spjd err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); 1061219089Spjd if (!err) { 1062219089Spjd uio->uio_resid -= tocpy; 1063219089Spjd uio->uio_loffset += tocpy; 1064219089Spjd } 1065219089Spjd 1066219089Spjd if (abuf == dbuf_abuf) 1067219089Spjd XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); 1068219089Spjd else 1069219089Spjd XUIOSTAT_BUMP(xuiostat_rbuf_copied); 1070219089Spjd } else { 1071219089Spjd err = uiomove((char *)db->db_data + bufoff, tocpy, 1072219089Spjd UIO_READ, uio); 1073219089Spjd } 1074168404Spjd if (err) 1075168404Spjd break; 1076168404Spjd 1077168404Spjd size -= tocpy; 1078168404Spjd } 1079168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 1080168404Spjd 1081168404Spjd return (err); 1082168404Spjd} 1083168404Spjd 1084219089Spjdstatic int 1085219089Spjddmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) 1086168404Spjd{ 1087168404Spjd dmu_buf_t **dbp; 1088219089Spjd int numbufs; 1089168404Spjd int err = 0; 1090219089Spjd int i; 1091168404Spjd 1092219089Spjd err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, 1093219089Spjd FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); 1094168404Spjd if (err) 1095168404Spjd return (err); 1096168404Spjd 1097168404Spjd for (i = 0; i < numbufs; i++) { 1098168404Spjd int tocpy; 1099168404Spjd int bufoff; 1100168404Spjd dmu_buf_t *db = dbp[i]; 1101168404Spjd 1102168404Spjd ASSERT(size > 0); 1103168404Spjd 1104168404Spjd bufoff = uio->uio_loffset - db->db_offset; 1105168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 1106168404Spjd 1107168404Spjd ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 1108168404Spjd 1109168404Spjd if (tocpy == db->db_size) 1110168404Spjd dmu_buf_will_fill(db, tx); 1111168404Spjd else 1112168404Spjd dmu_buf_will_dirty(db, tx); 1113168404Spjd 1114168404Spjd /* 1115168404Spjd * XXX uiomove could block forever (eg. nfs-backed 1116168404Spjd * pages). There needs to be a uiolockdown() function 1117168404Spjd * to lock the pages in memory, so that uiomove won't 1118168404Spjd * block. 1119168404Spjd */ 1120168404Spjd err = uiomove((char *)db->db_data + bufoff, tocpy, 1121168404Spjd UIO_WRITE, uio); 1122168404Spjd 1123168404Spjd if (tocpy == db->db_size) 1124168404Spjd dmu_buf_fill_done(db, tx); 1125168404Spjd 1126168404Spjd if (err) 1127168404Spjd break; 1128168404Spjd 1129168404Spjd size -= tocpy; 1130168404Spjd } 1131219089Spjd 1132168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 1133168404Spjd return (err); 1134168404Spjd} 1135168404Spjd 1136168404Spjdint 1137219089Spjddmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, 1138219089Spjd dmu_tx_t *tx) 1139219089Spjd{ 1140219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; 1141219089Spjd dnode_t *dn; 1142219089Spjd int err; 1143219089Spjd 1144219089Spjd if (size == 0) 1145219089Spjd return (0); 1146219089Spjd 1147219089Spjd DB_DNODE_ENTER(db); 1148219089Spjd dn = DB_DNODE(db); 1149219089Spjd err = dmu_write_uio_dnode(dn, uio, size, tx); 1150219089Spjd DB_DNODE_EXIT(db); 1151219089Spjd 1152219089Spjd return (err); 1153219089Spjd} 1154219089Spjd 1155219089Spjdint 1156219089Spjddmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, 1157219089Spjd dmu_tx_t *tx) 1158219089Spjd{ 1159219089Spjd dnode_t *dn; 1160219089Spjd int err; 1161219089Spjd 1162219089Spjd if (size == 0) 1163219089Spjd return (0); 1164219089Spjd 1165219089Spjd err = dnode_hold(os, object, FTAG, &dn); 1166219089Spjd if (err) 1167219089Spjd return (err); 1168219089Spjd 1169219089Spjd err = dmu_write_uio_dnode(dn, uio, size, tx); 1170219089Spjd 1171219089Spjd dnode_rele(dn, FTAG); 1172219089Spjd 1173219089Spjd return (err); 1174219089Spjd} 1175219089Spjd 1176219089Spjd#ifdef sun 1177219089Spjdint 1178168404Spjddmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 1179168404Spjd page_t *pp, dmu_tx_t *tx) 1180168404Spjd{ 1181168404Spjd dmu_buf_t **dbp; 1182168404Spjd int numbufs, i; 1183168404Spjd int err; 1184168404Spjd 1185168404Spjd if (size == 0) 1186168404Spjd return (0); 1187168404Spjd 1188168404Spjd err = dmu_buf_hold_array(os, object, offset, size, 1189168404Spjd FALSE, FTAG, &numbufs, &dbp); 1190168404Spjd if (err) 1191168404Spjd return (err); 1192168404Spjd 1193168404Spjd for (i = 0; i < numbufs; i++) { 1194168404Spjd int tocpy, copied, thiscpy; 1195168404Spjd int bufoff; 1196168404Spjd dmu_buf_t *db = dbp[i]; 1197168404Spjd caddr_t va; 1198168404Spjd 1199168404Spjd ASSERT(size > 0); 1200168404Spjd ASSERT3U(db->db_size, >=, PAGESIZE); 1201168404Spjd 1202168404Spjd bufoff = offset - db->db_offset; 1203168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 1204168404Spjd 1205168404Spjd ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 1206168404Spjd 1207168404Spjd if (tocpy == db->db_size) 1208168404Spjd dmu_buf_will_fill(db, tx); 1209168404Spjd else 1210168404Spjd dmu_buf_will_dirty(db, tx); 1211168404Spjd 1212168404Spjd for (copied = 0; copied < tocpy; copied += PAGESIZE) { 1213168404Spjd ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); 1214168404Spjd thiscpy = MIN(PAGESIZE, tocpy - copied); 1215185029Spjd va = zfs_map_page(pp, S_READ); 1216168404Spjd bcopy(va, (char *)db->db_data + bufoff, thiscpy); 1217185029Spjd zfs_unmap_page(pp, va); 1218168404Spjd pp = pp->p_next; 1219168404Spjd bufoff += PAGESIZE; 1220168404Spjd } 1221168404Spjd 1222168404Spjd if (tocpy == db->db_size) 1223168404Spjd dmu_buf_fill_done(db, tx); 1224168404Spjd 1225168404Spjd offset += tocpy; 1226168404Spjd size -= tocpy; 1227168404Spjd } 1228168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 1229168404Spjd return (err); 1230168404Spjd} 1231260786Savg 1232260786Savg#else 1233260786Savg 1234260786Savgint 1235260786Savgdmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 1236260786Savg vm_page_t *ma, dmu_tx_t *tx) 1237260786Savg{ 1238260786Savg dmu_buf_t **dbp; 1239260786Savg struct sf_buf *sf; 1240260786Savg int numbufs, i; 1241260786Savg int err; 1242260786Savg 1243260786Savg if (size == 0) 1244260786Savg return (0); 1245260786Savg 1246260786Savg err = dmu_buf_hold_array(os, object, offset, size, 1247260786Savg FALSE, FTAG, &numbufs, &dbp); 1248260786Savg if (err) 1249260786Savg return (err); 1250260786Savg 1251260786Savg for (i = 0; i < numbufs; i++) { 1252260786Savg int tocpy, copied, thiscpy; 1253260786Savg int bufoff; 1254260786Savg dmu_buf_t *db = dbp[i]; 1255260786Savg caddr_t va; 1256260786Savg 1257260786Savg ASSERT(size > 0); 1258260786Savg ASSERT3U(db->db_size, >=, PAGESIZE); 1259260786Savg 1260260786Savg bufoff = offset - db->db_offset; 1261260786Savg tocpy = (int)MIN(db->db_size - bufoff, size); 1262260786Savg 1263260786Savg ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 1264260786Savg 1265260786Savg if (tocpy == db->db_size) 1266260786Savg dmu_buf_will_fill(db, tx); 1267260786Savg else 1268260786Savg dmu_buf_will_dirty(db, tx); 1269260786Savg 1270260786Savg for (copied = 0; copied < tocpy; copied += PAGESIZE) { 1271260786Savg ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff); 1272260786Savg thiscpy = MIN(PAGESIZE, tocpy - copied); 1273260786Savg va = zfs_map_page(*ma, &sf); 1274260786Savg bcopy(va, (char *)db->db_data + bufoff, thiscpy); 1275260786Savg zfs_unmap_page(sf); 1276260786Savg ma += 1; 1277260786Savg bufoff += PAGESIZE; 1278260786Savg } 1279260786Savg 1280260786Savg if (tocpy == db->db_size) 1281260786Savg dmu_buf_fill_done(db, tx); 1282260786Savg 1283260786Savg offset += tocpy; 1284260786Savg size -= tocpy; 1285260786Savg } 1286260786Savg dmu_buf_rele_array(dbp, numbufs, FTAG); 1287260786Savg return (err); 1288260786Savg} 1289219089Spjd#endif /* sun */ 1290219089Spjd#endif 1291168404Spjd 1292209962Smm/* 1293209962Smm * Allocate a loaned anonymous arc buffer. 1294209962Smm */ 1295209962Smmarc_buf_t * 1296209962Smmdmu_request_arcbuf(dmu_buf_t *handle, int size) 1297209962Smm{ 1298219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; 1299209962Smm 1300263397Sdelphij return (arc_loan_buf(db->db_objset->os_spa, size)); 1301209962Smm} 1302209962Smm 1303209962Smm/* 1304209962Smm * Free a loaned arc buffer. 1305209962Smm */ 1306209962Smmvoid 1307209962Smmdmu_return_arcbuf(arc_buf_t *buf) 1308209962Smm{ 1309209962Smm arc_return_buf(buf, FTAG); 1310248571Smm VERIFY(arc_buf_remove_ref(buf, FTAG)); 1311209962Smm} 1312209962Smm 1313209962Smm/* 1314209962Smm * When possible directly assign passed loaned arc buffer to a dbuf. 1315209962Smm * If this is not possible copy the contents of passed arc buf via 1316209962Smm * dmu_write(). 1317209962Smm */ 1318209962Smmvoid 1319209962Smmdmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, 1320209962Smm dmu_tx_t *tx) 1321209962Smm{ 1322219089Spjd dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; 1323219089Spjd dnode_t *dn; 1324209962Smm dmu_buf_impl_t *db; 1325209962Smm uint32_t blksz = (uint32_t)arc_buf_size(buf); 1326209962Smm uint64_t blkid; 1327209962Smm 1328219089Spjd DB_DNODE_ENTER(dbuf); 1329219089Spjd dn = DB_DNODE(dbuf); 1330209962Smm rw_enter(&dn->dn_struct_rwlock, RW_READER); 1331209962Smm blkid = dbuf_whichblock(dn, offset); 1332209962Smm VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); 1333209962Smm rw_exit(&dn->dn_struct_rwlock); 1334219089Spjd DB_DNODE_EXIT(dbuf); 1335209962Smm 1336209962Smm if (offset == db->db.db_offset && blksz == db->db.db_size) { 1337209962Smm dbuf_assign_arcbuf(db, buf, tx); 1338209962Smm dbuf_rele(db, FTAG); 1339209962Smm } else { 1340219089Spjd objset_t *os; 1341219089Spjd uint64_t object; 1342219089Spjd 1343219089Spjd DB_DNODE_ENTER(dbuf); 1344219089Spjd dn = DB_DNODE(dbuf); 1345219089Spjd os = dn->dn_objset; 1346219089Spjd object = dn->dn_object; 1347219089Spjd DB_DNODE_EXIT(dbuf); 1348219089Spjd 1349209962Smm dbuf_rele(db, FTAG); 1350219089Spjd dmu_write(os, object, offset, blksz, buf->b_data, tx); 1351209962Smm dmu_return_arcbuf(buf); 1352219089Spjd XUIOSTAT_BUMP(xuiostat_wbuf_copied); 1353209962Smm } 1354209962Smm} 1355209962Smm 1356168404Spjdtypedef struct { 1357219089Spjd dbuf_dirty_record_t *dsa_dr; 1358219089Spjd dmu_sync_cb_t *dsa_done; 1359219089Spjd zgd_t *dsa_zgd; 1360219089Spjd dmu_tx_t *dsa_tx; 1361168404Spjd} dmu_sync_arg_t; 1362168404Spjd 1363168404Spjd/* ARGSUSED */ 1364168404Spjdstatic void 1365185029Spjddmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) 1366185029Spjd{ 1367219089Spjd dmu_sync_arg_t *dsa = varg; 1368219089Spjd dmu_buf_t *db = dsa->dsa_zgd->zgd_db; 1369185029Spjd blkptr_t *bp = zio->io_bp; 1370185029Spjd 1371219089Spjd if (zio->io_error == 0) { 1372219089Spjd if (BP_IS_HOLE(bp)) { 1373219089Spjd /* 1374219089Spjd * A block of zeros may compress to a hole, but the 1375219089Spjd * block size still needs to be known for replay. 1376219089Spjd */ 1377219089Spjd BP_SET_LSIZE(bp, db->db_size); 1378268649Sdelphij } else if (!BP_IS_EMBEDDED(bp)) { 1379219089Spjd ASSERT(BP_GET_LEVEL(bp) == 0); 1380219089Spjd bp->blk_fill = 1; 1381219089Spjd } 1382185029Spjd } 1383185029Spjd} 1384185029Spjd 1385219089Spjdstatic void 1386219089Spjddmu_sync_late_arrival_ready(zio_t *zio) 1387219089Spjd{ 1388219089Spjd dmu_sync_ready(zio, NULL, zio->io_private); 1389219089Spjd} 1390219089Spjd 1391185029Spjd/* ARGSUSED */ 1392185029Spjdstatic void 1393168404Spjddmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) 1394168404Spjd{ 1395219089Spjd dmu_sync_arg_t *dsa = varg; 1396219089Spjd dbuf_dirty_record_t *dr = dsa->dsa_dr; 1397168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 1398168404Spjd 1399168404Spjd mutex_enter(&db->db_mtx); 1400168404Spjd ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); 1401219089Spjd if (zio->io_error == 0) { 1402243524Smm dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); 1403243524Smm if (dr->dt.dl.dr_nopwrite) { 1404243524Smm blkptr_t *bp = zio->io_bp; 1405243524Smm blkptr_t *bp_orig = &zio->io_bp_orig; 1406243524Smm uint8_t chksum = BP_GET_CHECKSUM(bp_orig); 1407243524Smm 1408243524Smm ASSERT(BP_EQUAL(bp, bp_orig)); 1409243524Smm ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); 1410243524Smm ASSERT(zio_checksum_table[chksum].ci_dedup); 1411243524Smm } 1412219089Spjd dr->dt.dl.dr_overridden_by = *zio->io_bp; 1413219089Spjd dr->dt.dl.dr_override_state = DR_OVERRIDDEN; 1414219089Spjd dr->dt.dl.dr_copies = zio->io_prop.zp_copies; 1415219089Spjd if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) 1416219089Spjd BP_ZERO(&dr->dt.dl.dr_overridden_by); 1417219089Spjd } else { 1418219089Spjd dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 1419219089Spjd } 1420168404Spjd cv_broadcast(&db->db_changed); 1421168404Spjd mutex_exit(&db->db_mtx); 1422168404Spjd 1423219089Spjd dsa->dsa_done(dsa->dsa_zgd, zio->io_error); 1424168404Spjd 1425219089Spjd kmem_free(dsa, sizeof (*dsa)); 1426168404Spjd} 1427168404Spjd 1428219089Spjdstatic void 1429219089Spjddmu_sync_late_arrival_done(zio_t *zio) 1430219089Spjd{ 1431219089Spjd blkptr_t *bp = zio->io_bp; 1432219089Spjd dmu_sync_arg_t *dsa = zio->io_private; 1433243524Smm blkptr_t *bp_orig = &zio->io_bp_orig; 1434219089Spjd 1435219089Spjd if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { 1436243524Smm /* 1437243524Smm * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE) 1438243524Smm * then there is nothing to do here. Otherwise, free the 1439243524Smm * newly allocated block in this txg. 1440243524Smm */ 1441243524Smm if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 1442243524Smm ASSERT(BP_EQUAL(bp, bp_orig)); 1443243524Smm } else { 1444243524Smm ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); 1445243524Smm ASSERT(zio->io_bp->blk_birth == zio->io_txg); 1446243524Smm ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); 1447243524Smm zio_free(zio->io_spa, zio->io_txg, zio->io_bp); 1448243524Smm } 1449219089Spjd } 1450219089Spjd 1451219089Spjd dmu_tx_commit(dsa->dsa_tx); 1452219089Spjd 1453219089Spjd dsa->dsa_done(dsa->dsa_zgd, zio->io_error); 1454219089Spjd 1455219089Spjd kmem_free(dsa, sizeof (*dsa)); 1456219089Spjd} 1457219089Spjd 1458219089Spjdstatic int 1459219089Spjddmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, 1460268657Sdelphij zio_prop_t *zp, zbookmark_phys_t *zb) 1461219089Spjd{ 1462219089Spjd dmu_sync_arg_t *dsa; 1463219089Spjd dmu_tx_t *tx; 1464219089Spjd 1465219089Spjd tx = dmu_tx_create(os); 1466219089Spjd dmu_tx_hold_space(tx, zgd->zgd_db->db_size); 1467219089Spjd if (dmu_tx_assign(tx, TXG_WAIT) != 0) { 1468219089Spjd dmu_tx_abort(tx); 1469249195Smm /* Make zl_get_data do txg_waited_synced() */ 1470249195Smm return (SET_ERROR(EIO)); 1471219089Spjd } 1472219089Spjd 1473219089Spjd dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); 1474219089Spjd dsa->dsa_dr = NULL; 1475219089Spjd dsa->dsa_done = done; 1476219089Spjd dsa->dsa_zgd = zgd; 1477219089Spjd dsa->dsa_tx = tx; 1478219089Spjd 1479219089Spjd zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, 1480219089Spjd zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp, 1481260763Savg dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa, 1482219089Spjd ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); 1483219089Spjd 1484219089Spjd return (0); 1485219089Spjd} 1486219089Spjd 1487168404Spjd/* 1488168404Spjd * Intent log support: sync the block associated with db to disk. 1489168404Spjd * N.B. and XXX: the caller is responsible for making sure that the 1490168404Spjd * data isn't changing while dmu_sync() is writing it. 1491168404Spjd * 1492168404Spjd * Return values: 1493168404Spjd * 1494243524Smm * EEXIST: this txg has already been synced, so there's nothing to do. 1495168404Spjd * The caller should not log the write. 1496168404Spjd * 1497168404Spjd * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. 1498168404Spjd * The caller should not log the write. 1499168404Spjd * 1500168404Spjd * EALREADY: this block is already in the process of being synced. 1501168404Spjd * The caller should track its progress (somehow). 1502168404Spjd * 1503219089Spjd * EIO: could not do the I/O. 1504219089Spjd * The caller should do a txg_wait_synced(). 1505168404Spjd * 1506219089Spjd * 0: the I/O has been initiated. 1507219089Spjd * The caller should log this blkptr in the done callback. 1508219089Spjd * It is possible that the I/O will fail, in which case 1509219089Spjd * the error will be reported to the done callback and 1510219089Spjd * propagated to pio from zio_done(). 1511168404Spjd */ 1512168404Spjdint 1513219089Spjddmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) 1514168404Spjd{ 1515219089Spjd blkptr_t *bp = zgd->zgd_bp; 1516219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; 1517219089Spjd objset_t *os = db->db_objset; 1518219089Spjd dsl_dataset_t *ds = os->os_dsl_dataset; 1519168404Spjd dbuf_dirty_record_t *dr; 1520219089Spjd dmu_sync_arg_t *dsa; 1521268657Sdelphij zbookmark_phys_t zb; 1522219089Spjd zio_prop_t zp; 1523219089Spjd dnode_t *dn; 1524168404Spjd 1525219089Spjd ASSERT(pio != NULL); 1526168404Spjd ASSERT(txg != 0); 1527168404Spjd 1528219089Spjd SET_BOOKMARK(&zb, ds->ds_object, 1529219089Spjd db->db.db_object, db->db_level, db->db_blkid); 1530168404Spjd 1531219089Spjd DB_DNODE_ENTER(db); 1532219089Spjd dn = DB_DNODE(db); 1533219089Spjd dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); 1534219089Spjd DB_DNODE_EXIT(db); 1535219089Spjd 1536168404Spjd /* 1537219089Spjd * If we're frozen (running ziltest), we always need to generate a bp. 1538168404Spjd */ 1539219089Spjd if (txg > spa_freeze_txg(os->os_spa)) 1540219089Spjd return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); 1541168404Spjd 1542168404Spjd /* 1543219089Spjd * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() 1544219089Spjd * and us. If we determine that this txg is not yet syncing, 1545219089Spjd * but it begins to sync a moment later, that's OK because the 1546219089Spjd * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. 1547168404Spjd */ 1548219089Spjd mutex_enter(&db->db_mtx); 1549219089Spjd 1550219089Spjd if (txg <= spa_last_synced_txg(os->os_spa)) { 1551168404Spjd /* 1552219089Spjd * This txg has already synced. There's nothing to do. 1553168404Spjd */ 1554219089Spjd mutex_exit(&db->db_mtx); 1555249195Smm return (SET_ERROR(EEXIST)); 1556168404Spjd } 1557168404Spjd 1558219089Spjd if (txg <= spa_syncing_txg(os->os_spa)) { 1559219089Spjd /* 1560219089Spjd * This txg is currently syncing, so we can't mess with 1561219089Spjd * the dirty record anymore; just write a new log block. 1562219089Spjd */ 1563219089Spjd mutex_exit(&db->db_mtx); 1564219089Spjd return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); 1565168404Spjd } 1566168404Spjd 1567168404Spjd dr = db->db_last_dirty; 1568219089Spjd while (dr && dr->dr_txg != txg) 1569168404Spjd dr = dr->dr_next; 1570219089Spjd 1571219089Spjd if (dr == NULL) { 1572168404Spjd /* 1573219089Spjd * There's no dr for this dbuf, so it must have been freed. 1574168404Spjd * There's no need to log writes to freed blocks, so we're done. 1575168404Spjd */ 1576168404Spjd mutex_exit(&db->db_mtx); 1577249195Smm return (SET_ERROR(ENOENT)); 1578168404Spjd } 1579168404Spjd 1580243524Smm ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); 1581243524Smm 1582243524Smm /* 1583243524Smm * Assume the on-disk data is X, the current syncing data is Y, 1584243524Smm * and the current in-memory data is Z (currently in dmu_sync). 1585243524Smm * X and Z are identical but Y is has been modified. Normally, 1586243524Smm * when X and Z are the same we will perform a nopwrite but if Y 1587243524Smm * is different we must disable nopwrite since the resulting write 1588243524Smm * of Y to disk can free the block containing X. If we allowed a 1589243524Smm * nopwrite to occur the block pointing to Z would reference a freed 1590243524Smm * block. Since this is a rare case we simplify this by disabling 1591243524Smm * nopwrite if the current dmu_sync-ing dbuf has been modified in 1592243524Smm * a previous transaction. 1593243524Smm */ 1594243524Smm if (dr->dr_next) 1595243524Smm zp.zp_nopwrite = B_FALSE; 1596243524Smm 1597168404Spjd ASSERT(dr->dr_txg == txg); 1598219089Spjd if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || 1599219089Spjd dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 1600168404Spjd /* 1601219089Spjd * We have already issued a sync write for this buffer, 1602219089Spjd * or this buffer has already been synced. It could not 1603219089Spjd * have been dirtied since, or we would have cleared the state. 1604168404Spjd */ 1605168404Spjd mutex_exit(&db->db_mtx); 1606249195Smm return (SET_ERROR(EALREADY)); 1607168404Spjd } 1608168404Spjd 1609219089Spjd ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 1610168404Spjd dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; 1611168404Spjd mutex_exit(&db->db_mtx); 1612168404Spjd 1613219089Spjd dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); 1614219089Spjd dsa->dsa_dr = dr; 1615219089Spjd dsa->dsa_done = done; 1616219089Spjd dsa->dsa_zgd = zgd; 1617219089Spjd dsa->dsa_tx = NULL; 1618168404Spjd 1619219089Spjd zio_nowait(arc_write(pio, os->os_spa, txg, 1620251478Sdelphij bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), 1621260763Savg DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, 1622260763Savg NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, 1623260763Savg ZIO_FLAG_CANFAIL, &zb)); 1624185029Spjd 1625219089Spjd return (0); 1626168404Spjd} 1627168404Spjd 1628168404Spjdint 1629168404Spjddmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, 1630168404Spjd dmu_tx_t *tx) 1631168404Spjd{ 1632168404Spjd dnode_t *dn; 1633168404Spjd int err; 1634168404Spjd 1635219089Spjd err = dnode_hold(os, object, FTAG, &dn); 1636168404Spjd if (err) 1637168404Spjd return (err); 1638168404Spjd err = dnode_set_blksz(dn, size, ibs, tx); 1639168404Spjd dnode_rele(dn, FTAG); 1640168404Spjd return (err); 1641168404Spjd} 1642168404Spjd 1643168404Spjdvoid 1644168404Spjddmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, 1645168404Spjd dmu_tx_t *tx) 1646168404Spjd{ 1647168404Spjd dnode_t *dn; 1648168404Spjd 1649268649Sdelphij /* 1650268649Sdelphij * Send streams include each object's checksum function. This 1651268649Sdelphij * check ensures that the receiving system can understand the 1652268649Sdelphij * checksum function transmitted. 1653268649Sdelphij */ 1654268649Sdelphij ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); 1655268649Sdelphij 1656268649Sdelphij VERIFY0(dnode_hold(os, object, FTAG, &dn)); 1657268649Sdelphij ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); 1658168404Spjd dn->dn_checksum = checksum; 1659168404Spjd dnode_setdirty(dn, tx); 1660168404Spjd dnode_rele(dn, FTAG); 1661168404Spjd} 1662168404Spjd 1663168404Spjdvoid 1664168404Spjddmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, 1665168404Spjd dmu_tx_t *tx) 1666168404Spjd{ 1667168404Spjd dnode_t *dn; 1668168404Spjd 1669268649Sdelphij /* 1670268649Sdelphij * Send streams include each object's compression function. This 1671268649Sdelphij * check ensures that the receiving system can understand the 1672268649Sdelphij * compression function transmitted. 1673268649Sdelphij */ 1674268649Sdelphij ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); 1675268649Sdelphij 1676268649Sdelphij VERIFY0(dnode_hold(os, object, FTAG, &dn)); 1677168404Spjd dn->dn_compress = compress; 1678168404Spjd dnode_setdirty(dn, tx); 1679168404Spjd dnode_rele(dn, FTAG); 1680168404Spjd} 1681168404Spjd 1682219089Spjdint zfs_mdcomp_disable = 0; 1683219089SpjdTUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); 1684219089SpjdSYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RW, 1685219089Spjd &zfs_mdcomp_disable, 0, "Disable metadata compression"); 1686219089Spjd 1687268647Sdelphij/* 1688268647Sdelphij * When the "redundant_metadata" property is set to "most", only indirect 1689268647Sdelphij * blocks of this level and higher will have an additional ditto block. 1690268647Sdelphij */ 1691268647Sdelphijint zfs_redundant_metadata_most_ditto_level = 2; 1692268647Sdelphij 1693219089Spjdvoid 1694219089Spjddmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) 1695219089Spjd{ 1696219089Spjd dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; 1697236884Smm boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || 1698219089Spjd (wp & WP_SPILL)); 1699219089Spjd enum zio_checksum checksum = os->os_checksum; 1700219089Spjd enum zio_compress compress = os->os_compress; 1701219089Spjd enum zio_checksum dedup_checksum = os->os_dedup_checksum; 1702243524Smm boolean_t dedup = B_FALSE; 1703243524Smm boolean_t nopwrite = B_FALSE; 1704219089Spjd boolean_t dedup_verify = os->os_dedup_verify; 1705219089Spjd int copies = os->os_copies; 1706219089Spjd 1707219089Spjd /* 1708243524Smm * We maintain different write policies for each of the following 1709243524Smm * types of data: 1710243524Smm * 1. metadata 1711243524Smm * 2. preallocated blocks (i.e. level-0 blocks of a dump device) 1712243524Smm * 3. all other level 0 blocks 1713219089Spjd */ 1714219089Spjd if (ismd) { 1715219089Spjd /* 1716243524Smm * XXX -- we should design a compression algorithm 1717243524Smm * that specializes in arrays of bps. 1718243524Smm */ 1719268658Sdelphij boolean_t lz4_ac = spa_feature_is_active(os->os_spa, 1720268658Sdelphij SPA_FEATURE_LZ4_COMPRESS); 1721243524Smm 1722268658Sdelphij if (zfs_mdcomp_disable) { 1723268658Sdelphij compress = ZIO_COMPRESS_EMPTY; 1724268658Sdelphij } else if (lz4_ac) { 1725268658Sdelphij compress = ZIO_COMPRESS_LZ4; 1726268658Sdelphij } else { 1727268658Sdelphij compress = ZIO_COMPRESS_LZJB; 1728268658Sdelphij } 1729268658Sdelphij 1730243524Smm /* 1731219089Spjd * Metadata always gets checksummed. If the data 1732219089Spjd * checksum is multi-bit correctable, and it's not a 1733219089Spjd * ZBT-style checksum, then it's suitable for metadata 1734219089Spjd * as well. Otherwise, the metadata checksum defaults 1735219089Spjd * to fletcher4. 1736219089Spjd */ 1737219089Spjd if (zio_checksum_table[checksum].ci_correctable < 1 || 1738219089Spjd zio_checksum_table[checksum].ci_eck) 1739219089Spjd checksum = ZIO_CHECKSUM_FLETCHER_4; 1740268647Sdelphij 1741268647Sdelphij if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || 1742268647Sdelphij (os->os_redundant_metadata == 1743268647Sdelphij ZFS_REDUNDANT_METADATA_MOST && 1744268647Sdelphij (level >= zfs_redundant_metadata_most_ditto_level || 1745268647Sdelphij DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) 1746268647Sdelphij copies++; 1747243524Smm } else if (wp & WP_NOFILL) { 1748243524Smm ASSERT(level == 0); 1749219089Spjd 1750219089Spjd /* 1751243524Smm * If we're writing preallocated blocks, we aren't actually 1752243524Smm * writing them so don't set any policy properties. These 1753243524Smm * blocks are currently only used by an external subsystem 1754243524Smm * outside of zfs (i.e. dump) and not written by the zio 1755243524Smm * pipeline. 1756219089Spjd */ 1757243524Smm compress = ZIO_COMPRESS_OFF; 1758255750Sdelphij checksum = ZIO_CHECKSUM_NOPARITY; 1759219089Spjd } else { 1760219089Spjd compress = zio_compress_select(dn->dn_compress, compress); 1761219089Spjd 1762243524Smm checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? 1763243524Smm zio_checksum_select(dn->dn_checksum, checksum) : 1764243524Smm dedup_checksum; 1765219089Spjd 1766243524Smm /* 1767243524Smm * Determine dedup setting. If we are in dmu_sync(), 1768243524Smm * we won't actually dedup now because that's all 1769243524Smm * done in syncing context; but we do want to use the 1770243524Smm * dedup checkum. If the checksum is not strong 1771243524Smm * enough to ensure unique signatures, force 1772243524Smm * dedup_verify. 1773243524Smm */ 1774243524Smm if (dedup_checksum != ZIO_CHECKSUM_OFF) { 1775243524Smm dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; 1776243524Smm if (!zio_checksum_table[checksum].ci_dedup) 1777243524Smm dedup_verify = B_TRUE; 1778243524Smm } 1779219089Spjd 1780243524Smm /* 1781243524Smm * Enable nopwrite if we have a cryptographically secure 1782243524Smm * checksum that has no known collisions (i.e. SHA-256) 1783243524Smm * and compression is enabled. We don't enable nopwrite if 1784243524Smm * dedup is enabled as the two features are mutually exclusive. 1785243524Smm */ 1786243524Smm nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup && 1787243524Smm compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); 1788219089Spjd } 1789219089Spjd 1790219089Spjd zp->zp_checksum = checksum; 1791219089Spjd zp->zp_compress = compress; 1792219089Spjd zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; 1793219089Spjd zp->zp_level = level; 1794268647Sdelphij zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); 1795219089Spjd zp->zp_dedup = dedup; 1796219089Spjd zp->zp_dedup_verify = dedup && dedup_verify; 1797243524Smm zp->zp_nopwrite = nopwrite; 1798219089Spjd} 1799219089Spjd 1800168404Spjdint 1801168404Spjddmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) 1802168404Spjd{ 1803168404Spjd dnode_t *dn; 1804168404Spjd int i, err; 1805168404Spjd 1806219089Spjd err = dnode_hold(os, object, FTAG, &dn); 1807168404Spjd if (err) 1808168404Spjd return (err); 1809168404Spjd /* 1810168404Spjd * Sync any current changes before 1811168404Spjd * we go trundling through the block pointers. 1812168404Spjd */ 1813168404Spjd for (i = 0; i < TXG_SIZE; i++) { 1814168404Spjd if (list_link_active(&dn->dn_dirty_link[i])) 1815168404Spjd break; 1816168404Spjd } 1817168404Spjd if (i != TXG_SIZE) { 1818168404Spjd dnode_rele(dn, FTAG); 1819168404Spjd txg_wait_synced(dmu_objset_pool(os), 0); 1820219089Spjd err = dnode_hold(os, object, FTAG, &dn); 1821168404Spjd if (err) 1822168404Spjd return (err); 1823168404Spjd } 1824168404Spjd 1825185029Spjd err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); 1826168404Spjd dnode_rele(dn, FTAG); 1827168404Spjd 1828168404Spjd return (err); 1829168404Spjd} 1830168404Spjd 1831168404Spjdvoid 1832168404Spjddmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) 1833168404Spjd{ 1834219089Spjd dnode_phys_t *dnp; 1835219089Spjd 1836168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 1837168404Spjd mutex_enter(&dn->dn_mtx); 1838168404Spjd 1839219089Spjd dnp = dn->dn_phys; 1840219089Spjd 1841168404Spjd doi->doi_data_block_size = dn->dn_datablksz; 1842168404Spjd doi->doi_metadata_block_size = dn->dn_indblkshift ? 1843168404Spjd 1ULL << dn->dn_indblkshift : 0; 1844219089Spjd doi->doi_type = dn->dn_type; 1845219089Spjd doi->doi_bonus_type = dn->dn_bonustype; 1846219089Spjd doi->doi_bonus_size = dn->dn_bonuslen; 1847168404Spjd doi->doi_indirection = dn->dn_nlevels; 1848168404Spjd doi->doi_checksum = dn->dn_checksum; 1849168404Spjd doi->doi_compress = dn->dn_compress; 1850219089Spjd doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; 1851247852Smm doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; 1852219089Spjd doi->doi_fill_count = 0; 1853219089Spjd for (int i = 0; i < dnp->dn_nblkptr; i++) 1854268649Sdelphij doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); 1855168404Spjd 1856168404Spjd mutex_exit(&dn->dn_mtx); 1857168404Spjd rw_exit(&dn->dn_struct_rwlock); 1858168404Spjd} 1859168404Spjd 1860168404Spjd/* 1861168404Spjd * Get information on a DMU object. 1862168404Spjd * If doi is NULL, just indicates whether the object exists. 1863168404Spjd */ 1864168404Spjdint 1865168404Spjddmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) 1866168404Spjd{ 1867168404Spjd dnode_t *dn; 1868219089Spjd int err = dnode_hold(os, object, FTAG, &dn); 1869168404Spjd 1870168404Spjd if (err) 1871168404Spjd return (err); 1872168404Spjd 1873168404Spjd if (doi != NULL) 1874168404Spjd dmu_object_info_from_dnode(dn, doi); 1875168404Spjd 1876168404Spjd dnode_rele(dn, FTAG); 1877168404Spjd return (0); 1878168404Spjd} 1879168404Spjd 1880168404Spjd/* 1881168404Spjd * As above, but faster; can be used when you have a held dbuf in hand. 1882168404Spjd */ 1883168404Spjdvoid 1884219089Spjddmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) 1885168404Spjd{ 1886219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1887219089Spjd 1888219089Spjd DB_DNODE_ENTER(db); 1889219089Spjd dmu_object_info_from_dnode(DB_DNODE(db), doi); 1890219089Spjd DB_DNODE_EXIT(db); 1891168404Spjd} 1892168404Spjd 1893168404Spjd/* 1894168404Spjd * Faster still when you only care about the size. 1895168404Spjd * This is specifically optimized for zfs_getattr(). 1896168404Spjd */ 1897168404Spjdvoid 1898219089Spjddmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, 1899219089Spjd u_longlong_t *nblk512) 1900168404Spjd{ 1901219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1902219089Spjd dnode_t *dn; 1903168404Spjd 1904219089Spjd DB_DNODE_ENTER(db); 1905219089Spjd dn = DB_DNODE(db); 1906219089Spjd 1907168404Spjd *blksize = dn->dn_datablksz; 1908168404Spjd /* add 1 for dnode space */ 1909168404Spjd *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> 1910168404Spjd SPA_MINBLOCKSHIFT) + 1; 1911219089Spjd DB_DNODE_EXIT(db); 1912168404Spjd} 1913168404Spjd 1914168404Spjdvoid 1915168404Spjdbyteswap_uint64_array(void *vbuf, size_t size) 1916168404Spjd{ 1917168404Spjd uint64_t *buf = vbuf; 1918168404Spjd size_t count = size >> 3; 1919168404Spjd int i; 1920168404Spjd 1921168404Spjd ASSERT((size & 7) == 0); 1922168404Spjd 1923168404Spjd for (i = 0; i < count; i++) 1924168404Spjd buf[i] = BSWAP_64(buf[i]); 1925168404Spjd} 1926168404Spjd 1927168404Spjdvoid 1928168404Spjdbyteswap_uint32_array(void *vbuf, size_t size) 1929168404Spjd{ 1930168404Spjd uint32_t *buf = vbuf; 1931168404Spjd size_t count = size >> 2; 1932168404Spjd int i; 1933168404Spjd 1934168404Spjd ASSERT((size & 3) == 0); 1935168404Spjd 1936168404Spjd for (i = 0; i < count; i++) 1937168404Spjd buf[i] = BSWAP_32(buf[i]); 1938168404Spjd} 1939168404Spjd 1940168404Spjdvoid 1941168404Spjdbyteswap_uint16_array(void *vbuf, size_t size) 1942168404Spjd{ 1943168404Spjd uint16_t *buf = vbuf; 1944168404Spjd size_t count = size >> 1; 1945168404Spjd int i; 1946168404Spjd 1947168404Spjd ASSERT((size & 1) == 0); 1948168404Spjd 1949168404Spjd for (i = 0; i < count; i++) 1950168404Spjd buf[i] = BSWAP_16(buf[i]); 1951168404Spjd} 1952168404Spjd 1953168404Spjd/* ARGSUSED */ 1954168404Spjdvoid 1955168404Spjdbyteswap_uint8_array(void *vbuf, size_t size) 1956168404Spjd{ 1957168404Spjd} 1958168404Spjd 1959168404Spjdvoid 1960168404Spjddmu_init(void) 1961168404Spjd{ 1962219089Spjd zfs_dbgmsg_init(); 1963219089Spjd sa_cache_init(); 1964219089Spjd xuio_stat_init(); 1965219089Spjd dmu_objset_init(); 1966219089Spjd dnode_init(); 1967168404Spjd dbuf_init(); 1968208130Smm zfetch_init(); 1969254608Sgibbs zio_compress_init(); 1970239620Smm l2arc_init(); 1971168404Spjd arc_init(); 1972168404Spjd} 1973168404Spjd 1974168404Spjdvoid 1975168404Spjddmu_fini(void) 1976168404Spjd{ 1977251629Sdelphij arc_fini(); /* arc depends on l2arc, so arc must go first */ 1978219089Spjd l2arc_fini(); 1979208130Smm zfetch_fini(); 1980254608Sgibbs zio_compress_fini(); 1981219089Spjd dbuf_fini(); 1982168404Spjd dnode_fini(); 1983219089Spjd dmu_objset_fini(); 1984219089Spjd xuio_stat_fini(); 1985219089Spjd sa_cache_fini(); 1986219089Spjd zfs_dbgmsg_fini(); 1987168404Spjd} 1988