1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23249195Smm * Copyright (c) 2013 by Delphix. All rights reserved. 24168404Spjd */ 25251478Sdelphij/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ 26255750Sdelphij/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */ 27251478Sdelphij 28168404Spjd#include <sys/dmu.h> 29168404Spjd#include <sys/dmu_impl.h> 30168404Spjd#include <sys/dmu_tx.h> 31168404Spjd#include <sys/dbuf.h> 32168404Spjd#include <sys/dnode.h> 33168404Spjd#include <sys/zfs_context.h> 34168404Spjd#include <sys/dmu_objset.h> 35168404Spjd#include <sys/dmu_traverse.h> 36168404Spjd#include <sys/dsl_dataset.h> 37168404Spjd#include <sys/dsl_dir.h> 38168404Spjd#include <sys/dsl_pool.h> 39168404Spjd#include <sys/dsl_synctask.h> 40168404Spjd#include <sys/dsl_prop.h> 41168404Spjd#include <sys/dmu_zfetch.h> 42168404Spjd#include <sys/zfs_ioctl.h> 43168404Spjd#include <sys/zap.h> 44168404Spjd#include <sys/zio_checksum.h> 45243524Smm#include <sys/zio_compress.h> 46219089Spjd#include <sys/sa.h> 47219089Spjd#ifdef _KERNEL 48185029Spjd#include <sys/zfs_znode.h> 49219089Spjd#endif 50168404Spjd 51243524Smm/* 52243524Smm * Enable/disable nopwrite feature. 53243524Smm */ 54243524Smmint zfs_nopwrite_enabled = 1; 55243525SmmSYSCTL_DECL(_vfs_zfs); 56243525SmmTUNABLE_INT("vfs.zfs.nopwrite_enabled", &zfs_nopwrite_enabled); 57243525SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN, 58243525Smm &zfs_nopwrite_enabled, 0, "Enable nopwrite feature"); 59243524Smm 60168404Spjdconst dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { 61236884Smm { DMU_BSWAP_UINT8, TRUE, "unallocated" }, 62236884Smm { DMU_BSWAP_ZAP, TRUE, "object directory" }, 63236884Smm { DMU_BSWAP_UINT64, TRUE, "object array" }, 64236884Smm { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, 65236884Smm { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, 66236884Smm { DMU_BSWAP_UINT64, TRUE, "bpobj" }, 67236884Smm { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, 68236884Smm { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, 69236884Smm { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, 70236884Smm { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, 71236884Smm { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, 72236884Smm { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, 73236884Smm { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, 74236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, 75236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, 76236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL props" }, 77236884Smm { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, 78236884Smm { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" }, 79236884Smm { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, 80236884Smm { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, 81236884Smm { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, 82236884Smm { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, 83236884Smm { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, 84236884Smm { DMU_BSWAP_UINT8, FALSE, "zvol object" }, 85236884Smm { DMU_BSWAP_ZAP, TRUE, "zvol prop" }, 86236884Smm { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, 87236884Smm { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, 88236884Smm { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, 89236884Smm { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, 90236884Smm { DMU_BSWAP_UINT8, TRUE, "SPA history" }, 91236884Smm { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, 92236884Smm { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, 93236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, 94236884Smm { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, 95236884Smm { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, 96236884Smm { DMU_BSWAP_UINT8, TRUE, "FUID table" }, 97236884Smm { DMU_BSWAP_UINT64, TRUE, "FUID table size" }, 98236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, 99236884Smm { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, 100236884Smm { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, 101236884Smm { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, 102236884Smm { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, 103236884Smm { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, 104236884Smm { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, 105236884Smm { DMU_BSWAP_UINT8, TRUE, "System attributes" }, 106236884Smm { DMU_BSWAP_ZAP, TRUE, "SA master node" }, 107236884Smm { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, 108236884Smm { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, 109236884Smm { DMU_BSWAP_ZAP, TRUE, "scan translations" }, 110236884Smm { DMU_BSWAP_UINT8, FALSE, "deduplicated block" }, 111236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, 112236884Smm { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, 113236884Smm { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, 114236884Smm { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } 115168404Spjd}; 116168404Spjd 117236884Smmconst dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { 118236884Smm { byteswap_uint8_array, "uint8" }, 119236884Smm { byteswap_uint16_array, "uint16" }, 120236884Smm { byteswap_uint32_array, "uint32" }, 121236884Smm { byteswap_uint64_array, "uint64" }, 122236884Smm { zap_byteswap, "zap" }, 123236884Smm { dnode_buf_byteswap, "dnode" }, 124236884Smm { dmu_objset_byteswap, "objset" }, 125236884Smm { zfs_znode_byteswap, "znode" }, 126236884Smm { zfs_oldacl_byteswap, "oldacl" }, 127236884Smm { zfs_acl_byteswap, "acl" } 128236884Smm}; 129236884Smm 130168404Spjdint 131168404Spjddmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, 132219089Spjd void *tag, dmu_buf_t **dbp, int flags) 133168404Spjd{ 134168404Spjd dnode_t *dn; 135168404Spjd uint64_t blkid; 136168404Spjd dmu_buf_impl_t *db; 137168404Spjd int err; 138219089Spjd int db_flags = DB_RF_CANFAIL; 139168404Spjd 140219089Spjd if (flags & DMU_READ_NO_PREFETCH) 141219089Spjd db_flags |= DB_RF_NOPREFETCH; 142219089Spjd 143219089Spjd err = dnode_hold(os, object, FTAG, &dn); 144168404Spjd if (err) 145168404Spjd return (err); 146168404Spjd blkid = dbuf_whichblock(dn, offset); 147168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 148168404Spjd db = dbuf_hold(dn, blkid, tag); 149168404Spjd rw_exit(&dn->dn_struct_rwlock); 150168404Spjd if (db == NULL) { 151249195Smm err = SET_ERROR(EIO); 152168404Spjd } else { 153219089Spjd err = dbuf_read(db, NULL, db_flags); 154168404Spjd if (err) { 155168404Spjd dbuf_rele(db, tag); 156168404Spjd db = NULL; 157168404Spjd } 158168404Spjd } 159168404Spjd 160168404Spjd dnode_rele(dn, FTAG); 161219089Spjd *dbp = &db->db; /* NULL db plus first field offset is NULL */ 162168404Spjd return (err); 163168404Spjd} 164168404Spjd 165168404Spjdint 166168404Spjddmu_bonus_max(void) 167168404Spjd{ 168168404Spjd return (DN_MAX_BONUSLEN); 169168404Spjd} 170168404Spjd 171185029Spjdint 172219089Spjddmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) 173185029Spjd{ 174219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 175219089Spjd dnode_t *dn; 176219089Spjd int error; 177185029Spjd 178219089Spjd DB_DNODE_ENTER(db); 179219089Spjd dn = DB_DNODE(db); 180219089Spjd 181219089Spjd if (dn->dn_bonus != db) { 182249195Smm error = SET_ERROR(EINVAL); 183219089Spjd } else if (newsize < 0 || newsize > db_fake->db_size) { 184249195Smm error = SET_ERROR(EINVAL); 185219089Spjd } else { 186219089Spjd dnode_setbonuslen(dn, newsize, tx); 187219089Spjd error = 0; 188219089Spjd } 189219089Spjd 190219089Spjd DB_DNODE_EXIT(db); 191219089Spjd return (error); 192185029Spjd} 193185029Spjd 194219089Spjdint 195219089Spjddmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) 196219089Spjd{ 197219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 198219089Spjd dnode_t *dn; 199219089Spjd int error; 200219089Spjd 201219089Spjd DB_DNODE_ENTER(db); 202219089Spjd dn = DB_DNODE(db); 203219089Spjd 204236884Smm if (!DMU_OT_IS_VALID(type)) { 205249195Smm error = SET_ERROR(EINVAL); 206219089Spjd } else if (dn->dn_bonus != db) { 207249195Smm error = SET_ERROR(EINVAL); 208219089Spjd } else { 209219089Spjd dnode_setbonus_type(dn, type, tx); 210219089Spjd error = 0; 211219089Spjd } 212219089Spjd 213219089Spjd DB_DNODE_EXIT(db); 214219089Spjd return (error); 215219089Spjd} 216219089Spjd 217219089Spjddmu_object_type_t 218219089Spjddmu_get_bonustype(dmu_buf_t *db_fake) 219219089Spjd{ 220219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 221219089Spjd dnode_t *dn; 222219089Spjd dmu_object_type_t type; 223219089Spjd 224219089Spjd DB_DNODE_ENTER(db); 225219089Spjd dn = DB_DNODE(db); 226219089Spjd type = dn->dn_bonustype; 227219089Spjd DB_DNODE_EXIT(db); 228219089Spjd 229219089Spjd return (type); 230219089Spjd} 231219089Spjd 232219089Spjdint 233219089Spjddmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) 234219089Spjd{ 235219089Spjd dnode_t *dn; 236219089Spjd int error; 237219089Spjd 238219089Spjd error = dnode_hold(os, object, FTAG, &dn); 239219089Spjd dbuf_rm_spill(dn, tx); 240219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 241219089Spjd dnode_rm_spill(dn, tx); 242219089Spjd rw_exit(&dn->dn_struct_rwlock); 243219089Spjd dnode_rele(dn, FTAG); 244219089Spjd return (error); 245219089Spjd} 246219089Spjd 247168404Spjd/* 248168404Spjd * returns ENOENT, EIO, or 0. 249168404Spjd */ 250168404Spjdint 251168404Spjddmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) 252168404Spjd{ 253168404Spjd dnode_t *dn; 254168404Spjd dmu_buf_impl_t *db; 255185029Spjd int error; 256168404Spjd 257219089Spjd error = dnode_hold(os, object, FTAG, &dn); 258185029Spjd if (error) 259185029Spjd return (error); 260168404Spjd 261168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 262168404Spjd if (dn->dn_bonus == NULL) { 263168404Spjd rw_exit(&dn->dn_struct_rwlock); 264168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 265168404Spjd if (dn->dn_bonus == NULL) 266185029Spjd dbuf_create_bonus(dn); 267168404Spjd } 268168404Spjd db = dn->dn_bonus; 269185029Spjd 270185029Spjd /* as long as the bonus buf is held, the dnode will be held */ 271219089Spjd if (refcount_add(&db->db_holds, tag) == 1) { 272185029Spjd VERIFY(dnode_add_ref(dn, db)); 273219089Spjd (void) atomic_inc_32_nv(&dn->dn_dbufs_count); 274219089Spjd } 275185029Spjd 276219089Spjd /* 277219089Spjd * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's 278219089Spjd * hold and incrementing the dbuf count to ensure that dnode_move() sees 279219089Spjd * a dnode hold for every dbuf. 280219089Spjd */ 281219089Spjd rw_exit(&dn->dn_struct_rwlock); 282219089Spjd 283168404Spjd dnode_rele(dn, FTAG); 284168404Spjd 285219089Spjd VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); 286168404Spjd 287168404Spjd *dbp = &db->db; 288168404Spjd return (0); 289168404Spjd} 290168404Spjd 291168404Spjd/* 292219089Spjd * returns ENOENT, EIO, or 0. 293219089Spjd * 294219089Spjd * This interface will allocate a blank spill dbuf when a spill blk 295219089Spjd * doesn't already exist on the dnode. 296219089Spjd * 297219089Spjd * if you only want to find an already existing spill db, then 298219089Spjd * dmu_spill_hold_existing() should be used. 299219089Spjd */ 300219089Spjdint 301219089Spjddmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) 302219089Spjd{ 303219089Spjd dmu_buf_impl_t *db = NULL; 304219089Spjd int err; 305219089Spjd 306219089Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 307219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 308219089Spjd 309219089Spjd db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); 310219089Spjd 311219089Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 312219089Spjd rw_exit(&dn->dn_struct_rwlock); 313219089Spjd 314219089Spjd ASSERT(db != NULL); 315219089Spjd err = dbuf_read(db, NULL, flags); 316219089Spjd if (err == 0) 317219089Spjd *dbp = &db->db; 318219089Spjd else 319219089Spjd dbuf_rele(db, tag); 320219089Spjd return (err); 321219089Spjd} 322219089Spjd 323219089Spjdint 324219089Spjddmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) 325219089Spjd{ 326219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; 327219089Spjd dnode_t *dn; 328219089Spjd int err; 329219089Spjd 330219089Spjd DB_DNODE_ENTER(db); 331219089Spjd dn = DB_DNODE(db); 332219089Spjd 333219089Spjd if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { 334249195Smm err = SET_ERROR(EINVAL); 335219089Spjd } else { 336219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 337219089Spjd 338219089Spjd if (!dn->dn_have_spill) { 339249195Smm err = SET_ERROR(ENOENT); 340219089Spjd } else { 341219089Spjd err = dmu_spill_hold_by_dnode(dn, 342219089Spjd DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); 343219089Spjd } 344219089Spjd 345219089Spjd rw_exit(&dn->dn_struct_rwlock); 346219089Spjd } 347219089Spjd 348219089Spjd DB_DNODE_EXIT(db); 349219089Spjd return (err); 350219089Spjd} 351219089Spjd 352219089Spjdint 353219089Spjddmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) 354219089Spjd{ 355219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; 356219089Spjd dnode_t *dn; 357219089Spjd int err; 358219089Spjd 359219089Spjd DB_DNODE_ENTER(db); 360219089Spjd dn = DB_DNODE(db); 361219089Spjd err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); 362219089Spjd DB_DNODE_EXIT(db); 363219089Spjd 364219089Spjd return (err); 365219089Spjd} 366219089Spjd 367219089Spjd/* 368168404Spjd * Note: longer-term, we should modify all of the dmu_buf_*() interfaces 369168404Spjd * to take a held dnode rather than <os, object> -- the lookup is wasteful, 370168404Spjd * and can induce severe lock contention when writing to several files 371168404Spjd * whose dnodes are in the same block. 372168404Spjd */ 373168404Spjdstatic int 374209962Smmdmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, 375209962Smm int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) 376168404Spjd{ 377185029Spjd dsl_pool_t *dp = NULL; 378168404Spjd dmu_buf_t **dbp; 379168404Spjd uint64_t blkid, nblks, i; 380209962Smm uint32_t dbuf_flags; 381168404Spjd int err; 382168404Spjd zio_t *zio; 383185029Spjd hrtime_t start; 384168404Spjd 385168404Spjd ASSERT(length <= DMU_MAX_ACCESS); 386168404Spjd 387214378Smm dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; 388209962Smm if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) 389209962Smm dbuf_flags |= DB_RF_NOPREFETCH; 390168404Spjd 391168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 392168404Spjd if (dn->dn_datablkshift) { 393168404Spjd int blkshift = dn->dn_datablkshift; 394168404Spjd nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) - 395168404Spjd P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; 396168404Spjd } else { 397168404Spjd if (offset + length > dn->dn_datablksz) { 398168404Spjd zfs_panic_recover("zfs: accessing past end of object " 399168404Spjd "%llx/%llx (size=%u access=%llu+%llu)", 400168404Spjd (longlong_t)dn->dn_objset-> 401168404Spjd os_dsl_dataset->ds_object, 402168404Spjd (longlong_t)dn->dn_object, dn->dn_datablksz, 403168404Spjd (longlong_t)offset, (longlong_t)length); 404214378Smm rw_exit(&dn->dn_struct_rwlock); 405249195Smm return (SET_ERROR(EIO)); 406168404Spjd } 407168404Spjd nblks = 1; 408168404Spjd } 409168404Spjd dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); 410168404Spjd 411185029Spjd if (dn->dn_objset->os_dsl_dataset) 412185029Spjd dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; 413247187Smm start = gethrtime(); 414185029Spjd zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); 415168404Spjd blkid = dbuf_whichblock(dn, offset); 416168404Spjd for (i = 0; i < nblks; i++) { 417168404Spjd dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); 418168404Spjd if (db == NULL) { 419168404Spjd rw_exit(&dn->dn_struct_rwlock); 420168404Spjd dmu_buf_rele_array(dbp, nblks, tag); 421168404Spjd zio_nowait(zio); 422249195Smm return (SET_ERROR(EIO)); 423168404Spjd } 424168404Spjd /* initiate async i/o */ 425226620Spjd if (read) 426209962Smm (void) dbuf_read(db, zio, dbuf_flags); 427226620Spjd#ifdef _KERNEL 428226620Spjd else 429226620Spjd curthread->td_ru.ru_oublock++; 430226620Spjd#endif 431168404Spjd dbp[i] = &db->db; 432168404Spjd } 433168404Spjd rw_exit(&dn->dn_struct_rwlock); 434168404Spjd 435168404Spjd /* wait for async i/o */ 436168404Spjd err = zio_wait(zio); 437185029Spjd /* track read overhead when we are in sync context */ 438185029Spjd if (dp && dsl_pool_sync_context(dp)) 439185029Spjd dp->dp_read_overhead += gethrtime() - start; 440168404Spjd if (err) { 441168404Spjd dmu_buf_rele_array(dbp, nblks, tag); 442168404Spjd return (err); 443168404Spjd } 444168404Spjd 445168404Spjd /* wait for other io to complete */ 446168404Spjd if (read) { 447168404Spjd for (i = 0; i < nblks; i++) { 448168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; 449168404Spjd mutex_enter(&db->db_mtx); 450168404Spjd while (db->db_state == DB_READ || 451168404Spjd db->db_state == DB_FILL) 452168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 453168404Spjd if (db->db_state == DB_UNCACHED) 454249195Smm err = SET_ERROR(EIO); 455168404Spjd mutex_exit(&db->db_mtx); 456168404Spjd if (err) { 457168404Spjd dmu_buf_rele_array(dbp, nblks, tag); 458168404Spjd return (err); 459168404Spjd } 460168404Spjd } 461168404Spjd } 462168404Spjd 463168404Spjd *numbufsp = nblks; 464168404Spjd *dbpp = dbp; 465168404Spjd return (0); 466168404Spjd} 467168404Spjd 468168404Spjdstatic int 469168404Spjddmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, 470168404Spjd uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) 471168404Spjd{ 472168404Spjd dnode_t *dn; 473168404Spjd int err; 474168404Spjd 475219089Spjd err = dnode_hold(os, object, FTAG, &dn); 476168404Spjd if (err) 477168404Spjd return (err); 478168404Spjd 479168404Spjd err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, 480209962Smm numbufsp, dbpp, DMU_READ_PREFETCH); 481168404Spjd 482168404Spjd dnode_rele(dn, FTAG); 483168404Spjd 484168404Spjd return (err); 485168404Spjd} 486168404Spjd 487168404Spjdint 488219089Spjddmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, 489168404Spjd uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) 490168404Spjd{ 491219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 492219089Spjd dnode_t *dn; 493168404Spjd int err; 494168404Spjd 495219089Spjd DB_DNODE_ENTER(db); 496219089Spjd dn = DB_DNODE(db); 497168404Spjd err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, 498209962Smm numbufsp, dbpp, DMU_READ_PREFETCH); 499219089Spjd DB_DNODE_EXIT(db); 500168404Spjd 501168404Spjd return (err); 502168404Spjd} 503168404Spjd 504168404Spjdvoid 505168404Spjddmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) 506168404Spjd{ 507168404Spjd int i; 508168404Spjd dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; 509168404Spjd 510168404Spjd if (numbufs == 0) 511168404Spjd return; 512168404Spjd 513168404Spjd for (i = 0; i < numbufs; i++) { 514168404Spjd if (dbp[i]) 515168404Spjd dbuf_rele(dbp[i], tag); 516168404Spjd } 517168404Spjd 518168404Spjd kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); 519168404Spjd} 520168404Spjd 521168404Spjdvoid 522168404Spjddmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) 523168404Spjd{ 524168404Spjd dnode_t *dn; 525168404Spjd uint64_t blkid; 526168404Spjd int nblks, i, err; 527168404Spjd 528194043Skmacy if (zfs_prefetch_disable) 529168404Spjd return; 530168404Spjd 531168404Spjd if (len == 0) { /* they're interested in the bonus buffer */ 532219089Spjd dn = DMU_META_DNODE(os); 533168404Spjd 534168404Spjd if (object == 0 || object >= DN_MAX_OBJECT) 535168404Spjd return; 536168404Spjd 537168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 538168404Spjd blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); 539168404Spjd dbuf_prefetch(dn, blkid); 540168404Spjd rw_exit(&dn->dn_struct_rwlock); 541168404Spjd return; 542168404Spjd } 543168404Spjd 544168404Spjd /* 545168404Spjd * XXX - Note, if the dnode for the requested object is not 546168404Spjd * already cached, we will do a *synchronous* read in the 547168404Spjd * dnode_hold() call. The same is true for any indirects. 548168404Spjd */ 549219089Spjd err = dnode_hold(os, object, FTAG, &dn); 550168404Spjd if (err != 0) 551168404Spjd return; 552168404Spjd 553168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 554168404Spjd if (dn->dn_datablkshift) { 555168404Spjd int blkshift = dn->dn_datablkshift; 556168404Spjd nblks = (P2ROUNDUP(offset+len, 1<<blkshift) - 557168404Spjd P2ALIGN(offset, 1<<blkshift)) >> blkshift; 558168404Spjd } else { 559168404Spjd nblks = (offset < dn->dn_datablksz); 560168404Spjd } 561168404Spjd 562168404Spjd if (nblks != 0) { 563168404Spjd blkid = dbuf_whichblock(dn, offset); 564168404Spjd for (i = 0; i < nblks; i++) 565168404Spjd dbuf_prefetch(dn, blkid+i); 566168404Spjd } 567168404Spjd 568168404Spjd rw_exit(&dn->dn_struct_rwlock); 569168404Spjd 570168404Spjd dnode_rele(dn, FTAG); 571168404Spjd} 572168404Spjd 573208775Smm/* 574208775Smm * Get the next "chunk" of file data to free. We traverse the file from 575208775Smm * the end so that the file gets shorter over time (if we crashes in the 576208775Smm * middle, this will leave us in a better state). We find allocated file 577208775Smm * data by simply searching the allocated level 1 indirects. 578254753Sdelphij * 579254753Sdelphij * On input, *start should be the first offset that does not need to be 580254753Sdelphij * freed (e.g. "offset + length"). On return, *start will be the first 581254753Sdelphij * offset that should be freed. 582208775Smm */ 583185029Spjdstatic int 584254753Sdelphijget_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum) 585185029Spjd{ 586254753Sdelphij uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); 587254753Sdelphij /* bytes of data covered by a level-1 indirect block */ 588208775Smm uint64_t iblkrange = 589185029Spjd dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); 590185029Spjd 591254753Sdelphij ASSERT3U(minimum, <=, *start); 592185029Spjd 593254753Sdelphij if (*start - minimum <= iblkrange * maxblks) { 594254753Sdelphij *start = minimum; 595185029Spjd return (0); 596185029Spjd } 597208775Smm ASSERT(ISP2(iblkrange)); 598185029Spjd 599254753Sdelphij for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) { 600185029Spjd int err; 601185029Spjd 602254753Sdelphij /* 603254753Sdelphij * dnode_next_offset(BACKWARDS) will find an allocated L1 604254753Sdelphij * indirect block at or before the input offset. We must 605254753Sdelphij * decrement *start so that it is at the end of the region 606254753Sdelphij * to search. 607254753Sdelphij */ 608254753Sdelphij (*start)--; 609185029Spjd err = dnode_next_offset(dn, 610208775Smm DNODE_FIND_BACKWARDS, start, 2, 1, 0); 611185029Spjd 612254753Sdelphij /* if there are no indirect blocks before start, we are done */ 613208775Smm if (err == ESRCH) { 614254753Sdelphij *start = minimum; 615254753Sdelphij break; 616254753Sdelphij } else if (err != 0) { 617208775Smm return (err); 618185029Spjd } 619185029Spjd 620254753Sdelphij /* set start to the beginning of this L1 indirect */ 621208775Smm *start = P2ALIGN(*start, iblkrange); 622185029Spjd } 623254753Sdelphij if (*start < minimum) 624254753Sdelphij *start = minimum; 625185029Spjd return (0); 626185029Spjd} 627185029Spjd 628185029Spjdstatic int 629185029Spjddmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, 630254753Sdelphij uint64_t length) 631185029Spjd{ 632254753Sdelphij uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; 633254753Sdelphij int err; 634185029Spjd 635254753Sdelphij if (offset >= object_size) 636185029Spjd return (0); 637185029Spjd 638254753Sdelphij if (length == DMU_OBJECT_END || offset + length > object_size) 639254753Sdelphij length = object_size - offset; 640254753Sdelphij 641254753Sdelphij while (length != 0) { 642254753Sdelphij uint64_t chunk_end, chunk_begin; 643254753Sdelphij 644254753Sdelphij chunk_end = chunk_begin = offset + length; 645254753Sdelphij 646254753Sdelphij /* move chunk_begin backwards to the beginning of this chunk */ 647254753Sdelphij err = get_next_chunk(dn, &chunk_begin, offset); 648185029Spjd if (err) 649185029Spjd return (err); 650254753Sdelphij ASSERT3U(chunk_begin, >=, offset); 651254753Sdelphij ASSERT3U(chunk_begin, <=, chunk_end); 652185029Spjd 653254753Sdelphij dmu_tx_t *tx = dmu_tx_create(os); 654254753Sdelphij dmu_tx_hold_free(tx, dn->dn_object, 655254753Sdelphij chunk_begin, chunk_end - chunk_begin); 656185029Spjd err = dmu_tx_assign(tx, TXG_WAIT); 657185029Spjd if (err) { 658185029Spjd dmu_tx_abort(tx); 659185029Spjd return (err); 660185029Spjd } 661254753Sdelphij dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx); 662254753Sdelphij dmu_tx_commit(tx); 663185029Spjd 664254753Sdelphij length -= chunk_end - chunk_begin; 665185029Spjd } 666185029Spjd return (0); 667185029Spjd} 668185029Spjd 669168404Spjdint 670185029Spjddmu_free_long_range(objset_t *os, uint64_t object, 671185029Spjd uint64_t offset, uint64_t length) 672185029Spjd{ 673185029Spjd dnode_t *dn; 674185029Spjd int err; 675185029Spjd 676219089Spjd err = dnode_hold(os, object, FTAG, &dn); 677185029Spjd if (err != 0) 678185029Spjd return (err); 679254753Sdelphij err = dmu_free_long_range_impl(os, dn, offset, length); 680256259Savg 681256259Savg /* 682256259Savg * It is important to zero out the maxblkid when freeing the entire 683256259Savg * file, so that (a) subsequent calls to dmu_free_long_range_impl() 684256259Savg * will take the fast path, and (b) dnode_reallocate() can verify 685256259Savg * that the entire file has been freed. 686256259Savg */ 687256259Savg if (offset == 0 && length == DMU_OBJECT_END) 688256259Savg dn->dn_maxblkid = 0; 689256259Savg 690185029Spjd dnode_rele(dn, FTAG); 691185029Spjd return (err); 692185029Spjd} 693185029Spjd 694185029Spjdint 695254753Sdelphijdmu_free_long_object(objset_t *os, uint64_t object) 696185029Spjd{ 697185029Spjd dmu_tx_t *tx; 698185029Spjd int err; 699185029Spjd 700254753Sdelphij err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); 701185029Spjd if (err != 0) 702185029Spjd return (err); 703254753Sdelphij 704254753Sdelphij tx = dmu_tx_create(os); 705254753Sdelphij dmu_tx_hold_bonus(tx, object); 706254753Sdelphij dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 707254753Sdelphij err = dmu_tx_assign(tx, TXG_WAIT); 708254753Sdelphij if (err == 0) { 709254753Sdelphij err = dmu_object_free(os, object, tx); 710254753Sdelphij dmu_tx_commit(tx); 711185029Spjd } else { 712254753Sdelphij dmu_tx_abort(tx); 713185029Spjd } 714254753Sdelphij 715185029Spjd return (err); 716185029Spjd} 717185029Spjd 718185029Spjdint 719168404Spjddmu_free_range(objset_t *os, uint64_t object, uint64_t offset, 720168404Spjd uint64_t size, dmu_tx_t *tx) 721168404Spjd{ 722168404Spjd dnode_t *dn; 723219089Spjd int err = dnode_hold(os, object, FTAG, &dn); 724168404Spjd if (err) 725168404Spjd return (err); 726168404Spjd ASSERT(offset < UINT64_MAX); 727168404Spjd ASSERT(size == -1ULL || size <= UINT64_MAX - offset); 728168404Spjd dnode_free_range(dn, offset, size, tx); 729168404Spjd dnode_rele(dn, FTAG); 730168404Spjd return (0); 731168404Spjd} 732168404Spjd 733168404Spjdint 734168404Spjddmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 735209962Smm void *buf, uint32_t flags) 736168404Spjd{ 737168404Spjd dnode_t *dn; 738168404Spjd dmu_buf_t **dbp; 739214378Smm int numbufs, err; 740168404Spjd 741219089Spjd err = dnode_hold(os, object, FTAG, &dn); 742168404Spjd if (err) 743168404Spjd return (err); 744168404Spjd 745168404Spjd /* 746168404Spjd * Deal with odd block sizes, where there can't be data past the first 747168404Spjd * block. If we ever do the tail block optimization, we will need to 748168404Spjd * handle that here as well. 749168404Spjd */ 750214378Smm if (dn->dn_maxblkid == 0) { 751168404Spjd int newsz = offset > dn->dn_datablksz ? 0 : 752168404Spjd MIN(size, dn->dn_datablksz - offset); 753168404Spjd bzero((char *)buf + newsz, size - newsz); 754168404Spjd size = newsz; 755168404Spjd } 756168404Spjd 757168404Spjd while (size > 0) { 758168404Spjd uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); 759214378Smm int i; 760168404Spjd 761168404Spjd /* 762168404Spjd * NB: we could do this block-at-a-time, but it's nice 763168404Spjd * to be reading in parallel. 764168404Spjd */ 765168404Spjd err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, 766209962Smm TRUE, FTAG, &numbufs, &dbp, flags); 767168404Spjd if (err) 768185029Spjd break; 769168404Spjd 770168404Spjd for (i = 0; i < numbufs; i++) { 771168404Spjd int tocpy; 772168404Spjd int bufoff; 773168404Spjd dmu_buf_t *db = dbp[i]; 774168404Spjd 775168404Spjd ASSERT(size > 0); 776168404Spjd 777168404Spjd bufoff = offset - db->db_offset; 778168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 779168404Spjd 780168404Spjd bcopy((char *)db->db_data + bufoff, buf, tocpy); 781168404Spjd 782168404Spjd offset += tocpy; 783168404Spjd size -= tocpy; 784168404Spjd buf = (char *)buf + tocpy; 785168404Spjd } 786168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 787168404Spjd } 788168404Spjd dnode_rele(dn, FTAG); 789185029Spjd return (err); 790168404Spjd} 791168404Spjd 792168404Spjdvoid 793168404Spjddmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 794168404Spjd const void *buf, dmu_tx_t *tx) 795168404Spjd{ 796168404Spjd dmu_buf_t **dbp; 797168404Spjd int numbufs, i; 798168404Spjd 799168404Spjd if (size == 0) 800168404Spjd return; 801168404Spjd 802168404Spjd VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, 803168404Spjd FALSE, FTAG, &numbufs, &dbp)); 804168404Spjd 805168404Spjd for (i = 0; i < numbufs; i++) { 806168404Spjd int tocpy; 807168404Spjd int bufoff; 808168404Spjd dmu_buf_t *db = dbp[i]; 809168404Spjd 810168404Spjd ASSERT(size > 0); 811168404Spjd 812168404Spjd bufoff = offset - db->db_offset; 813168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 814168404Spjd 815168404Spjd ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 816168404Spjd 817168404Spjd if (tocpy == db->db_size) 818168404Spjd dmu_buf_will_fill(db, tx); 819168404Spjd else 820168404Spjd dmu_buf_will_dirty(db, tx); 821168404Spjd 822168404Spjd bcopy(buf, (char *)db->db_data + bufoff, tocpy); 823168404Spjd 824168404Spjd if (tocpy == db->db_size) 825168404Spjd dmu_buf_fill_done(db, tx); 826168404Spjd 827168404Spjd offset += tocpy; 828168404Spjd size -= tocpy; 829168404Spjd buf = (char *)buf + tocpy; 830168404Spjd } 831168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 832168404Spjd} 833168404Spjd 834219089Spjdvoid 835219089Spjddmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 836219089Spjd dmu_tx_t *tx) 837219089Spjd{ 838219089Spjd dmu_buf_t **dbp; 839219089Spjd int numbufs, i; 840219089Spjd 841219089Spjd if (size == 0) 842219089Spjd return; 843219089Spjd 844219089Spjd VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, 845219089Spjd FALSE, FTAG, &numbufs, &dbp)); 846219089Spjd 847219089Spjd for (i = 0; i < numbufs; i++) { 848219089Spjd dmu_buf_t *db = dbp[i]; 849219089Spjd 850219089Spjd dmu_buf_will_not_fill(db, tx); 851219089Spjd } 852219089Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 853219089Spjd} 854219089Spjd 855219089Spjd/* 856219089Spjd * DMU support for xuio 857219089Spjd */ 858219089Spjdkstat_t *xuio_ksp = NULL; 859219089Spjd 860219089Spjdint 861219089Spjddmu_xuio_init(xuio_t *xuio, int nblk) 862219089Spjd{ 863219089Spjd dmu_xuio_t *priv; 864219089Spjd uio_t *uio = &xuio->xu_uio; 865219089Spjd 866219089Spjd uio->uio_iovcnt = nblk; 867219089Spjd uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); 868219089Spjd 869219089Spjd priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); 870219089Spjd priv->cnt = nblk; 871219089Spjd priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); 872219089Spjd priv->iovp = uio->uio_iov; 873219089Spjd XUIO_XUZC_PRIV(xuio) = priv; 874219089Spjd 875219089Spjd if (XUIO_XUZC_RW(xuio) == UIO_READ) 876219089Spjd XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); 877219089Spjd else 878219089Spjd XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); 879219089Spjd 880219089Spjd return (0); 881219089Spjd} 882219089Spjd 883219089Spjdvoid 884219089Spjddmu_xuio_fini(xuio_t *xuio) 885219089Spjd{ 886219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 887219089Spjd int nblk = priv->cnt; 888219089Spjd 889219089Spjd kmem_free(priv->iovp, nblk * sizeof (iovec_t)); 890219089Spjd kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); 891219089Spjd kmem_free(priv, sizeof (dmu_xuio_t)); 892219089Spjd 893219089Spjd if (XUIO_XUZC_RW(xuio) == UIO_READ) 894219089Spjd XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); 895219089Spjd else 896219089Spjd XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); 897219089Spjd} 898219089Spjd 899219089Spjd/* 900219089Spjd * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } 901219089Spjd * and increase priv->next by 1. 902219089Spjd */ 903219089Spjdint 904219089Spjddmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) 905219089Spjd{ 906219089Spjd struct iovec *iov; 907219089Spjd uio_t *uio = &xuio->xu_uio; 908219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 909219089Spjd int i = priv->next++; 910219089Spjd 911219089Spjd ASSERT(i < priv->cnt); 912219089Spjd ASSERT(off + n <= arc_buf_size(abuf)); 913219089Spjd iov = uio->uio_iov + i; 914219089Spjd iov->iov_base = (char *)abuf->b_data + off; 915219089Spjd iov->iov_len = n; 916219089Spjd priv->bufs[i] = abuf; 917219089Spjd return (0); 918219089Spjd} 919219089Spjd 920219089Spjdint 921219089Spjddmu_xuio_cnt(xuio_t *xuio) 922219089Spjd{ 923219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 924219089Spjd return (priv->cnt); 925219089Spjd} 926219089Spjd 927219089Spjdarc_buf_t * 928219089Spjddmu_xuio_arcbuf(xuio_t *xuio, int i) 929219089Spjd{ 930219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 931219089Spjd 932219089Spjd ASSERT(i < priv->cnt); 933219089Spjd return (priv->bufs[i]); 934219089Spjd} 935219089Spjd 936219089Spjdvoid 937219089Spjddmu_xuio_clear(xuio_t *xuio, int i) 938219089Spjd{ 939219089Spjd dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); 940219089Spjd 941219089Spjd ASSERT(i < priv->cnt); 942219089Spjd priv->bufs[i] = NULL; 943219089Spjd} 944219089Spjd 945219089Spjdstatic void 946219089Spjdxuio_stat_init(void) 947219089Spjd{ 948219089Spjd xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", 949219089Spjd KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), 950219089Spjd KSTAT_FLAG_VIRTUAL); 951219089Spjd if (xuio_ksp != NULL) { 952219089Spjd xuio_ksp->ks_data = &xuio_stats; 953219089Spjd kstat_install(xuio_ksp); 954219089Spjd } 955219089Spjd} 956219089Spjd 957219089Spjdstatic void 958219089Spjdxuio_stat_fini(void) 959219089Spjd{ 960219089Spjd if (xuio_ksp != NULL) { 961219089Spjd kstat_delete(xuio_ksp); 962219089Spjd xuio_ksp = NULL; 963219089Spjd } 964219089Spjd} 965219089Spjd 966219089Spjdvoid 967219089Spjdxuio_stat_wbuf_copied() 968219089Spjd{ 969219089Spjd XUIOSTAT_BUMP(xuiostat_wbuf_copied); 970219089Spjd} 971219089Spjd 972219089Spjdvoid 973219089Spjdxuio_stat_wbuf_nocopy() 974219089Spjd{ 975219089Spjd XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); 976219089Spjd} 977219089Spjd 978168404Spjd#ifdef _KERNEL 979168404Spjdint 980168404Spjddmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) 981168404Spjd{ 982168404Spjd dmu_buf_t **dbp; 983168404Spjd int numbufs, i, err; 984219089Spjd xuio_t *xuio = NULL; 985168404Spjd 986168404Spjd /* 987168404Spjd * NB: we could do this block-at-a-time, but it's nice 988168404Spjd * to be reading in parallel. 989168404Spjd */ 990168404Spjd err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, 991168404Spjd &numbufs, &dbp); 992168404Spjd if (err) 993168404Spjd return (err); 994168404Spjd 995219089Spjd#ifdef UIO_XUIO 996219089Spjd if (uio->uio_extflg == UIO_XUIO) 997219089Spjd xuio = (xuio_t *)uio; 998219089Spjd#endif 999219089Spjd 1000168404Spjd for (i = 0; i < numbufs; i++) { 1001168404Spjd int tocpy; 1002168404Spjd int bufoff; 1003168404Spjd dmu_buf_t *db = dbp[i]; 1004168404Spjd 1005168404Spjd ASSERT(size > 0); 1006168404Spjd 1007168404Spjd bufoff = uio->uio_loffset - db->db_offset; 1008168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 1009168404Spjd 1010219089Spjd if (xuio) { 1011219089Spjd dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 1012219089Spjd arc_buf_t *dbuf_abuf = dbi->db_buf; 1013219089Spjd arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); 1014219089Spjd err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); 1015219089Spjd if (!err) { 1016219089Spjd uio->uio_resid -= tocpy; 1017219089Spjd uio->uio_loffset += tocpy; 1018219089Spjd } 1019219089Spjd 1020219089Spjd if (abuf == dbuf_abuf) 1021219089Spjd XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); 1022219089Spjd else 1023219089Spjd XUIOSTAT_BUMP(xuiostat_rbuf_copied); 1024219089Spjd } else { 1025219089Spjd err = uiomove((char *)db->db_data + bufoff, tocpy, 1026219089Spjd UIO_READ, uio); 1027219089Spjd } 1028168404Spjd if (err) 1029168404Spjd break; 1030168404Spjd 1031168404Spjd size -= tocpy; 1032168404Spjd } 1033168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 1034168404Spjd 1035168404Spjd return (err); 1036168404Spjd} 1037168404Spjd 1038219089Spjdstatic int 1039219089Spjddmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) 1040168404Spjd{ 1041168404Spjd dmu_buf_t **dbp; 1042219089Spjd int numbufs; 1043168404Spjd int err = 0; 1044219089Spjd int i; 1045168404Spjd 1046219089Spjd err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, 1047219089Spjd FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); 1048168404Spjd if (err) 1049168404Spjd return (err); 1050168404Spjd 1051168404Spjd for (i = 0; i < numbufs; i++) { 1052168404Spjd int tocpy; 1053168404Spjd int bufoff; 1054168404Spjd dmu_buf_t *db = dbp[i]; 1055168404Spjd 1056168404Spjd ASSERT(size > 0); 1057168404Spjd 1058168404Spjd bufoff = uio->uio_loffset - db->db_offset; 1059168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 1060168404Spjd 1061168404Spjd ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 1062168404Spjd 1063168404Spjd if (tocpy == db->db_size) 1064168404Spjd dmu_buf_will_fill(db, tx); 1065168404Spjd else 1066168404Spjd dmu_buf_will_dirty(db, tx); 1067168404Spjd 1068168404Spjd /* 1069168404Spjd * XXX uiomove could block forever (eg. nfs-backed 1070168404Spjd * pages). There needs to be a uiolockdown() function 1071168404Spjd * to lock the pages in memory, so that uiomove won't 1072168404Spjd * block. 1073168404Spjd */ 1074168404Spjd err = uiomove((char *)db->db_data + bufoff, tocpy, 1075168404Spjd UIO_WRITE, uio); 1076168404Spjd 1077168404Spjd if (tocpy == db->db_size) 1078168404Spjd dmu_buf_fill_done(db, tx); 1079168404Spjd 1080168404Spjd if (err) 1081168404Spjd break; 1082168404Spjd 1083168404Spjd size -= tocpy; 1084168404Spjd } 1085219089Spjd 1086168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 1087168404Spjd return (err); 1088168404Spjd} 1089168404Spjd 1090168404Spjdint 1091219089Spjddmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, 1092219089Spjd dmu_tx_t *tx) 1093219089Spjd{ 1094219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; 1095219089Spjd dnode_t *dn; 1096219089Spjd int err; 1097219089Spjd 1098219089Spjd if (size == 0) 1099219089Spjd return (0); 1100219089Spjd 1101219089Spjd DB_DNODE_ENTER(db); 1102219089Spjd dn = DB_DNODE(db); 1103219089Spjd err = dmu_write_uio_dnode(dn, uio, size, tx); 1104219089Spjd DB_DNODE_EXIT(db); 1105219089Spjd 1106219089Spjd return (err); 1107219089Spjd} 1108219089Spjd 1109219089Spjdint 1110219089Spjddmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, 1111219089Spjd dmu_tx_t *tx) 1112219089Spjd{ 1113219089Spjd dnode_t *dn; 1114219089Spjd int err; 1115219089Spjd 1116219089Spjd if (size == 0) 1117219089Spjd return (0); 1118219089Spjd 1119219089Spjd err = dnode_hold(os, object, FTAG, &dn); 1120219089Spjd if (err) 1121219089Spjd return (err); 1122219089Spjd 1123219089Spjd err = dmu_write_uio_dnode(dn, uio, size, tx); 1124219089Spjd 1125219089Spjd dnode_rele(dn, FTAG); 1126219089Spjd 1127219089Spjd return (err); 1128219089Spjd} 1129219089Spjd 1130219089Spjd#ifdef sun 1131219089Spjdint 1132168404Spjddmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 1133168404Spjd page_t *pp, dmu_tx_t *tx) 1134168404Spjd{ 1135168404Spjd dmu_buf_t **dbp; 1136168404Spjd int numbufs, i; 1137168404Spjd int err; 1138168404Spjd 1139168404Spjd if (size == 0) 1140168404Spjd return (0); 1141168404Spjd 1142168404Spjd err = dmu_buf_hold_array(os, object, offset, size, 1143168404Spjd FALSE, FTAG, &numbufs, &dbp); 1144168404Spjd if (err) 1145168404Spjd return (err); 1146168404Spjd 1147168404Spjd for (i = 0; i < numbufs; i++) { 1148168404Spjd int tocpy, copied, thiscpy; 1149168404Spjd int bufoff; 1150168404Spjd dmu_buf_t *db = dbp[i]; 1151168404Spjd caddr_t va; 1152168404Spjd 1153168404Spjd ASSERT(size > 0); 1154168404Spjd ASSERT3U(db->db_size, >=, PAGESIZE); 1155168404Spjd 1156168404Spjd bufoff = offset - db->db_offset; 1157168404Spjd tocpy = (int)MIN(db->db_size - bufoff, size); 1158168404Spjd 1159168404Spjd ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 1160168404Spjd 1161168404Spjd if (tocpy == db->db_size) 1162168404Spjd dmu_buf_will_fill(db, tx); 1163168404Spjd else 1164168404Spjd dmu_buf_will_dirty(db, tx); 1165168404Spjd 1166168404Spjd for (copied = 0; copied < tocpy; copied += PAGESIZE) { 1167168404Spjd ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); 1168168404Spjd thiscpy = MIN(PAGESIZE, tocpy - copied); 1169185029Spjd va = zfs_map_page(pp, S_READ); 1170168404Spjd bcopy(va, (char *)db->db_data + bufoff, thiscpy); 1171185029Spjd zfs_unmap_page(pp, va); 1172168404Spjd pp = pp->p_next; 1173168404Spjd bufoff += PAGESIZE; 1174168404Spjd } 1175168404Spjd 1176168404Spjd if (tocpy == db->db_size) 1177168404Spjd dmu_buf_fill_done(db, tx); 1178168404Spjd 1179168404Spjd offset += tocpy; 1180168404Spjd size -= tocpy; 1181168404Spjd } 1182168404Spjd dmu_buf_rele_array(dbp, numbufs, FTAG); 1183168404Spjd return (err); 1184168404Spjd} 1185219089Spjd#endif /* sun */ 1186219089Spjd#endif 1187168404Spjd 1188209962Smm/* 1189209962Smm * Allocate a loaned anonymous arc buffer. 1190209962Smm */ 1191209962Smmarc_buf_t * 1192209962Smmdmu_request_arcbuf(dmu_buf_t *handle, int size) 1193209962Smm{ 1194219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; 1195219089Spjd spa_t *spa; 1196209962Smm 1197219089Spjd DB_GET_SPA(&spa, db); 1198219089Spjd return (arc_loan_buf(spa, size)); 1199209962Smm} 1200209962Smm 1201209962Smm/* 1202209962Smm * Free a loaned arc buffer. 1203209962Smm */ 1204209962Smmvoid 1205209962Smmdmu_return_arcbuf(arc_buf_t *buf) 1206209962Smm{ 1207209962Smm arc_return_buf(buf, FTAG); 1208248571Smm VERIFY(arc_buf_remove_ref(buf, FTAG)); 1209209962Smm} 1210209962Smm 1211209962Smm/* 1212209962Smm * When possible directly assign passed loaned arc buffer to a dbuf. 1213209962Smm * If this is not possible copy the contents of passed arc buf via 1214209962Smm * dmu_write(). 1215209962Smm */ 1216209962Smmvoid 1217209962Smmdmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, 1218209962Smm dmu_tx_t *tx) 1219209962Smm{ 1220219089Spjd dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; 1221219089Spjd dnode_t *dn; 1222209962Smm dmu_buf_impl_t *db; 1223209962Smm uint32_t blksz = (uint32_t)arc_buf_size(buf); 1224209962Smm uint64_t blkid; 1225209962Smm 1226219089Spjd DB_DNODE_ENTER(dbuf); 1227219089Spjd dn = DB_DNODE(dbuf); 1228209962Smm rw_enter(&dn->dn_struct_rwlock, RW_READER); 1229209962Smm blkid = dbuf_whichblock(dn, offset); 1230209962Smm VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); 1231209962Smm rw_exit(&dn->dn_struct_rwlock); 1232219089Spjd DB_DNODE_EXIT(dbuf); 1233209962Smm 1234209962Smm if (offset == db->db.db_offset && blksz == db->db.db_size) { 1235209962Smm dbuf_assign_arcbuf(db, buf, tx); 1236209962Smm dbuf_rele(db, FTAG); 1237209962Smm } else { 1238219089Spjd objset_t *os; 1239219089Spjd uint64_t object; 1240219089Spjd 1241219089Spjd DB_DNODE_ENTER(dbuf); 1242219089Spjd dn = DB_DNODE(dbuf); 1243219089Spjd os = dn->dn_objset; 1244219089Spjd object = dn->dn_object; 1245219089Spjd DB_DNODE_EXIT(dbuf); 1246219089Spjd 1247209962Smm dbuf_rele(db, FTAG); 1248219089Spjd dmu_write(os, object, offset, blksz, buf->b_data, tx); 1249209962Smm dmu_return_arcbuf(buf); 1250219089Spjd XUIOSTAT_BUMP(xuiostat_wbuf_copied); 1251209962Smm } 1252209962Smm} 1253209962Smm 1254168404Spjdtypedef struct { 1255219089Spjd dbuf_dirty_record_t *dsa_dr; 1256219089Spjd dmu_sync_cb_t *dsa_done; 1257219089Spjd zgd_t *dsa_zgd; 1258219089Spjd dmu_tx_t *dsa_tx; 1259168404Spjd} dmu_sync_arg_t; 1260168404Spjd 1261168404Spjd/* ARGSUSED */ 1262168404Spjdstatic void 1263185029Spjddmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) 1264185029Spjd{ 1265219089Spjd dmu_sync_arg_t *dsa = varg; 1266219089Spjd dmu_buf_t *db = dsa->dsa_zgd->zgd_db; 1267185029Spjd blkptr_t *bp = zio->io_bp; 1268185029Spjd 1269219089Spjd if (zio->io_error == 0) { 1270219089Spjd if (BP_IS_HOLE(bp)) { 1271219089Spjd /* 1272219089Spjd * A block of zeros may compress to a hole, but the 1273219089Spjd * block size still needs to be known for replay. 1274219089Spjd */ 1275219089Spjd BP_SET_LSIZE(bp, db->db_size); 1276219089Spjd } else { 1277219089Spjd ASSERT(BP_GET_LEVEL(bp) == 0); 1278219089Spjd bp->blk_fill = 1; 1279219089Spjd } 1280185029Spjd } 1281185029Spjd} 1282185029Spjd 1283219089Spjdstatic void 1284219089Spjddmu_sync_late_arrival_ready(zio_t *zio) 1285219089Spjd{ 1286219089Spjd dmu_sync_ready(zio, NULL, zio->io_private); 1287219089Spjd} 1288219089Spjd 1289185029Spjd/* ARGSUSED */ 1290185029Spjdstatic void 1291168404Spjddmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) 1292168404Spjd{ 1293219089Spjd dmu_sync_arg_t *dsa = varg; 1294219089Spjd dbuf_dirty_record_t *dr = dsa->dsa_dr; 1295168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 1296168404Spjd 1297168404Spjd mutex_enter(&db->db_mtx); 1298168404Spjd ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); 1299219089Spjd if (zio->io_error == 0) { 1300243524Smm dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); 1301243524Smm if (dr->dt.dl.dr_nopwrite) { 1302243524Smm blkptr_t *bp = zio->io_bp; 1303243524Smm blkptr_t *bp_orig = &zio->io_bp_orig; 1304243524Smm uint8_t chksum = BP_GET_CHECKSUM(bp_orig); 1305243524Smm 1306243524Smm ASSERT(BP_EQUAL(bp, bp_orig)); 1307243524Smm ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); 1308243524Smm ASSERT(zio_checksum_table[chksum].ci_dedup); 1309243524Smm } 1310219089Spjd dr->dt.dl.dr_overridden_by = *zio->io_bp; 1311219089Spjd dr->dt.dl.dr_override_state = DR_OVERRIDDEN; 1312219089Spjd dr->dt.dl.dr_copies = zio->io_prop.zp_copies; 1313219089Spjd if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) 1314219089Spjd BP_ZERO(&dr->dt.dl.dr_overridden_by); 1315219089Spjd } else { 1316219089Spjd dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 1317219089Spjd } 1318168404Spjd cv_broadcast(&db->db_changed); 1319168404Spjd mutex_exit(&db->db_mtx); 1320168404Spjd 1321219089Spjd dsa->dsa_done(dsa->dsa_zgd, zio->io_error); 1322168404Spjd 1323219089Spjd kmem_free(dsa, sizeof (*dsa)); 1324168404Spjd} 1325168404Spjd 1326219089Spjdstatic void 1327219089Spjddmu_sync_late_arrival_done(zio_t *zio) 1328219089Spjd{ 1329219089Spjd blkptr_t *bp = zio->io_bp; 1330219089Spjd dmu_sync_arg_t *dsa = zio->io_private; 1331243524Smm blkptr_t *bp_orig = &zio->io_bp_orig; 1332219089Spjd 1333219089Spjd if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { 1334243524Smm /* 1335243524Smm * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE) 1336243524Smm * then there is nothing to do here. Otherwise, free the 1337243524Smm * newly allocated block in this txg. 1338243524Smm */ 1339243524Smm if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 1340243524Smm ASSERT(BP_EQUAL(bp, bp_orig)); 1341243524Smm } else { 1342243524Smm ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); 1343243524Smm ASSERT(zio->io_bp->blk_birth == zio->io_txg); 1344243524Smm ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); 1345243524Smm zio_free(zio->io_spa, zio->io_txg, zio->io_bp); 1346243524Smm } 1347219089Spjd } 1348219089Spjd 1349219089Spjd dmu_tx_commit(dsa->dsa_tx); 1350219089Spjd 1351219089Spjd dsa->dsa_done(dsa->dsa_zgd, zio->io_error); 1352219089Spjd 1353219089Spjd kmem_free(dsa, sizeof (*dsa)); 1354219089Spjd} 1355219089Spjd 1356219089Spjdstatic int 1357219089Spjddmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, 1358219089Spjd zio_prop_t *zp, zbookmark_t *zb) 1359219089Spjd{ 1360219089Spjd dmu_sync_arg_t *dsa; 1361219089Spjd dmu_tx_t *tx; 1362219089Spjd 1363219089Spjd tx = dmu_tx_create(os); 1364219089Spjd dmu_tx_hold_space(tx, zgd->zgd_db->db_size); 1365219089Spjd if (dmu_tx_assign(tx, TXG_WAIT) != 0) { 1366219089Spjd dmu_tx_abort(tx); 1367249195Smm /* Make zl_get_data do txg_waited_synced() */ 1368249195Smm return (SET_ERROR(EIO)); 1369219089Spjd } 1370219089Spjd 1371219089Spjd dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); 1372219089Spjd dsa->dsa_dr = NULL; 1373219089Spjd dsa->dsa_done = done; 1374219089Spjd dsa->dsa_zgd = zgd; 1375219089Spjd dsa->dsa_tx = tx; 1376219089Spjd 1377219089Spjd zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, 1378219089Spjd zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp, 1379219089Spjd dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa, 1380219089Spjd ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); 1381219089Spjd 1382219089Spjd return (0); 1383219089Spjd} 1384219089Spjd 1385168404Spjd/* 1386168404Spjd * Intent log support: sync the block associated with db to disk. 1387168404Spjd * N.B. and XXX: the caller is responsible for making sure that the 1388168404Spjd * data isn't changing while dmu_sync() is writing it. 1389168404Spjd * 1390168404Spjd * Return values: 1391168404Spjd * 1392243524Smm * EEXIST: this txg has already been synced, so there's nothing to do. 1393168404Spjd * The caller should not log the write. 1394168404Spjd * 1395168404Spjd * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. 1396168404Spjd * The caller should not log the write. 1397168404Spjd * 1398168404Spjd * EALREADY: this block is already in the process of being synced. 1399168404Spjd * The caller should track its progress (somehow). 1400168404Spjd * 1401219089Spjd * EIO: could not do the I/O. 1402219089Spjd * The caller should do a txg_wait_synced(). 1403168404Spjd * 1404219089Spjd * 0: the I/O has been initiated. 1405219089Spjd * The caller should log this blkptr in the done callback. 1406219089Spjd * It is possible that the I/O will fail, in which case 1407219089Spjd * the error will be reported to the done callback and 1408219089Spjd * propagated to pio from zio_done(). 1409168404Spjd */ 1410168404Spjdint 1411219089Spjddmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) 1412168404Spjd{ 1413219089Spjd blkptr_t *bp = zgd->zgd_bp; 1414219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; 1415219089Spjd objset_t *os = db->db_objset; 1416219089Spjd dsl_dataset_t *ds = os->os_dsl_dataset; 1417168404Spjd dbuf_dirty_record_t *dr; 1418219089Spjd dmu_sync_arg_t *dsa; 1419168404Spjd zbookmark_t zb; 1420219089Spjd zio_prop_t zp; 1421219089Spjd dnode_t *dn; 1422168404Spjd 1423219089Spjd ASSERT(pio != NULL); 1424168404Spjd ASSERT(txg != 0); 1425168404Spjd 1426219089Spjd SET_BOOKMARK(&zb, ds->ds_object, 1427219089Spjd db->db.db_object, db->db_level, db->db_blkid); 1428168404Spjd 1429219089Spjd DB_DNODE_ENTER(db); 1430219089Spjd dn = DB_DNODE(db); 1431219089Spjd dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); 1432219089Spjd DB_DNODE_EXIT(db); 1433219089Spjd 1434168404Spjd /* 1435219089Spjd * If we're frozen (running ziltest), we always need to generate a bp. 1436168404Spjd */ 1437219089Spjd if (txg > spa_freeze_txg(os->os_spa)) 1438219089Spjd return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); 1439168404Spjd 1440168404Spjd /* 1441219089Spjd * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() 1442219089Spjd * and us. If we determine that this txg is not yet syncing, 1443219089Spjd * but it begins to sync a moment later, that's OK because the 1444219089Spjd * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. 1445168404Spjd */ 1446219089Spjd mutex_enter(&db->db_mtx); 1447219089Spjd 1448219089Spjd if (txg <= spa_last_synced_txg(os->os_spa)) { 1449168404Spjd /* 1450219089Spjd * This txg has already synced. There's nothing to do. 1451168404Spjd */ 1452219089Spjd mutex_exit(&db->db_mtx); 1453249195Smm return (SET_ERROR(EEXIST)); 1454168404Spjd } 1455168404Spjd 1456219089Spjd if (txg <= spa_syncing_txg(os->os_spa)) { 1457219089Spjd /* 1458219089Spjd * This txg is currently syncing, so we can't mess with 1459219089Spjd * the dirty record anymore; just write a new log block. 1460219089Spjd */ 1461219089Spjd mutex_exit(&db->db_mtx); 1462219089Spjd return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); 1463168404Spjd } 1464168404Spjd 1465168404Spjd dr = db->db_last_dirty; 1466219089Spjd while (dr && dr->dr_txg != txg) 1467168404Spjd dr = dr->dr_next; 1468219089Spjd 1469219089Spjd if (dr == NULL) { 1470168404Spjd /* 1471219089Spjd * There's no dr for this dbuf, so it must have been freed. 1472168404Spjd * There's no need to log writes to freed blocks, so we're done. 1473168404Spjd */ 1474168404Spjd mutex_exit(&db->db_mtx); 1475249195Smm return (SET_ERROR(ENOENT)); 1476168404Spjd } 1477168404Spjd 1478243524Smm ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); 1479243524Smm 1480243524Smm /* 1481243524Smm * Assume the on-disk data is X, the current syncing data is Y, 1482243524Smm * and the current in-memory data is Z (currently in dmu_sync). 1483243524Smm * X and Z are identical but Y is has been modified. Normally, 1484243524Smm * when X and Z are the same we will perform a nopwrite but if Y 1485243524Smm * is different we must disable nopwrite since the resulting write 1486243524Smm * of Y to disk can free the block containing X. If we allowed a 1487243524Smm * nopwrite to occur the block pointing to Z would reference a freed 1488243524Smm * block. Since this is a rare case we simplify this by disabling 1489243524Smm * nopwrite if the current dmu_sync-ing dbuf has been modified in 1490243524Smm * a previous transaction. 1491243524Smm */ 1492243524Smm if (dr->dr_next) 1493243524Smm zp.zp_nopwrite = B_FALSE; 1494243524Smm 1495168404Spjd ASSERT(dr->dr_txg == txg); 1496219089Spjd if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || 1497219089Spjd dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 1498168404Spjd /* 1499219089Spjd * We have already issued a sync write for this buffer, 1500219089Spjd * or this buffer has already been synced. It could not 1501219089Spjd * have been dirtied since, or we would have cleared the state. 1502168404Spjd */ 1503168404Spjd mutex_exit(&db->db_mtx); 1504249195Smm return (SET_ERROR(EALREADY)); 1505168404Spjd } 1506168404Spjd 1507219089Spjd ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 1508168404Spjd dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; 1509168404Spjd mutex_exit(&db->db_mtx); 1510168404Spjd 1511219089Spjd dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); 1512219089Spjd dsa->dsa_dr = dr; 1513219089Spjd dsa->dsa_done = done; 1514219089Spjd dsa->dsa_zgd = zgd; 1515219089Spjd dsa->dsa_tx = NULL; 1516168404Spjd 1517219089Spjd zio_nowait(arc_write(pio, os->os_spa, txg, 1518251478Sdelphij bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), 1519251478Sdelphij DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, dmu_sync_done, 1520251478Sdelphij dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); 1521185029Spjd 1522219089Spjd return (0); 1523168404Spjd} 1524168404Spjd 1525168404Spjdint 1526168404Spjddmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, 1527168404Spjd dmu_tx_t *tx) 1528168404Spjd{ 1529168404Spjd dnode_t *dn; 1530168404Spjd int err; 1531168404Spjd 1532219089Spjd err = dnode_hold(os, object, FTAG, &dn); 1533168404Spjd if (err) 1534168404Spjd return (err); 1535168404Spjd err = dnode_set_blksz(dn, size, ibs, tx); 1536168404Spjd dnode_rele(dn, FTAG); 1537168404Spjd return (err); 1538168404Spjd} 1539168404Spjd 1540168404Spjdvoid 1541168404Spjddmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, 1542168404Spjd dmu_tx_t *tx) 1543168404Spjd{ 1544168404Spjd dnode_t *dn; 1545168404Spjd 1546168404Spjd /* XXX assumes dnode_hold will not get an i/o error */ 1547219089Spjd (void) dnode_hold(os, object, FTAG, &dn); 1548168404Spjd ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); 1549168404Spjd dn->dn_checksum = checksum; 1550168404Spjd dnode_setdirty(dn, tx); 1551168404Spjd dnode_rele(dn, FTAG); 1552168404Spjd} 1553168404Spjd 1554168404Spjdvoid 1555168404Spjddmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, 1556168404Spjd dmu_tx_t *tx) 1557168404Spjd{ 1558168404Spjd dnode_t *dn; 1559168404Spjd 1560168404Spjd /* XXX assumes dnode_hold will not get an i/o error */ 1561219089Spjd (void) dnode_hold(os, object, FTAG, &dn); 1562168404Spjd ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); 1563168404Spjd dn->dn_compress = compress; 1564168404Spjd dnode_setdirty(dn, tx); 1565168404Spjd dnode_rele(dn, FTAG); 1566168404Spjd} 1567168404Spjd 1568219089Spjdint zfs_mdcomp_disable = 0; 1569219089SpjdTUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); 1570219089SpjdSYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RW, 1571219089Spjd &zfs_mdcomp_disable, 0, "Disable metadata compression"); 1572219089Spjd 1573219089Spjdvoid 1574219089Spjddmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) 1575219089Spjd{ 1576219089Spjd dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; 1577236884Smm boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || 1578219089Spjd (wp & WP_SPILL)); 1579219089Spjd enum zio_checksum checksum = os->os_checksum; 1580219089Spjd enum zio_compress compress = os->os_compress; 1581219089Spjd enum zio_checksum dedup_checksum = os->os_dedup_checksum; 1582243524Smm boolean_t dedup = B_FALSE; 1583243524Smm boolean_t nopwrite = B_FALSE; 1584219089Spjd boolean_t dedup_verify = os->os_dedup_verify; 1585219089Spjd int copies = os->os_copies; 1586219089Spjd 1587219089Spjd /* 1588243524Smm * We maintain different write policies for each of the following 1589243524Smm * types of data: 1590243524Smm * 1. metadata 1591243524Smm * 2. preallocated blocks (i.e. level-0 blocks of a dump device) 1592243524Smm * 3. all other level 0 blocks 1593219089Spjd */ 1594219089Spjd if (ismd) { 1595219089Spjd /* 1596243524Smm * XXX -- we should design a compression algorithm 1597243524Smm * that specializes in arrays of bps. 1598243524Smm */ 1599243524Smm compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : 1600243524Smm ZIO_COMPRESS_LZJB; 1601243524Smm 1602243524Smm /* 1603219089Spjd * Metadata always gets checksummed. If the data 1604219089Spjd * checksum is multi-bit correctable, and it's not a 1605219089Spjd * ZBT-style checksum, then it's suitable for metadata 1606219089Spjd * as well. Otherwise, the metadata checksum defaults 1607219089Spjd * to fletcher4. 1608219089Spjd */ 1609219089Spjd if (zio_checksum_table[checksum].ci_correctable < 1 || 1610219089Spjd zio_checksum_table[checksum].ci_eck) 1611219089Spjd checksum = ZIO_CHECKSUM_FLETCHER_4; 1612243524Smm } else if (wp & WP_NOFILL) { 1613243524Smm ASSERT(level == 0); 1614219089Spjd 1615219089Spjd /* 1616243524Smm * If we're writing preallocated blocks, we aren't actually 1617243524Smm * writing them so don't set any policy properties. These 1618243524Smm * blocks are currently only used by an external subsystem 1619243524Smm * outside of zfs (i.e. dump) and not written by the zio 1620243524Smm * pipeline. 1621219089Spjd */ 1622243524Smm compress = ZIO_COMPRESS_OFF; 1623255750Sdelphij checksum = ZIO_CHECKSUM_NOPARITY; 1624219089Spjd } else { 1625219089Spjd compress = zio_compress_select(dn->dn_compress, compress); 1626219089Spjd 1627243524Smm checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? 1628243524Smm zio_checksum_select(dn->dn_checksum, checksum) : 1629243524Smm dedup_checksum; 1630219089Spjd 1631243524Smm /* 1632243524Smm * Determine dedup setting. If we are in dmu_sync(), 1633243524Smm * we won't actually dedup now because that's all 1634243524Smm * done in syncing context; but we do want to use the 1635243524Smm * dedup checkum. If the checksum is not strong 1636243524Smm * enough to ensure unique signatures, force 1637243524Smm * dedup_verify. 1638243524Smm */ 1639243524Smm if (dedup_checksum != ZIO_CHECKSUM_OFF) { 1640243524Smm dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; 1641243524Smm if (!zio_checksum_table[checksum].ci_dedup) 1642243524Smm dedup_verify = B_TRUE; 1643243524Smm } 1644219089Spjd 1645243524Smm /* 1646243524Smm * Enable nopwrite if we have a cryptographically secure 1647243524Smm * checksum that has no known collisions (i.e. SHA-256) 1648243524Smm * and compression is enabled. We don't enable nopwrite if 1649243524Smm * dedup is enabled as the two features are mutually exclusive. 1650243524Smm */ 1651243524Smm nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup && 1652243524Smm compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); 1653219089Spjd } 1654219089Spjd 1655219089Spjd zp->zp_checksum = checksum; 1656219089Spjd zp->zp_compress = compress; 1657219089Spjd zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; 1658219089Spjd zp->zp_level = level; 1659219089Spjd zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa)); 1660219089Spjd zp->zp_dedup = dedup; 1661219089Spjd zp->zp_dedup_verify = dedup && dedup_verify; 1662243524Smm zp->zp_nopwrite = nopwrite; 1663219089Spjd} 1664219089Spjd 1665168404Spjdint 1666168404Spjddmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) 1667168404Spjd{ 1668168404Spjd dnode_t *dn; 1669168404Spjd int i, err; 1670168404Spjd 1671219089Spjd err = dnode_hold(os, object, FTAG, &dn); 1672168404Spjd if (err) 1673168404Spjd return (err); 1674168404Spjd /* 1675168404Spjd * Sync any current changes before 1676168404Spjd * we go trundling through the block pointers. 1677168404Spjd */ 1678168404Spjd for (i = 0; i < TXG_SIZE; i++) { 1679168404Spjd if (list_link_active(&dn->dn_dirty_link[i])) 1680168404Spjd break; 1681168404Spjd } 1682168404Spjd if (i != TXG_SIZE) { 1683168404Spjd dnode_rele(dn, FTAG); 1684168404Spjd txg_wait_synced(dmu_objset_pool(os), 0); 1685219089Spjd err = dnode_hold(os, object, FTAG, &dn); 1686168404Spjd if (err) 1687168404Spjd return (err); 1688168404Spjd } 1689168404Spjd 1690185029Spjd err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); 1691168404Spjd dnode_rele(dn, FTAG); 1692168404Spjd 1693168404Spjd return (err); 1694168404Spjd} 1695168404Spjd 1696168404Spjdvoid 1697168404Spjddmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) 1698168404Spjd{ 1699219089Spjd dnode_phys_t *dnp; 1700219089Spjd 1701168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 1702168404Spjd mutex_enter(&dn->dn_mtx); 1703168404Spjd 1704219089Spjd dnp = dn->dn_phys; 1705219089Spjd 1706168404Spjd doi->doi_data_block_size = dn->dn_datablksz; 1707168404Spjd doi->doi_metadata_block_size = dn->dn_indblkshift ? 1708168404Spjd 1ULL << dn->dn_indblkshift : 0; 1709219089Spjd doi->doi_type = dn->dn_type; 1710219089Spjd doi->doi_bonus_type = dn->dn_bonustype; 1711219089Spjd doi->doi_bonus_size = dn->dn_bonuslen; 1712168404Spjd doi->doi_indirection = dn->dn_nlevels; 1713168404Spjd doi->doi_checksum = dn->dn_checksum; 1714168404Spjd doi->doi_compress = dn->dn_compress; 1715219089Spjd doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; 1716247852Smm doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; 1717219089Spjd doi->doi_fill_count = 0; 1718219089Spjd for (int i = 0; i < dnp->dn_nblkptr; i++) 1719219089Spjd doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; 1720168404Spjd 1721168404Spjd mutex_exit(&dn->dn_mtx); 1722168404Spjd rw_exit(&dn->dn_struct_rwlock); 1723168404Spjd} 1724168404Spjd 1725168404Spjd/* 1726168404Spjd * Get information on a DMU object. 1727168404Spjd * If doi is NULL, just indicates whether the object exists. 1728168404Spjd */ 1729168404Spjdint 1730168404Spjddmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) 1731168404Spjd{ 1732168404Spjd dnode_t *dn; 1733219089Spjd int err = dnode_hold(os, object, FTAG, &dn); 1734168404Spjd 1735168404Spjd if (err) 1736168404Spjd return (err); 1737168404Spjd 1738168404Spjd if (doi != NULL) 1739168404Spjd dmu_object_info_from_dnode(dn, doi); 1740168404Spjd 1741168404Spjd dnode_rele(dn, FTAG); 1742168404Spjd return (0); 1743168404Spjd} 1744168404Spjd 1745168404Spjd/* 1746168404Spjd * As above, but faster; can be used when you have a held dbuf in hand. 1747168404Spjd */ 1748168404Spjdvoid 1749219089Spjddmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) 1750168404Spjd{ 1751219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1752219089Spjd 1753219089Spjd DB_DNODE_ENTER(db); 1754219089Spjd dmu_object_info_from_dnode(DB_DNODE(db), doi); 1755219089Spjd DB_DNODE_EXIT(db); 1756168404Spjd} 1757168404Spjd 1758168404Spjd/* 1759168404Spjd * Faster still when you only care about the size. 1760168404Spjd * This is specifically optimized for zfs_getattr(). 1761168404Spjd */ 1762168404Spjdvoid 1763219089Spjddmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, 1764219089Spjd u_longlong_t *nblk512) 1765168404Spjd{ 1766219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1767219089Spjd dnode_t *dn; 1768168404Spjd 1769219089Spjd DB_DNODE_ENTER(db); 1770219089Spjd dn = DB_DNODE(db); 1771219089Spjd 1772168404Spjd *blksize = dn->dn_datablksz; 1773168404Spjd /* add 1 for dnode space */ 1774168404Spjd *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> 1775168404Spjd SPA_MINBLOCKSHIFT) + 1; 1776219089Spjd DB_DNODE_EXIT(db); 1777168404Spjd} 1778168404Spjd 1779168404Spjdvoid 1780168404Spjdbyteswap_uint64_array(void *vbuf, size_t size) 1781168404Spjd{ 1782168404Spjd uint64_t *buf = vbuf; 1783168404Spjd size_t count = size >> 3; 1784168404Spjd int i; 1785168404Spjd 1786168404Spjd ASSERT((size & 7) == 0); 1787168404Spjd 1788168404Spjd for (i = 0; i < count; i++) 1789168404Spjd buf[i] = BSWAP_64(buf[i]); 1790168404Spjd} 1791168404Spjd 1792168404Spjdvoid 1793168404Spjdbyteswap_uint32_array(void *vbuf, size_t size) 1794168404Spjd{ 1795168404Spjd uint32_t *buf = vbuf; 1796168404Spjd size_t count = size >> 2; 1797168404Spjd int i; 1798168404Spjd 1799168404Spjd ASSERT((size & 3) == 0); 1800168404Spjd 1801168404Spjd for (i = 0; i < count; i++) 1802168404Spjd buf[i] = BSWAP_32(buf[i]); 1803168404Spjd} 1804168404Spjd 1805168404Spjdvoid 1806168404Spjdbyteswap_uint16_array(void *vbuf, size_t size) 1807168404Spjd{ 1808168404Spjd uint16_t *buf = vbuf; 1809168404Spjd size_t count = size >> 1; 1810168404Spjd int i; 1811168404Spjd 1812168404Spjd ASSERT((size & 1) == 0); 1813168404Spjd 1814168404Spjd for (i = 0; i < count; i++) 1815168404Spjd buf[i] = BSWAP_16(buf[i]); 1816168404Spjd} 1817168404Spjd 1818168404Spjd/* ARGSUSED */ 1819168404Spjdvoid 1820168404Spjdbyteswap_uint8_array(void *vbuf, size_t size) 1821168404Spjd{ 1822168404Spjd} 1823168404Spjd 1824168404Spjdvoid 1825168404Spjddmu_init(void) 1826168404Spjd{ 1827219089Spjd zfs_dbgmsg_init(); 1828219089Spjd sa_cache_init(); 1829219089Spjd xuio_stat_init(); 1830219089Spjd dmu_objset_init(); 1831219089Spjd dnode_init(); 1832168404Spjd dbuf_init(); 1833208130Smm zfetch_init(); 1834254608Sgibbs zio_compress_init(); 1835239620Smm l2arc_init(); 1836168404Spjd arc_init(); 1837168404Spjd} 1838168404Spjd 1839168404Spjdvoid 1840168404Spjddmu_fini(void) 1841168404Spjd{ 1842251629Sdelphij arc_fini(); /* arc depends on l2arc, so arc must go first */ 1843219089Spjd l2arc_fini(); 1844208130Smm zfetch_fini(); 1845254608Sgibbs zio_compress_fini(); 1846219089Spjd dbuf_fini(); 1847168404Spjd dnode_fini(); 1848219089Spjd dmu_objset_fini(); 1849219089Spjd xuio_stat_fini(); 1850219089Spjd sa_cache_fini(); 1851219089Spjd zfs_dbgmsg_fini(); 1852168404Spjd} 1853