1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23219636Spjd * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24265751Sdelphij * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 25252140Sdelphij * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 26262089Savg * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
27168404Spjd */ 28168404Spjd 29168404Spjd#include <sys/zfs_context.h> 30168404Spjd#include <sys/dmu.h> 31260722Savg#include <sys/dmu_send.h> 32168404Spjd#include <sys/dmu_impl.h> 33168404Spjd#include <sys/dbuf.h> 34168404Spjd#include <sys/dmu_objset.h> 35168404Spjd#include <sys/dsl_dataset.h> 36168404Spjd#include <sys/dsl_dir.h> 37168404Spjd#include <sys/dmu_tx.h> 38168404Spjd#include <sys/spa.h> 39168404Spjd#include <sys/zio.h> 40168404Spjd#include <sys/dmu_zfetch.h> 41219089Spjd#include <sys/sa.h> 42219089Spjd#include <sys/sa_impl.h> 43265751Sdelphij#include <sys/range_tree.h> 44168404Spjd 45260722Savg/* 46260722Savg * Number of times that zfs_free_range() took the slow path while doing 47260722Savg * a zfs receive. A nonzero value indicates a potential performance problem. 48260722Savg */ 49260722Savguint64_t zfs_free_range_recv_miss; 50260722Savg 51168404Spjdstatic void dbuf_destroy(dmu_buf_impl_t *db); 52249643Smmstatic boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 53185029Spjdstatic void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 54168404Spjd 55168404Spjd/* 56168404Spjd * Global data structures and functions for the dbuf cache. 
57168404Spjd */ 58168404Spjdstatic kmem_cache_t *dbuf_cache; 59168404Spjd 60168404Spjd/* ARGSUSED */ 61168404Spjdstatic int 62168404Spjddbuf_cons(void *vdb, void *unused, int kmflag) 63168404Spjd{ 64168404Spjd dmu_buf_impl_t *db = vdb; 65168404Spjd bzero(db, sizeof (dmu_buf_impl_t)); 66168404Spjd 67168404Spjd mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 68168404Spjd cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 69168404Spjd refcount_create(&db->db_holds); 70168404Spjd return (0); 71168404Spjd} 72168404Spjd 73168404Spjd/* ARGSUSED */ 74168404Spjdstatic void 75168404Spjddbuf_dest(void *vdb, void *unused) 76168404Spjd{ 77168404Spjd dmu_buf_impl_t *db = vdb; 78168404Spjd mutex_destroy(&db->db_mtx); 79168404Spjd cv_destroy(&db->db_changed); 80168404Spjd refcount_destroy(&db->db_holds); 81168404Spjd} 82168404Spjd 83168404Spjd/* 84168404Spjd * dbuf hash table routines 85168404Spjd */ 86168404Spjdstatic dbuf_hash_table_t dbuf_hash_table; 87168404Spjd 88168404Spjdstatic uint64_t dbuf_hash_count; 89168404Spjd 90168404Spjdstatic uint64_t 91168404Spjddbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 92168404Spjd{ 93168404Spjd uintptr_t osv = (uintptr_t)os; 94168404Spjd uint64_t crc = -1ULL; 95168404Spjd 96168404Spjd ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 97168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 98168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 99168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 100168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 101168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 102168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 103168404Spjd 104168404Spjd crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 105168404Spjd 106168404Spjd return (crc); 107168404Spjd} 108168404Spjd 109168404Spjd#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 110168404Spjd 
111168404Spjd#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 112168404Spjd ((dbuf)->db.db_object == (obj) && \ 113168404Spjd (dbuf)->db_objset == (os) && \ 114168404Spjd (dbuf)->db_level == (level) && \ 115168404Spjd (dbuf)->db_blkid == (blkid)) 116168404Spjd 117168404Spjddmu_buf_impl_t * 118168404Spjddbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 119168404Spjd{ 120168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 121219089Spjd objset_t *os = dn->dn_objset; 122168404Spjd uint64_t obj = dn->dn_object; 123168404Spjd uint64_t hv = DBUF_HASH(os, obj, level, blkid); 124168404Spjd uint64_t idx = hv & h->hash_table_mask; 125168404Spjd dmu_buf_impl_t *db; 126168404Spjd 127168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 128168404Spjd for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 129168404Spjd if (DBUF_EQUAL(db, os, obj, level, blkid)) { 130168404Spjd mutex_enter(&db->db_mtx); 131168404Spjd if (db->db_state != DB_EVICTING) { 132168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 133168404Spjd return (db); 134168404Spjd } 135168404Spjd mutex_exit(&db->db_mtx); 136168404Spjd } 137168404Spjd } 138168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 139168404Spjd return (NULL); 140168404Spjd} 141168404Spjd 142168404Spjd/* 143168404Spjd * Insert an entry into the hash table. If there is already an element 144168404Spjd * equal to elem in the hash table, then the already existing element 145168404Spjd * will be returned and the new element will not be inserted. 146168404Spjd * Otherwise returns NULL. 
147168404Spjd */ 148168404Spjdstatic dmu_buf_impl_t * 149168404Spjddbuf_hash_insert(dmu_buf_impl_t *db) 150168404Spjd{ 151168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 152219089Spjd objset_t *os = db->db_objset; 153168404Spjd uint64_t obj = db->db.db_object; 154168404Spjd int level = db->db_level; 155168404Spjd uint64_t blkid = db->db_blkid; 156168404Spjd uint64_t hv = DBUF_HASH(os, obj, level, blkid); 157168404Spjd uint64_t idx = hv & h->hash_table_mask; 158168404Spjd dmu_buf_impl_t *dbf; 159168404Spjd 160168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 161168404Spjd for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 162168404Spjd if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 163168404Spjd mutex_enter(&dbf->db_mtx); 164168404Spjd if (dbf->db_state != DB_EVICTING) { 165168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 166168404Spjd return (dbf); 167168404Spjd } 168168404Spjd mutex_exit(&dbf->db_mtx); 169168404Spjd } 170168404Spjd } 171168404Spjd 172168404Spjd mutex_enter(&db->db_mtx); 173168404Spjd db->db_hash_next = h->hash_table[idx]; 174168404Spjd h->hash_table[idx] = db; 175168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 176168404Spjd atomic_add_64(&dbuf_hash_count, 1); 177168404Spjd 178168404Spjd return (NULL); 179168404Spjd} 180168404Spjd 181168404Spjd/* 182168404Spjd * Remove an entry from the hash table. This operation will 183168404Spjd * fail if there are any existing holds on the db. 184168404Spjd */ 185168404Spjdstatic void 186168404Spjddbuf_hash_remove(dmu_buf_impl_t *db) 187168404Spjd{ 188168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 189168404Spjd uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 190168404Spjd db->db_level, db->db_blkid); 191168404Spjd uint64_t idx = hv & h->hash_table_mask; 192168404Spjd dmu_buf_impl_t *dbf, **dbp; 193168404Spjd 194168404Spjd /* 195168404Spjd * We musn't hold db_mtx to maintin lock ordering: 196168404Spjd * DBUF_HASH_MUTEX > db_mtx. 
197168404Spjd */ 198168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 199168404Spjd ASSERT(db->db_state == DB_EVICTING); 200168404Spjd ASSERT(!MUTEX_HELD(&db->db_mtx)); 201168404Spjd 202168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 203168404Spjd dbp = &h->hash_table[idx]; 204168404Spjd while ((dbf = *dbp) != db) { 205168404Spjd dbp = &dbf->db_hash_next; 206168404Spjd ASSERT(dbf != NULL); 207168404Spjd } 208168404Spjd *dbp = db->db_hash_next; 209168404Spjd db->db_hash_next = NULL; 210168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 211168404Spjd atomic_add_64(&dbuf_hash_count, -1); 212168404Spjd} 213168404Spjd 214168404Spjdstatic arc_evict_func_t dbuf_do_evict; 215168404Spjd 216168404Spjdstatic void 217168404Spjddbuf_evict_user(dmu_buf_impl_t *db) 218168404Spjd{ 219168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 220168404Spjd 221168404Spjd if (db->db_level != 0 || db->db_evict_func == NULL) 222168404Spjd return; 223168404Spjd 224168404Spjd if (db->db_user_data_ptr_ptr) 225168404Spjd *db->db_user_data_ptr_ptr = db->db.db_data; 226168404Spjd db->db_evict_func(&db->db, db->db_user_ptr); 227168404Spjd db->db_user_ptr = NULL; 228168404Spjd db->db_user_data_ptr_ptr = NULL; 229168404Spjd db->db_evict_func = NULL; 230168404Spjd} 231168404Spjd 232219089Spjdboolean_t 233219089Spjddbuf_is_metadata(dmu_buf_impl_t *db) 234219089Spjd{ 235219089Spjd if (db->db_level > 0) { 236219089Spjd return (B_TRUE); 237219089Spjd } else { 238219089Spjd boolean_t is_metadata; 239219089Spjd 240219089Spjd DB_DNODE_ENTER(db); 241243674Smm is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); 242219089Spjd DB_DNODE_EXIT(db); 243219089Spjd 244219089Spjd return (is_metadata); 245219089Spjd } 246219089Spjd} 247219089Spjd 248168404Spjdvoid 249168404Spjddbuf_evict(dmu_buf_impl_t *db) 250168404Spjd{ 251168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 252168404Spjd ASSERT(db->db_buf == NULL); 253168404Spjd ASSERT(db->db_data_pending == NULL); 254168404Spjd 255168404Spjd dbuf_clear(db); 256168404Spjd 
dbuf_destroy(db); 257168404Spjd} 258168404Spjd 259168404Spjdvoid 260168404Spjddbuf_init(void) 261168404Spjd{ 262168404Spjd uint64_t hsize = 1ULL << 16; 263168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 264168404Spjd int i; 265168404Spjd 266168404Spjd /* 267168404Spjd * The hash table is big enough to fill all of physical memory 268168404Spjd * with an average 4K block size. The table will take up 269168404Spjd * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 270168404Spjd */ 271168696Spjd while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) 272168404Spjd hsize <<= 1; 273168404Spjd 274168404Spjdretry: 275168404Spjd h->hash_table_mask = hsize - 1; 276168404Spjd h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 277168404Spjd if (h->hash_table == NULL) { 278168404Spjd /* XXX - we should really return an error instead of assert */ 279168404Spjd ASSERT(hsize > (1ULL << 10)); 280168404Spjd hsize >>= 1; 281168404Spjd goto retry; 282168404Spjd } 283168404Spjd 284168404Spjd dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 285168404Spjd sizeof (dmu_buf_impl_t), 286168404Spjd 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 287168404Spjd 288168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 289168404Spjd mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 290168404Spjd} 291168404Spjd 292168404Spjdvoid 293168404Spjddbuf_fini(void) 294168404Spjd{ 295168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 296168404Spjd int i; 297168404Spjd 298168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 299168404Spjd mutex_destroy(&h->hash_mutexes[i]); 300168404Spjd kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 301168404Spjd kmem_cache_destroy(dbuf_cache); 302168404Spjd} 303168404Spjd 304168404Spjd/* 305168404Spjd * Other stuff. 
306168404Spjd */ 307168404Spjd 308168404Spjd#ifdef ZFS_DEBUG 309168404Spjdstatic void 310168404Spjddbuf_verify(dmu_buf_impl_t *db) 311168404Spjd{ 312219089Spjd dnode_t *dn; 313219089Spjd dbuf_dirty_record_t *dr; 314168404Spjd 315168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 316168404Spjd 317168404Spjd if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 318168404Spjd return; 319168404Spjd 320168404Spjd ASSERT(db->db_objset != NULL); 321219089Spjd DB_DNODE_ENTER(db); 322219089Spjd dn = DB_DNODE(db); 323168404Spjd if (dn == NULL) { 324168404Spjd ASSERT(db->db_parent == NULL); 325168404Spjd ASSERT(db->db_blkptr == NULL); 326168404Spjd } else { 327168404Spjd ASSERT3U(db->db.db_object, ==, dn->dn_object); 328168404Spjd ASSERT3P(db->db_objset, ==, dn->dn_objset); 329168404Spjd ASSERT3U(db->db_level, <, dn->dn_nlevels); 330219089Spjd ASSERT(db->db_blkid == DMU_BONUS_BLKID || 331219089Spjd db->db_blkid == DMU_SPILL_BLKID || 332219089Spjd !list_is_empty(&dn->dn_dbufs)); 333168404Spjd } 334219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 335168404Spjd ASSERT(dn != NULL); 336185029Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 337219089Spjd ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); 338219089Spjd } else if (db->db_blkid == DMU_SPILL_BLKID) { 339219089Spjd ASSERT(dn != NULL); 340219089Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 341243674Smm ASSERT0(db->db.db_offset); 342168404Spjd } else { 343168404Spjd ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 344168404Spjd } 345168404Spjd 346219089Spjd for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) 347219089Spjd ASSERT(dr->dr_dbuf == db); 348219089Spjd 349219089Spjd for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) 350219089Spjd ASSERT(dr->dr_dbuf == db); 351219089Spjd 352208047Smm /* 353208047Smm * We can't assert that db_size matches dn_datablksz because it 354208047Smm * can be momentarily different when another thread is doing 355208047Smm * dnode_set_blksz(). 
356208047Smm */ 357208047Smm if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 358219089Spjd dr = db->db_data_pending; 359208047Smm /* 360208047Smm * It should only be modified in syncing context, so 361208047Smm * make sure we only have one copy of the data. 362208047Smm */ 363208047Smm ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 364168404Spjd } 365168404Spjd 366168404Spjd /* verify db->db_blkptr */ 367168404Spjd if (db->db_blkptr) { 368168404Spjd if (db->db_parent == dn->dn_dbuf) { 369168404Spjd /* db is pointed to by the dnode */ 370168404Spjd /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 371209962Smm if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) 372168404Spjd ASSERT(db->db_parent == NULL); 373168404Spjd else 374168404Spjd ASSERT(db->db_parent != NULL); 375219089Spjd if (db->db_blkid != DMU_SPILL_BLKID) 376219089Spjd ASSERT3P(db->db_blkptr, ==, 377219089Spjd &dn->dn_phys->dn_blkptr[db->db_blkid]); 378168404Spjd } else { 379168404Spjd /* db is pointed to by an indirect block */ 380168404Spjd int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 381168404Spjd ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 382168404Spjd ASSERT3U(db->db_parent->db.db_object, ==, 383168404Spjd db->db.db_object); 384168404Spjd /* 385168404Spjd * dnode_grow_indblksz() can make this fail if we don't 386168404Spjd * have the struct_rwlock. XXX indblksz no longer 387168404Spjd * grows. safe to do this now? 
388168404Spjd */ 389219089Spjd if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 390168404Spjd ASSERT3P(db->db_blkptr, ==, 391168404Spjd ((blkptr_t *)db->db_parent->db.db_data + 392168404Spjd db->db_blkid % epb)); 393168404Spjd } 394168404Spjd } 395168404Spjd } 396168404Spjd if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 397219089Spjd (db->db_buf == NULL || db->db_buf->b_data) && 398219089Spjd db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && 399168404Spjd db->db_state != DB_FILL && !dn->dn_free_txg) { 400168404Spjd /* 401168404Spjd * If the blkptr isn't set but they have nonzero data, 402168404Spjd * it had better be dirty, otherwise we'll lose that 403168404Spjd * data when we evict this buffer. 404168404Spjd */ 405168404Spjd if (db->db_dirtycnt == 0) { 406168404Spjd uint64_t *buf = db->db.db_data; 407168404Spjd int i; 408168404Spjd 409168404Spjd for (i = 0; i < db->db.db_size >> 3; i++) { 410168404Spjd ASSERT(buf[i] == 0); 411168404Spjd } 412168404Spjd } 413168404Spjd } 414219089Spjd DB_DNODE_EXIT(db); 415168404Spjd} 416168404Spjd#endif 417168404Spjd 418168404Spjdstatic void 419168404Spjddbuf_update_data(dmu_buf_impl_t *db) 420168404Spjd{ 421168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 422168404Spjd if (db->db_level == 0 && db->db_user_data_ptr_ptr) { 423168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 424168404Spjd *db->db_user_data_ptr_ptr = db->db.db_data; 425168404Spjd } 426168404Spjd} 427168404Spjd 428168404Spjdstatic void 429168404Spjddbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 430168404Spjd{ 431168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 432168404Spjd ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); 433168404Spjd db->db_buf = buf; 434168404Spjd if (buf != NULL) { 435168404Spjd ASSERT(buf->b_data != NULL); 436168404Spjd db->db.db_data = buf->b_data; 437168404Spjd if (!arc_released(buf)) 438168404Spjd arc_set_callback(buf, dbuf_do_evict, db); 439168404Spjd dbuf_update_data(db); 440168404Spjd } else { 441168404Spjd 
dbuf_evict_user(db); 442168404Spjd db->db.db_data = NULL; 443219089Spjd if (db->db_state != DB_NOFILL) 444219089Spjd db->db_state = DB_UNCACHED; 445168404Spjd } 446168404Spjd} 447168404Spjd 448219089Spjd/* 449219089Spjd * Loan out an arc_buf for read. Return the loaned arc_buf. 450219089Spjd */ 451219089Spjdarc_buf_t * 452219089Spjddbuf_loan_arcbuf(dmu_buf_impl_t *db) 453219089Spjd{ 454219089Spjd arc_buf_t *abuf; 455219089Spjd 456219089Spjd mutex_enter(&db->db_mtx); 457219089Spjd if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { 458219089Spjd int blksz = db->db.db_size; 459263398Sdelphij spa_t *spa = db->db_objset->os_spa; 460219089Spjd 461219089Spjd mutex_exit(&db->db_mtx); 462219089Spjd abuf = arc_loan_buf(spa, blksz); 463219089Spjd bcopy(db->db.db_data, abuf->b_data, blksz); 464219089Spjd } else { 465219089Spjd abuf = db->db_buf; 466219089Spjd arc_loan_inuse_buf(abuf, db); 467219089Spjd dbuf_set_data(db, NULL); 468219089Spjd mutex_exit(&db->db_mtx); 469219089Spjd } 470219089Spjd return (abuf); 471219089Spjd} 472219089Spjd 473168404Spjduint64_t 474168404Spjddbuf_whichblock(dnode_t *dn, uint64_t offset) 475168404Spjd{ 476168404Spjd if (dn->dn_datablkshift) { 477168404Spjd return (offset >> dn->dn_datablkshift); 478168404Spjd } else { 479168404Spjd ASSERT3U(offset, <, dn->dn_datablksz); 480168404Spjd return (0); 481168404Spjd } 482168404Spjd} 483168404Spjd 484168404Spjdstatic void 485168404Spjddbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 486168404Spjd{ 487168404Spjd dmu_buf_impl_t *db = vdb; 488168404Spjd 489168404Spjd mutex_enter(&db->db_mtx); 490168404Spjd ASSERT3U(db->db_state, ==, DB_READ); 491168404Spjd /* 492168404Spjd * All reads are synchronous, so we must have a hold on the dbuf 493168404Spjd */ 494168404Spjd ASSERT(refcount_count(&db->db_holds) > 0); 495168404Spjd ASSERT(db->db_buf == NULL); 496168404Spjd ASSERT(db->db.db_data == NULL); 497168404Spjd if (db->db_level == 0 && db->db_freed_in_flight) { 498168404Spjd /* we were 
freed in flight; disregard any error */ 499168404Spjd arc_release(buf, db); 500168404Spjd bzero(buf->b_data, db->db.db_size); 501168404Spjd arc_buf_freeze(buf); 502168404Spjd db->db_freed_in_flight = FALSE; 503168404Spjd dbuf_set_data(db, buf); 504168404Spjd db->db_state = DB_CACHED; 505168404Spjd } else if (zio == NULL || zio->io_error == 0) { 506168404Spjd dbuf_set_data(db, buf); 507168404Spjd db->db_state = DB_CACHED; 508168404Spjd } else { 509219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 510168404Spjd ASSERT3P(db->db_buf, ==, NULL); 511249643Smm VERIFY(arc_buf_remove_ref(buf, db)); 512168404Spjd db->db_state = DB_UNCACHED; 513168404Spjd } 514168404Spjd cv_broadcast(&db->db_changed); 515219089Spjd dbuf_rele_and_unlock(db, NULL); 516168404Spjd} 517168404Spjd 518168404Spjdstatic void 519168404Spjddbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) 520168404Spjd{ 521219089Spjd dnode_t *dn; 522168404Spjd zbookmark_t zb; 523168404Spjd uint32_t aflags = ARC_NOWAIT; 524168404Spjd 525219089Spjd DB_DNODE_ENTER(db); 526219089Spjd dn = DB_DNODE(db); 527168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 528168404Spjd /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ 529185029Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 530168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 531168404Spjd ASSERT(db->db_state == DB_UNCACHED); 532168404Spjd ASSERT(db->db_buf == NULL); 533168404Spjd 534219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 535207624Smm int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); 536185029Spjd 537185029Spjd ASSERT3U(bonuslen, <=, db->db.db_size); 538168404Spjd db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 539208373Smm arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 540185029Spjd if (bonuslen < DN_MAX_BONUSLEN) 541168404Spjd bzero(db->db.db_data, DN_MAX_BONUSLEN); 542207624Smm if (bonuslen) 543207624Smm bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); 544219089Spjd DB_DNODE_EXIT(db); 545168404Spjd dbuf_update_data(db); 546168404Spjd db->db_state = DB_CACHED; 547168404Spjd mutex_exit(&db->db_mtx); 548168404Spjd return; 549168404Spjd } 550168404Spjd 551185029Spjd /* 552185029Spjd * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() 553185029Spjd * processes the delete record and clears the bp while we are waiting 554185029Spjd * for the dn_mtx (resulting in a "no" from block_freed). 
555185029Spjd */ 556185029Spjd if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || 557185029Spjd (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || 558185029Spjd BP_IS_HOLE(db->db_blkptr)))) { 559168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 560168404Spjd 561263398Sdelphij DB_DNODE_EXIT(db); 562263398Sdelphij dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, 563168404Spjd db->db.db_size, db, type)); 564168404Spjd bzero(db->db.db_data, db->db.db_size); 565168404Spjd db->db_state = DB_CACHED; 566168404Spjd *flags |= DB_RF_CACHED; 567168404Spjd mutex_exit(&db->db_mtx); 568168404Spjd return; 569168404Spjd } 570168404Spjd 571219089Spjd DB_DNODE_EXIT(db); 572219089Spjd 573168404Spjd db->db_state = DB_READ; 574168404Spjd mutex_exit(&db->db_mtx); 575168404Spjd 576185029Spjd if (DBUF_IS_L2CACHEABLE(db)) 577185029Spjd aflags |= ARC_L2CACHE; 578252140Sdelphij if (DBUF_IS_L2COMPRESSIBLE(db)) 579252140Sdelphij aflags |= ARC_L2COMPRESS; 580185029Spjd 581219089Spjd SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 582219089Spjd db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, 583219089Spjd db->db.db_object, db->db_level, db->db_blkid); 584168404Spjd 585168404Spjd dbuf_add_ref(db, NULL); 586185029Spjd 587263398Sdelphij (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, 588168404Spjd dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 589168404Spjd (*flags & DB_RF_CANFAIL) ? 
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 590168404Spjd &aflags, &zb); 591168404Spjd if (aflags & ARC_CACHED) 592168404Spjd *flags |= DB_RF_CACHED; 593168404Spjd} 594168404Spjd 595168404Spjdint 596168404Spjddbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 597168404Spjd{ 598168404Spjd int err = 0; 599263398Sdelphij boolean_t havepzio = (zio != NULL); 600263398Sdelphij boolean_t prefetch; 601219089Spjd dnode_t *dn; 602168404Spjd 603168404Spjd /* 604168404Spjd * We don't have to hold the mutex to check db_state because it 605168404Spjd * can't be freed while we have a hold on the buffer. 606168404Spjd */ 607168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 608168404Spjd 609219089Spjd if (db->db_state == DB_NOFILL) 610249643Smm return (SET_ERROR(EIO)); 611219089Spjd 612219089Spjd DB_DNODE_ENTER(db); 613219089Spjd dn = DB_DNODE(db); 614168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 615219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 616168404Spjd 617219089Spjd prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 618219089Spjd (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && 619185029Spjd DBUF_IS_CACHEABLE(db); 620168404Spjd 621168404Spjd mutex_enter(&db->db_mtx); 622168404Spjd if (db->db_state == DB_CACHED) { 623168404Spjd mutex_exit(&db->db_mtx); 624168404Spjd if (prefetch) 625219089Spjd dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 626168404Spjd db->db.db_size, TRUE); 627168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 628219089Spjd rw_exit(&dn->dn_struct_rwlock); 629219089Spjd DB_DNODE_EXIT(db); 630168404Spjd } else if (db->db_state == DB_UNCACHED) { 631219089Spjd spa_t *spa = dn->dn_objset->os_spa; 632219089Spjd 633219089Spjd if (zio == NULL) 634219089Spjd zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 635168404Spjd dbuf_read_impl(db, zio, &flags); 636168404Spjd 637168404Spjd /* dbuf_read_impl has dropped db_mtx for us */ 638168404Spjd 639168404Spjd if (prefetch) 640219089Spjd dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 
641168404Spjd db->db.db_size, flags & DB_RF_CACHED); 642168404Spjd 643168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 644219089Spjd rw_exit(&dn->dn_struct_rwlock); 645219089Spjd DB_DNODE_EXIT(db); 646168404Spjd 647168404Spjd if (!havepzio) 648168404Spjd err = zio_wait(zio); 649168404Spjd } else { 650252749Sdelphij /* 651252749Sdelphij * Another reader came in while the dbuf was in flight 652252749Sdelphij * between UNCACHED and CACHED. Either a writer will finish 653252749Sdelphij * writing the buffer (sending the dbuf to CACHED) or the 654252749Sdelphij * first reader's request will reach the read_done callback 655252749Sdelphij * and send the dbuf to CACHED. Otherwise, a failure 656252749Sdelphij * occurred and the dbuf went to UNCACHED. 657252749Sdelphij */ 658168404Spjd mutex_exit(&db->db_mtx); 659168404Spjd if (prefetch) 660219089Spjd dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 661168404Spjd db->db.db_size, TRUE); 662168404Spjd if ((flags & DB_RF_HAVESTRUCT) == 0) 663219089Spjd rw_exit(&dn->dn_struct_rwlock); 664219089Spjd DB_DNODE_EXIT(db); 665168404Spjd 666252749Sdelphij /* Skip the wait per the caller's request. 
*/ 667168404Spjd mutex_enter(&db->db_mtx); 668168404Spjd if ((flags & DB_RF_NEVERWAIT) == 0) { 669168404Spjd while (db->db_state == DB_READ || 670168404Spjd db->db_state == DB_FILL) { 671168404Spjd ASSERT(db->db_state == DB_READ || 672168404Spjd (flags & DB_RF_HAVESTRUCT) == 0); 673168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 674168404Spjd } 675168404Spjd if (db->db_state == DB_UNCACHED) 676249643Smm err = SET_ERROR(EIO); 677168404Spjd } 678168404Spjd mutex_exit(&db->db_mtx); 679168404Spjd } 680168404Spjd 681168404Spjd ASSERT(err || havepzio || db->db_state == DB_CACHED); 682168404Spjd return (err); 683168404Spjd} 684168404Spjd 685168404Spjdstatic void 686168404Spjddbuf_noread(dmu_buf_impl_t *db) 687168404Spjd{ 688168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 689219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 690168404Spjd mutex_enter(&db->db_mtx); 691168404Spjd while (db->db_state == DB_READ || db->db_state == DB_FILL) 692168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 693168404Spjd if (db->db_state == DB_UNCACHED) { 694168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 695263398Sdelphij spa_t *spa = db->db_objset->os_spa; 696168404Spjd 697168404Spjd ASSERT(db->db_buf == NULL); 698168404Spjd ASSERT(db->db.db_data == NULL); 699219089Spjd dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); 700168404Spjd db->db_state = DB_FILL; 701219089Spjd } else if (db->db_state == DB_NOFILL) { 702219089Spjd dbuf_set_data(db, NULL); 703168404Spjd } else { 704168404Spjd ASSERT3U(db->db_state, ==, DB_CACHED); 705168404Spjd } 706168404Spjd mutex_exit(&db->db_mtx); 707168404Spjd} 708168404Spjd 709168404Spjd/* 710168404Spjd * This is our just-in-time copy function. It makes a copy of 711168404Spjd * buffers, that have been modified in a previous transaction 712168404Spjd * group, before we modify them in the current active group. 
713168404Spjd * 714168404Spjd * This function is used in two places: when we are dirtying a 715168404Spjd * buffer for the first time in a txg, and when we are freeing 716168404Spjd * a range in a dnode that includes this buffer. 717168404Spjd * 718168404Spjd * Note that when we are called from dbuf_free_range() we do 719168404Spjd * not put a hold on the buffer, we just traverse the active 720168404Spjd * dbuf list for the dnode. 721168404Spjd */ 722168404Spjdstatic void 723168404Spjddbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 724168404Spjd{ 725168404Spjd dbuf_dirty_record_t *dr = db->db_last_dirty; 726168404Spjd 727168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 728168404Spjd ASSERT(db->db.db_data != NULL); 729168404Spjd ASSERT(db->db_level == 0); 730168404Spjd ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); 731168404Spjd 732168404Spjd if (dr == NULL || 733168404Spjd (dr->dt.dl.dr_data != 734219089Spjd ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) 735168404Spjd return; 736168404Spjd 737168404Spjd /* 738168404Spjd * If the last dirty record for this dbuf has not yet synced 739168404Spjd * and its referencing the dbuf data, either: 740219089Spjd * reset the reference to point to a new copy, 741168404Spjd * or (if there a no active holders) 742168404Spjd * just null out the current db_data pointer. 
 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		/*
		 * There are holds beyond the dirty records, so someone may
		 * still be reading db_data: give the dirty record its own
		 * private copy instead of stealing the buffer.
		 */
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}

/*
 * Undo an "override" write (dmu_sync/nopwrite) recorded on a level-0
 * dirty record: free the already-written block (unless it was a
 * nopwrite or a hole) and reset the record to DR_NOT_OVERRIDDEN so the
 * buffer will be written out normally at sync time.
 * Caller must hold db_mtx.
 */
void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	/* Bonus buffers are never overridden; nothing to undo otherwise. */
	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
		zio_free(db->db_objset->os_spa, txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if its unreferenced) or clear (if its referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.
 *
 * This is a no-op if the dataset is in the middle of an incremental
 * receive; see comment below for details.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;

	/* Clamp the range to the allocated blocks (spill is special). */
	if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID))
		end = dn->dn_maxblkid;
	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);

	mutex_enter(&dn->dn_dbufs_mtx);
	/*
	 * NOTE(review): this compares the block id "start" against
	 * dn_unlisted_l0_blkid (a block id) multiplied by dn_datablksz
	 * (bytes), i.e. a byte offset.  The inflated threshold is
	 * conservative -- it only makes the early return fire less often,
	 * so this is a performance issue, not a correctness one -- but it
	 * looks like the multiplication should not be here; confirm
	 * against upstream illumos/OpenZFS.
	 */
	if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
		/* There can't be any dbufs in this range; no need to search. */
		mutex_exit(&dn->dn_dbufs_mtx);
		return;
	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
		/*
		 * If we are receiving, we expect there to be no dbufs in
		 * the range to be freed, because receive modifies each
		 * block at most once, and in offset order.  If this is
		 * not the case, it can lead to performance problems,
		 * so note that we unexpectedly took the slow path.
		 */
		atomic_inc_64(&zfs_free_range_recv_miss);
	}

	/* Walk every dbuf of this dnode looking for level-0 hits in range. */
	for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		if (db->db_level != 0)
			continue;
		if (db->db_blkid < start || db->db_blkid > end)
			continue;

		/* found a level 0 buffer in the range */
		mutex_enter(&db->db_mtx);
		if (dbuf_undirty(db, tx)) {
			/* mutex has been dropped and dbuf destroyed */
			continue;
		}

		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if its not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if its cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

/*
 * Return whether the block backing this dbuf would free up space if
 * overwritten, i.e. whether its birth txg is before the most recent
 * snapshot (or there is no dataset).  Returns B_FALSE for holes and
 * never-written blocks.  Caller must hold db_mtx.
 */
static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 *
	 * This logic ensures that only block births for
	 * filled blocks are considered.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty && (db->db_blkptr == NULL ||
	    !BP_IS_HOLE(db->db_blkptr))) {
		birth_txg = db->db_last_dirty->dr_txg;
	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
		birth_txg = db->db_blkptr->blk_birth;
	}

	/*
	 * If this block don't exist or is in a snapshot, it can't be freed.
	 * Don't pass the bp to dsl_dataset_block_freeable() since we
	 * are holding the db_mtx lock and might deadlock if we are
	 * prefetching a dedup-ed block.
	 */
	if (birth_txg != 0)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
	else
		return (B_FALSE);
}

/*
 * Resize the dbuf's data buffer to "size" bytes, copying the old
 * contents (truncated or zero-extended as needed) and dirtying the
 * buffer in "tx".  Not valid for bonus buffers.  Caller must hold the
 * dnode's struct_rwlock as writer.
 */
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dmu_buf_will_dirty(&db->db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db));
	db->db.db_size = size;

	if (db->db_level == 0) {
		/* Point the open-txg dirty record at the new buffer. */
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(dn, size-osize, tx);
	DB_DNODE_EXIT(db);
}

/*
 * Release the dbuf's ARC buffer so its block pointer may be rewritten
 * (e.g. during scrub/resilver).  Only valid in syncing context.
 */
void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os = db->db_objset;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	(void) arc_release(db->db_buf, db);
}

/*
 * Mark a dbuf dirty in transaction "tx", creating (or reusing) the
 * dirty record for tx's txg and recursively dirtying the parent
 * indirect chain up to the dnode.  Returns the dirty record for this
 * dbuf in this txg.  Takes a hold on the dbuf on behalf of the txg.
 */
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	objset_t *os;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    dn->dn_objset->os_dsl_dataset == NULL);
	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
	    db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	if (db->db_blkid == DMU_SPILL_BLKID)
		dn->dn_have_spill = B_TRUE;

	/*
	 * If this buffer is already dirty, we're done.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	/* Walk the (txg-descending) list to find this txg's record. */
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		DB_DNODE_EXIT(db);

		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(dr);
			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
			    db->db_state != DB_NOFILL)
				arc_buf_thaw(db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		return (dr);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_object == 0 ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos or we're initializing the os or it's a special object.
	 * However, we are allowed to dirty in syncing context provided
	 * we already dirtied it in open context.  Hence we must make
	 * this assertion only if we're not already dirty.
	 */
	os = dn->dn_objset;
	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx.  This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dsize() while
		 * also holding the db_mtx.
		 */
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_state != DB_NOFILL) {
			if (db->db_blkid == DMU_BONUS_BLKID) {
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db.db_data;
			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
				/*
				 * Release the data buffer from the cache so
				 * that we can modify it without impacting
				 * possible other users of this cached data
				 * block.  Note that indirect blocks and
				 * private objects are not released until the
				 * syncing state (since they are only modified
				 * then).
				 */
				arc_release(db->db_buf, db);
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db_buf;
			}
			ASSERT(data_old != NULL);
		}
		dr->dt.dl.dr_data = data_old;
	} else {
		/* Indirect blocks get a child list instead of data. */
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
		dr->dr_accounted = db->db.db_size;
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_blkid != DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_free_ranges[txgoff] != NULL) {
			range_tree_clear(dn->dn_free_ranges[txgoff],
			    db->db_blkid, 1);
		}
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    db->db_blkid == DMU_SPILL_BLKID) {
		/* Bonus/spill blocks hang directly off the dnode. */
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		DB_DNODE_EXIT(db);
		return (dr);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
		/*
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet.  We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
		 */
		ddt_prefetch(os->os_spa, bp);
		dnode_willuse_space(dn, -willfree, tx);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		/* Dirty the parent indirect block, recursively. */
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			ASSERT(parent != NULL);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/*
		 * Since we've dropped the mutex, it's possible that
		 * dbuf_undirty() might have changed this out from under us.
		 */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}
		mutex_exit(&db->db_mtx);
	} else {
		/* Top level: the record goes on the dnode's dirty list. */
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
	DB_DNODE_EXIT(db);
	return (dr);
}

/*
 * Undirty a buffer in the transaction group referenced by the given
 * transaction.  Return whether this evicted the dbuf.
 */
static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(txg != 0);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT0(db->db_level);
	ASSERT(MUTEX_HELD(&db->db_mtx));

	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg)
		return (B_FALSE);
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/*
	 * Any space we accounted for in dp_dirty_* will be cleaned up by
	 * dsl_pool_sync().  This is relatively rare so the discrepancy
	 * is not a big deal.
	 */

	/* Unlink the record from the dbuf's dirty list. */
	*drp = dr->dr_next;

	/*
	 * Note that there are three places in dbuf_dirty()
	 * where this dirty record may be put on a list.
	 * Make sure to do a list_remove corresponding to
	 * every one of those list_insert calls.
	 */
	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_blkid == DMU_SPILL_BLKID ||
	    db->db_level+1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}
	DB_DNODE_EXIT(db);

	if (db->db_state != DB_NOFILL) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		/* Drop the private copy made by dbuf_fix_old_data(), if any. */
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	/* If the txg hold was the last one, evict the dbuf entirely. */
	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		dbuf_evict(db);
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Read the buffer in (if necessary) and mark it dirty in "tx".
 */
void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	DB_DNODE_ENTER(db);
	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	DB_DNODE_EXIT(db);
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty(db, tx);
}

/*
 * Mark the dbuf as one whose contents will never be written to disk
 * (DB_NOFILL), then dirty it as a fill.
 */
void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_state = DB_NOFILL;

	dmu_buf_will_fill(db_fake, tx);
}

/*
 * Prepare the dbuf to be completely overwritten by the caller: skip
 * reading the old contents (dbuf_noread) and dirty it in "tx".
 */
void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}

#pragma weak dmu_buf_fill_done = dbuf_fill_done
/*
 * Complete a fill started by dmu_buf_will_fill(): transition the dbuf
 * from DB_FILL to DB_CACHED and wake any waiters.  If the range was
 * freed while the fill was in flight, zero the contents instead.
 */
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * Directly assign a provided arc buf to a given dbuf if it's not referenced
 * by anybody except our caller.  Otherwise copy arcbuf's contents to dbuf.
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		/*
		 * Other holders may be reading db_data, so we can't assign
		 * the buf directly -- copy its contents instead.
		 */
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db));
		xuio_stat_wbuf_copied();
		return;
	}

	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dmu_buf_fill_done(&db->db, tx);
}

/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.  For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_buf_evict()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb;
	int dbuf_gone = FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DMU_BONUS_BLKID) {
			/* Bonus buffers are plain zio_bufs, not ARC bufs. */
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dndb = dn->dn_dbuf;
	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		list_remove(&dn->dn_dbufs, db);
		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
		membar_producer();
		DB_DNODE_EXIT(db);
		/*
		 * Decrementing the dbuf count means that the hold corresponding
		 * to the removed dbuf is no longer discounted in dnode_move(),
		 * so the dnode cannot be moved until after we release the hold.
		 * The membar_producer() ensures visibility of the decremented
		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
		 * release any lock.
		 */
		dnode_rele(dn, db);
		db->db_dnode_handle = NULL;
	} else {
		DB_DNODE_EXIT(db);
	}

	if (db->db_buf)
		dbuf_gone = arc_buf_evict(db->db_buf);

	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

/*
 * Locate the parent dbuf and block pointer for the block (dn, level,
 * blkid).  On success, *parentp holds a reference the caller must
 * release, and *bpp points at the block pointer within the parent (or
 * dnode).  Returns ENOENT if the block has no parent yet.
 */
static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);

	if (blkid == DMU_SPILL_BLKID) {
		/* The spill block pointer lives in the dnode itself. */
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_have_spill &&
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
			*bpp = &dn->dn_phys->dn_spill;
		else
			*bpp = NULL;
		dbuf_add_ref(dn->dn_dbuf, NULL);
		*parentp = dn->dn_dbuf;
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (SET_ERROR(ENOENT));
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}

/*
 * Allocate and initialize a new dmu_buf_impl_t for (dn, level, blkid),
 * insert it into the dbuf hash table and the dnode's dbuf list, and
 * take the appropriate holds.  If another thread raced us and inserted
 * the same dbuf first, free ours and return theirs.
 */
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		/* Bonus size excludes space consumed by extra blkptrs. */
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before its added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	list_insert_head(&dn->dn_dbufs, db);
	/* Track the upper bound of level-0 blkids present in dn_dbufs. */
	if (db->db_level == 0 && db->db_blkid >=
	    dn->dn_unlisted_l0_blkid)
		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);
	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}

/*
 * ARC eviction callback: evict or destroy the dbuf attached to the
 * given arc buf, depending on whether dbuf_clear() already ran.
 */
static int
dbuf_do_evict(void *private)
{
	arc_buf_t *buf = private;
	dmu_buf_impl_t *db = buf->b_private;

	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		DBUF_VERIFY(db);
		db->db_buf = NULL;
		dbuf_evict(db);
	} else {
		mutex_exit(&db->db_mtx);
		dbuf_destroy(db);
	}
	return (0);
}

static void
/*
 * Free a dbuf that has no remaining holds: unlink it from its dnode's
 * dn_dbufs list and the dbuf hash table, release the dnode hold it
 * represents, and return the structure to the dbuf kmem cache.
 */
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (db->db_dnode_handle != NULL) {
			dnode_t *dn;

			DB_DNODE_ENTER(db);
			dn = DB_DNODE(db);
			mutex_enter(&dn->dn_dbufs_mtx);
			list_remove(&dn->dn_dbufs, db);
			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
			mutex_exit(&dn->dn_dbufs_mtx);
			DB_DNODE_EXIT(db);
			/*
			 * Decrementing the dbuf count means that the hold
			 * corresponding to the removed dbuf is no longer
			 * discounted in dnode_move(), so the dnode cannot be
			 * moved until after we release the hold.
			 */
			dnode_rele(dn, db);
			db->db_dnode_handle = NULL;
		}
		dbuf_hash_remove(db);
	}
	db->db_parent = NULL;
	db->db_buf = NULL;

	ASSERT(!list_link_active(&db->db_link));
	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}

/*
 * Issue an asynchronous, speculative read of level-0 block 'blkid' so
 * that it is likely to be cached before it is actually needed.  No-op
 * if the block was freed or is already represented by a cached dbuf.
 * Caller must hold dn_struct_rwlock.
 */
void
dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
{
	dmu_buf_impl_t *db = NULL;
	blkptr_t *bp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (dnode_block_freed(dn, blkid))
		return;

	/* dbuf_find() returns with db_mtx held */
	if (db = dbuf_find(dn, 0, blkid)) {
		/*
		 * This dbuf is already in the cache.  We assume that
		 * it is already CACHED, or else about to be either
		 * read or filled.
		 */
		mutex_exit(&db->db_mtx);
		return;
	}

	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
		if (bp && !BP_IS_HOLE(bp)) {
			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
			zbookmark_t zb;

			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
			    dn->dn_object, 0, blkid);

			/* Fire-and-forget read; failures are tolerated. */
			(void) arc_read(NULL, dn->dn_objset->os_spa,
			    bp, NULL, NULL, prio,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zb);
		}
		/* dbuf_findbp() may have returned a held parent dbuf. */
		if (db)
			dbuf_rele(db, NULL);
	}
}

/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		/* Not cached: locate the parent/blkptr and create it. */
		blkptr_t *bp = NULL;
		int err;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			/* Treat a hole as if the block did not exist. */
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = SET_ERROR(ENOENT);
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			/* The ARC data vanished from under us; retry. */
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(dn->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(DB_DNODE(db), ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}

/* Hold a level-0 block; convenience wrapper for dbuf_hold_impl(). */
dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
}

/* Hold a block at an arbitrary indirection level; NULL on failure. */
dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
	return (err ? 
NULL : db);
}

/*
 * Create the bonus-buffer dbuf for a dnode.  Caller must hold
 * dn_struct_rwlock as writer, and dn_bonus must not already exist.
 */
void
dbuf_create_bonus(dnode_t *dn)
{
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
}

/*
 * Change the block size of a spill-block dbuf.  The size is clamped to
 * [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE] and rounded up to a multiple of
 * SPA_MINBLOCKSIZE.  Returns ENOTSUP if the dbuf is not a spill block.
 */
int
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;

	if (db->db_blkid != DMU_SPILL_BLKID)
		return (SET_ERROR(ENOTSUP));
	if (blksz == 0)
		blksz = SPA_MINBLOCKSIZE;
	if (blksz > SPA_MAXBLOCKSIZE)
		blksz = SPA_MAXBLOCKSIZE;
	else
		blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dbuf_new_size(db, blksz, tx);
	rw_exit(&dn->dn_struct_rwlock);
	DB_DNODE_EXIT(db);

	return (0);
}

/* Free a dnode's spill block in the given transaction. */
void
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
}

#pragma weak dmu_buf_add_ref = dbuf_add_ref
/* Add a hold to an already-held dbuf (holds must already be >= 1). */
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds = refcount_add(&db->db_holds, tag);
	ASSERT(holds > 1);
}

/*
 * If you call dbuf_rele() you had better not be referencing the dnode handle
 * unless you have some other direct or indirect hold on the dnode.  (An
 * indirect hold is a hold on one of the dnode's dbufs, including the bonus
 * buffer.)  Without that, the dbuf_rele() could lead to a dnode_rele()
 * followed by the dnode's parent dbuf evicting its dnode handles.
 */
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
	mutex_enter(&db->db_mtx);
	dbuf_rele_and_unlock(db, tag);
}

/* Public (dmu_buf_t) wrapper around dbuf_rele(). */
void
dmu_buf_rele(dmu_buf_t *db, void *tag)
{
	dbuf_rele((dmu_buf_impl_t *)db, tag);
}

/*
 * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
 * db_dirtycnt and db_holds to be updated atomically.
 */
void
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	DBUF_VERIFY(db);

	/*
	 * Remove the reference to the dbuf before removing its hold on the
	 * dnode so we can guarantee in dnode_move() that a referenced bonus
	 * buffer has a corresponding dnode hold.
	 */
	holds = refcount_remove(&db->db_holds, tag);
	ASSERT(holds >= 0);

	/*
	 * We can't freeze indirects if there is a possibility that they
	 * may be modified in the current syncing context.
	 */
	if (db->db_buf && holds == (db->db_level == 0 ? 
db->db_dirtycnt : 0))
		arc_buf_freeze(db->db_buf);

	/* Last non-dirty hold dropped: run the user eviction callback. */
	if (holds == db->db_dirtycnt &&
	    db->db_level == 0 && db->db_immediate_evict)
		dbuf_evict_user(db);

	if (holds == 0) {
		if (db->db_blkid == DMU_BONUS_BLKID) {
			mutex_exit(&db->db_mtx);

			/*
			 * If the dnode moves here, we cannot cross this barrier
			 * until the move completes.
			 */
			DB_DNODE_ENTER(db);
			(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
			DB_DNODE_EXIT(db);
			/*
			 * The bonus buffer's dnode hold is no longer discounted
			 * in dnode_move().  The dnode cannot move until after
			 * the dnode_rele().
			 */
			dnode_rele(DB_DNODE(db), db);
		} else if (db->db_buf == NULL) {
			/*
			 * This is a special case: we never associated this
			 * dbuf with any data allocated from the ARC.
			 */
			ASSERT(db->db_state == DB_UNCACHED ||
			    db->db_state == DB_NOFILL);
			dbuf_evict(db);
		} else if (arc_released(db->db_buf)) {
			arc_buf_t *buf = db->db_buf;
			/*
			 * This dbuf has anonymous data associated with it.
			 */
			dbuf_set_data(db, NULL);
			VERIFY(arc_buf_remove_ref(buf, db));
			dbuf_evict(db);
		} else {
			VERIFY(!arc_buf_remove_ref(db->db_buf, db));

			/*
			 * A dbuf will be eligible for eviction if either the
			 * 'primarycache' property is set or a duplicate
			 * copy of this buffer is already cached in the arc.
			 *
			 * In the case of the 'primarycache' a buffer
			 * is considered for eviction if it matches the
			 * criteria set in the property.
			 *
			 * To decide if our buffer is considered a
			 * duplicate, we must call into the arc to determine
			 * if multiple buffers are referencing the same
			 * block on-disk.  If so, then we simply evict
			 * ourselves.
			 */
			if (!DBUF_IS_CACHEABLE(db) ||
			    arc_buf_eviction_needed(db->db_buf))
				dbuf_clear(db);
			else
				mutex_exit(&db->db_mtx);
		}
	} else {
		mutex_exit(&db->db_mtx);
	}
}

#pragma weak dmu_buf_refcount = dbuf_refcount
/* Return the current number of holds on the dbuf. */
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
	return (refcount_count(&db->db_holds));
}

/*
 * Install user data on a dbuf only if it currently has none (this
 * passes old_user_ptr == NULL); see dmu_buf_update_user() for the
 * compare-and-swap semantics.
 */
void *
dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

/*
 * Like dmu_buf_set_user(), but additionally marks the dbuf for
 * immediate user eviction once its holds drop to its dirty count
 * (see dbuf_rele_and_unlock()).
 */
void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_immediate_evict = TRUE;
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

/*
 * Compare-and-swap the dbuf's user data: if the current user pointer
 * equals old_user_ptr, install the new triple (user_ptr, data-pointer
 * pointer, evict callback) and return old_user_ptr; otherwise leave the
 * dbuf unchanged and return the current user pointer.
 */
void *
dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(db->db_level == 0);

	ASSERT((user_ptr == NULL) == (evict_func == NULL));

	mutex_enter(&db->db_mtx);

	if (db->db_user_ptr == old_user_ptr) {
		db->db_user_ptr = user_ptr;
		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
		db->db_evict_func = evict_func;

		dbuf_update_data(db);
	} else {
		old_user_ptr = db->db_user_ptr;
	}

	mutex_exit(&db->db_mtx);
	return (old_user_ptr);
}

/* Return the user data currently attached to a (held) dbuf. */
void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(!refcount_is_zero(&db->db_holds));

	return (db->db_user_ptr);
}

/*
 * Report whether this buffer's on-disk block would be freeable, per
 * dsl_dataset_block_freeable() and the block pointer's birth txg.
 * A dbuf with no block pointer is never freeable.
 */
boolean_t
dmu_buf_freeable(dmu_buf_t *dbuf)
{
	boolean_t res = B_FALSE;
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;

	if (db->db_blkptr)
		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
		    db->db_blkptr, db->db_blkptr->blk_birth);

	return (res);
}

/* Return the block pointer associated with a dbuf (may be NULL). */
blkptr_t *
dmu_buf_get_blkptr(dmu_buf_t *db)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
	return (dbi->db_blkptr);
}

/*
 * Ensure db->db_blkptr points at this dbuf's block pointer slot,
 * hooking up db_parent along the way if necessary.  db_mtx must be
 * held (it may be dropped and re-acquired to hold the parent).
 */
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	/* The spill block pointer is embedded in the dnode itself. */
	if (db->db_blkid == DMU_SPILL_BLKID) {
		db->db_blkptr = &dn->dn_phys->dn_spill;
		BP_ZERO(db->db_blkptr);
		return;
	}
	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there was
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mis-match).
		 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			/*
			 * Hold the parent indirect dbuf; db_mtx is dropped
			 * across the hold, then re-acquired.
			 */
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, db, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}

/*
 * Sync an indirect-block dirty record: issue the write for the indirect
 * block itself, then sync its dirty children into that write's zio tree.
 */
static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	/* Read the block if it hasn't been read yet. */
	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT(db->db_buf != NULL);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Indirect block size must match what the dnode thinks it is. */
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	dbuf_check_blkptr(dn, db);
	DB_DNODE_EXIT(db);

	/* Provide the pending dirty record to child dbufs */
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	/* Sync the children, then launch this block's own write. */
	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}

/*
 * Sync a level-0 (leaf) dirty record: copy bonus data into the dnode,
 * or issue the block write, making a private copy first if the open
 * context could still modify the buffer.
 */
static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	uint64_t txg = tx->tx_txg;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied.
But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
	}
	DBUF_VERIFY(db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (db->db_blkid == DMU_SPILL_BLKID) {
		/* Record on-disk that this dnode carries a spill block. */
		mutex_enter(&dn->dn_mtx);
		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode.  It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		ASSERT(*datap != NULL);
		ASSERT0(db->db_level);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		DB_DNODE_EXIT(db);

		/* If the record carried a private copy of the data, free it. */
		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db_data_pending = NULL;
		/* Unlink and free this dirty record from the dbuf's list. */
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		ASSERT(dr->dr_dbuf == db);
		*drp = dr->dr_next;
		if (dr->dr_dbuf->db_level != 0) {
			list_destroy(&dr->dt.di.dr_children);
			mutex_destroy(&dr->dt.di.dr_mtx);
		}
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		/* Drop the hold taken when this record was dirtied. */
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
		return;
	}

	os = dn->dn_objset;

	/*
	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in.  As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	if (db->db_state != DB_NOFILL &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    refcount_count(&db->db_holds) > 1 &&
	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
	    *datap == db->db_buf) {
		/*
		 * If this buffer is currently "in use" (i.e., there
		 * are active holds and db_data still references it),
		 * then make a copy before we start the write so that
		 * any modifications from the open txg will not leak
		 * into this write.
		 *
		 * NOTE: this copy does not need to be made for
		 * objects only modified in the syncing context (e.g.
		 * DNONE_DNODE blocks).
		 */
		int blksz = arc_buf_size(*datap);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
		bcopy(db->db.db_data, (*datap)->b_data, blksz);
	}
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
		/* Meta-dnode writes are issued later via dbuf_sync_list(). */
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
		DB_DNODE_EXIT(db);
	} else {
		/*
		 * Although zio_nowait() does not "wait for an IO", it does
		 * initiate the IO.  If this is an empty write it seems
		 * plausible that the IO could actually be completed before
		 * the nowait returns.  We need to DB_DNODE_EXIT() first in
		 * case zio_nowait() invalidates the dbuf.
		 */
		DB_DNODE_EXIT(db);
		zio_nowait(dr->dr_zio);
	}
}

/*
 * Sync every dirty record on 'list'.  When processing the meta-dnode,
 * records with an already-initialized zio mark the point where the
 * caller should stop and zio_wait() instead.
 */
void
dbuf_sync_list(list_t *list, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr;

	while (dr = list_head(list)) {
		if (dr->dr_zio != NULL) {
			/*
			 * If we find an already initialized zio then we
			 * are processing the meta-dnode, and we have finished.
			 * The dbufs for all dnodes are put back on the list
			 * during processing, so that we can zio_wait()
			 * these IOs after initiating all child IOs.
			 */
			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
			    DMU_META_DNODE_OBJECT);
			break;
		}
		list_remove(list, dr);
		if (dr->dr_dbuf->db_level > 0)
			dbuf_sync_indirect(dr, tx);
		else
			dbuf_sync_leaf(dr, tx);
	}
}

/*
 * zio "ready" callback: the write's contents are final, so update the
 * dnode's space accounting and compute the block pointer's fill count.
 */
/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	spa_t *spa = zio->io_spa;
	int64_t delta;
	uint64_t fill = 0;
	int i;

	ASSERT(db->db_blkptr == bp);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Charge the dnode for the change in allocated size. */
	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	/* A born block must carry the type and level this dbuf expects. */
	if (bp->blk_birth != 0) {
		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_type) ||
		    (db->db_blkid == DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_bonustype));
		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
	}

	mutex_enter(&db->db_mtx);

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
	}
#endif

	if (db->db_level == 0) {
		/* Grow the on-disk maxblkid if this block extends it. */
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    db->db_blkid != DMU_SPILL_BLKID)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			/* Count the allocated dnodes packed in this block. */
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			/* An ordinary data block: filled unless a hole. */
			if (BP_IS_HOLE(bp)) {
				fill = 0;
			} else {
				fill = 1;
			}
		}
	} else {
		/* Indirect block: sum fill counts of non-hole children. */
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += ibp->blk_fill;
		}
	}
	DB_DNODE_EXIT(db);

	bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);
}

/*
 * The SPA will call this callback several times for each zio - once
 * for every physical child i/o (zio->io_phys_children times).  This
 * allows the DMU to monitor the progress of each logical i/o.  For example,
 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
 * block.  There may be a long delay before all copies/fragments are completed,
 * so this callback allows us to retire dirty space gradually, as the physical
 * i/os complete.
 */
/* ARGSUSED */
static void
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
{
	dmu_buf_impl_t *db = arg;
	objset_t *os = db->db_objset;
	dsl_pool_t *dp = dmu_objset_pool(os);
	dbuf_dirty_record_t *dr;
	int delta = 0;

	dr = db->db_data_pending;
	ASSERT3U(dr->dr_txg, ==, zio->io_txg);

	/*
	 * The callback will be called io_phys_children times.  Retire one
	 * portion of our dirty space each time we are called.  Any rounding
	 * error will be cleaned up by dsl_pool_sync()'s call to
	 * dsl_pool_undirty_space().
2548260764Savg */ 2549260764Savg delta = dr->dr_accounted / zio->io_phys_children; 2550260764Savg dsl_pool_undirty_space(dp, delta, zio->io_txg); 2551260764Savg} 2552260764Savg 2553260764Savg/* ARGSUSED */ 2554260764Savgstatic void 2555168404Spjddbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 2556168404Spjd{ 2557168404Spjd dmu_buf_impl_t *db = vdb; 2558219089Spjd blkptr_t *bp_orig = &zio->io_bp_orig; 2559263398Sdelphij blkptr_t *bp = db->db_blkptr; 2560263398Sdelphij objset_t *os = db->db_objset; 2561263398Sdelphij dmu_tx_t *tx = os->os_synctx; 2562168404Spjd dbuf_dirty_record_t **drp, *dr; 2563168404Spjd 2564243674Smm ASSERT0(zio->io_error); 2565219089Spjd ASSERT(db->db_blkptr == bp); 2566168404Spjd 2567244087Smm /* 2568244087Smm * For nopwrites and rewrites we ensure that the bp matches our 2569244087Smm * original and bypass all the accounting. 2570244087Smm */ 2571244087Smm if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { 2572219089Spjd ASSERT(BP_EQUAL(bp, bp_orig)); 2573219089Spjd } else { 2574263398Sdelphij dsl_dataset_t *ds = os->os_dsl_dataset; 2575219089Spjd (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 2576219089Spjd dsl_dataset_block_born(ds, bp, tx); 2577219089Spjd } 2578219089Spjd 2579168404Spjd mutex_enter(&db->db_mtx); 2580168404Spjd 2581219089Spjd DBUF_VERIFY(db); 2582219089Spjd 2583168404Spjd drp = &db->db_last_dirty; 2584185029Spjd while ((dr = *drp) != db->db_data_pending) 2585185029Spjd drp = &dr->dr_next; 2586185029Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 2587219089Spjd ASSERT(dr->dr_dbuf == db); 2588185029Spjd ASSERT(dr->dr_next == NULL); 2589185029Spjd *drp = dr->dr_next; 2590168404Spjd 2591219089Spjd#ifdef ZFS_DEBUG 2592219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 2593219089Spjd dnode_t *dn; 2594219089Spjd 2595219089Spjd DB_DNODE_ENTER(db); 2596219089Spjd dn = DB_DNODE(db); 2597219089Spjd ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2598219089Spjd 
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2599219089Spjd db->db_blkptr == &dn->dn_phys->dn_spill); 2600219089Spjd DB_DNODE_EXIT(db); 2601219089Spjd } 2602219089Spjd#endif 2603219089Spjd 2604168404Spjd if (db->db_level == 0) { 2605219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 2606168404Spjd ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 2607219089Spjd if (db->db_state != DB_NOFILL) { 2608219089Spjd if (dr->dt.dl.dr_data != db->db_buf) 2609219089Spjd VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 2610249643Smm db)); 2611219089Spjd else if (!arc_released(db->db_buf)) 2612219089Spjd arc_set_callback(db->db_buf, dbuf_do_evict, db); 2613219089Spjd } 2614168404Spjd } else { 2615219089Spjd dnode_t *dn; 2616168404Spjd 2617219089Spjd DB_DNODE_ENTER(db); 2618219089Spjd dn = DB_DNODE(db); 2619168404Spjd ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2620263398Sdelphij ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); 2621168404Spjd if (!BP_IS_HOLE(db->db_blkptr)) { 2622168404Spjd int epbs = 2623168404Spjd dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2624263398Sdelphij ASSERT3U(db->db_blkid, <=, 2625263398Sdelphij dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); 2626168404Spjd ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 2627168404Spjd db->db.db_size); 2628168404Spjd arc_set_callback(db->db_buf, dbuf_do_evict, db); 2629168404Spjd } 2630219089Spjd DB_DNODE_EXIT(db); 2631185029Spjd mutex_destroy(&dr->dt.di.dr_mtx); 2632169325Spjd list_destroy(&dr->dt.di.dr_children); 2633168404Spjd } 2634168404Spjd kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2635168404Spjd 2636168404Spjd cv_broadcast(&db->db_changed); 2637168404Spjd ASSERT(db->db_dirtycnt > 0); 2638168404Spjd db->db_dirtycnt -= 1; 2639168404Spjd db->db_data_pending = NULL; 2640263398Sdelphij dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); 2641219089Spjd} 2642219089Spjd 2643219089Spjdstatic void 2644219089Spjddbuf_write_nofill_ready(zio_t *zio) 2645219089Spjd{ 2646219089Spjd 
dbuf_write_ready(zio, NULL, zio->io_private); 2647219089Spjd} 2648219089Spjd 2649219089Spjdstatic void 2650219089Spjddbuf_write_nofill_done(zio_t *zio) 2651219089Spjd{ 2652219089Spjd dbuf_write_done(zio, NULL, zio->io_private); 2653219089Spjd} 2654219089Spjd 2655219089Spjdstatic void 2656219089Spjddbuf_write_override_ready(zio_t *zio) 2657219089Spjd{ 2658219089Spjd dbuf_dirty_record_t *dr = zio->io_private; 2659219089Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2660219089Spjd 2661219089Spjd dbuf_write_ready(zio, NULL, db); 2662219089Spjd} 2663219089Spjd 2664219089Spjdstatic void 2665219089Spjddbuf_write_override_done(zio_t *zio) 2666219089Spjd{ 2667219089Spjd dbuf_dirty_record_t *dr = zio->io_private; 2668219089Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2669219089Spjd blkptr_t *obp = &dr->dt.dl.dr_overridden_by; 2670219089Spjd 2671219089Spjd mutex_enter(&db->db_mtx); 2672219089Spjd if (!BP_EQUAL(zio->io_bp, obp)) { 2673219089Spjd if (!BP_IS_HOLE(obp)) 2674219089Spjd dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 2675219089Spjd arc_release(dr->dt.dl.dr_data, db); 2676219089Spjd } 2677168404Spjd mutex_exit(&db->db_mtx); 2678168404Spjd 2679219089Spjd dbuf_write_done(zio, NULL, db); 2680219089Spjd} 2681168404Spjd 2682252749Sdelphij/* Issue I/O to commit a dirty buffer to disk. 
*/ 2683219089Spjdstatic void 2684219089Spjddbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) 2685219089Spjd{ 2686219089Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2687219089Spjd dnode_t *dn; 2688219089Spjd objset_t *os; 2689219089Spjd dmu_buf_impl_t *parent = db->db_parent; 2690219089Spjd uint64_t txg = tx->tx_txg; 2691219089Spjd zbookmark_t zb; 2692219089Spjd zio_prop_t zp; 2693219089Spjd zio_t *zio; 2694219089Spjd int wp_flag = 0; 2695219089Spjd 2696219089Spjd DB_DNODE_ENTER(db); 2697219089Spjd dn = DB_DNODE(db); 2698219089Spjd os = dn->dn_objset; 2699219089Spjd 2700219089Spjd if (db->db_state != DB_NOFILL) { 2701219089Spjd if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { 2702219089Spjd /* 2703219089Spjd * Private object buffers are released here rather 2704219089Spjd * than in dbuf_dirty() since they are only modified 2705219089Spjd * in the syncing context and we don't want the 2706219089Spjd * overhead of making multiple copies of the data. 2707219089Spjd */ 2708219089Spjd if (BP_IS_HOLE(db->db_blkptr)) { 2709219089Spjd arc_buf_thaw(data); 2710219089Spjd } else { 2711219089Spjd dbuf_release_bp(db); 2712219089Spjd } 2713219089Spjd } 2714219089Spjd } 2715219089Spjd 2716219089Spjd if (parent != dn->dn_dbuf) { 2717252749Sdelphij /* Our parent is an indirect block. */ 2718252749Sdelphij /* We have a dirty parent that has been scheduled for write. */ 2719219089Spjd ASSERT(parent && parent->db_data_pending); 2720252749Sdelphij /* Our parent's buffer is one level closer to the dnode. */ 2721219089Spjd ASSERT(db->db_level == parent->db_level-1); 2722252749Sdelphij /* 2723252749Sdelphij * We're about to modify our parent's db_data by modifying 2724252749Sdelphij * our block pointer, so the parent must be released. 2725252749Sdelphij */ 2726219089Spjd ASSERT(arc_released(parent->db_buf)); 2727219089Spjd zio = parent->db_data_pending->dr_zio; 2728219089Spjd } else { 2729252749Sdelphij /* Our parent is the dnode itself. 
*/ 2730219089Spjd ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && 2731219089Spjd db->db_blkid != DMU_SPILL_BLKID) || 2732219089Spjd (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); 2733219089Spjd if (db->db_blkid != DMU_SPILL_BLKID) 2734219089Spjd ASSERT3P(db->db_blkptr, ==, 2735219089Spjd &dn->dn_phys->dn_blkptr[db->db_blkid]); 2736219089Spjd zio = dn->dn_zio; 2737219089Spjd } 2738219089Spjd 2739219089Spjd ASSERT(db->db_level == 0 || data == db->db_buf); 2740219089Spjd ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 2741219089Spjd ASSERT(zio); 2742219089Spjd 2743219089Spjd SET_BOOKMARK(&zb, os->os_dsl_dataset ? 2744219089Spjd os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 2745219089Spjd db->db.db_object, db->db_level, db->db_blkid); 2746219089Spjd 2747219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) 2748219089Spjd wp_flag = WP_SPILL; 2749219089Spjd wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; 2750219089Spjd 2751219089Spjd dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); 2752219089Spjd DB_DNODE_EXIT(db); 2753219089Spjd 2754219089Spjd if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 2755219089Spjd ASSERT(db->db_state != DB_NOFILL); 2756219089Spjd dr->dr_zio = zio_write(zio, os->os_spa, txg, 2757219089Spjd db->db_blkptr, data->b_data, arc_buf_size(data), &zp, 2758260764Savg dbuf_write_override_ready, NULL, dbuf_write_override_done, 2759260764Savg dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2760219089Spjd mutex_enter(&db->db_mtx); 2761219089Spjd dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 2762219089Spjd zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 2763244087Smm dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); 2764219089Spjd mutex_exit(&db->db_mtx); 2765219089Spjd } else if (db->db_state == DB_NOFILL) { 2766262089Savg ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || 2767262089Savg zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); 2768219089Spjd dr->dr_zio = zio_write(zio, os->os_spa, txg, 
2769219089Spjd db->db_blkptr, NULL, db->db.db_size, &zp, 2770260764Savg dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, 2771219089Spjd ZIO_PRIORITY_ASYNC_WRITE, 2772219089Spjd ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); 2773219089Spjd } else { 2774219089Spjd ASSERT(arc_released(data)); 2775219089Spjd dr->dr_zio = arc_write(zio, os->os_spa, txg, 2776252140Sdelphij db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), 2777252140Sdelphij DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, 2778260764Savg dbuf_write_physdone, dbuf_write_done, db, 2779260764Savg ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2780219089Spjd } 2781168404Spjd} 2782