1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23219636Spjd * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24265740Sdelphij * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 25251478Sdelphij * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 26255750Sdelphij * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
27168404Spjd */ 28168404Spjd 29168404Spjd#include <sys/zfs_context.h> 30168404Spjd#include <sys/dmu.h> 31253821Sdelphij#include <sys/dmu_send.h> 32168404Spjd#include <sys/dmu_impl.h> 33168404Spjd#include <sys/dbuf.h> 34168404Spjd#include <sys/dmu_objset.h> 35168404Spjd#include <sys/dsl_dataset.h> 36168404Spjd#include <sys/dsl_dir.h> 37168404Spjd#include <sys/dmu_tx.h> 38168404Spjd#include <sys/spa.h> 39168404Spjd#include <sys/zio.h> 40168404Spjd#include <sys/dmu_zfetch.h> 41219089Spjd#include <sys/sa.h> 42219089Spjd#include <sys/sa_impl.h> 43268649Sdelphij#include <sys/zfeature.h> 44268649Sdelphij#include <sys/blkptr.h> 45265740Sdelphij#include <sys/range_tree.h> 46168404Spjd 47254753Sdelphij/* 48254753Sdelphij * Number of times that dbuf_free_range() took the slow path while doing 49254753Sdelphij * a zfs receive. A nonzero value indicates a potential performance problem. 50254753Sdelphij */ 51254753Sdelphijuint64_t zfs_free_range_recv_miss; 52254753Sdelphij 53168404Spjdstatic void dbuf_destroy(dmu_buf_impl_t *db); 54248571Smmstatic boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 55185029Spjdstatic void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 56168404Spjd 57168404Spjd/* 58168404Spjd * Global data structures and functions for the dbuf cache. 
59168404Spjd */ 60168404Spjdstatic kmem_cache_t *dbuf_cache; 61168404Spjd 62168404Spjd/* ARGSUSED */ 63168404Spjdstatic int 64168404Spjddbuf_cons(void *vdb, void *unused, int kmflag) 65168404Spjd{ 66168404Spjd dmu_buf_impl_t *db = vdb; 67168404Spjd bzero(db, sizeof (dmu_buf_impl_t)); 68168404Spjd 69168404Spjd mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 70168404Spjd cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 71168404Spjd refcount_create(&db->db_holds); 72269845Sdelphij 73168404Spjd return (0); 74168404Spjd} 75168404Spjd 76168404Spjd/* ARGSUSED */ 77168404Spjdstatic void 78168404Spjddbuf_dest(void *vdb, void *unused) 79168404Spjd{ 80168404Spjd dmu_buf_impl_t *db = vdb; 81168404Spjd mutex_destroy(&db->db_mtx); 82168404Spjd cv_destroy(&db->db_changed); 83168404Spjd refcount_destroy(&db->db_holds); 84168404Spjd} 85168404Spjd 86168404Spjd/* 87168404Spjd * dbuf hash table routines 88168404Spjd */ 89168404Spjdstatic dbuf_hash_table_t dbuf_hash_table; 90168404Spjd 91168404Spjdstatic uint64_t dbuf_hash_count; 92168404Spjd 93168404Spjdstatic uint64_t 94168404Spjddbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 95168404Spjd{ 96168404Spjd uintptr_t osv = (uintptr_t)os; 97168404Spjd uint64_t crc = -1ULL; 98168404Spjd 99168404Spjd ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 100168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 101168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 102168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 103168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 104168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 105168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 106168404Spjd 107168404Spjd crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 108168404Spjd 109168404Spjd return (crc); 110168404Spjd} 111168404Spjd 112168404Spjd#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, 
blkid); 113168404Spjd 114168404Spjd#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 115168404Spjd ((dbuf)->db.db_object == (obj) && \ 116168404Spjd (dbuf)->db_objset == (os) && \ 117168404Spjd (dbuf)->db_level == (level) && \ 118168404Spjd (dbuf)->db_blkid == (blkid)) 119168404Spjd 120168404Spjddmu_buf_impl_t * 121168404Spjddbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 122168404Spjd{ 123168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 124219089Spjd objset_t *os = dn->dn_objset; 125168404Spjd uint64_t obj = dn->dn_object; 126168404Spjd uint64_t hv = DBUF_HASH(os, obj, level, blkid); 127168404Spjd uint64_t idx = hv & h->hash_table_mask; 128168404Spjd dmu_buf_impl_t *db; 129168404Spjd 130168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 131168404Spjd for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 132168404Spjd if (DBUF_EQUAL(db, os, obj, level, blkid)) { 133168404Spjd mutex_enter(&db->db_mtx); 134168404Spjd if (db->db_state != DB_EVICTING) { 135168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 136168404Spjd return (db); 137168404Spjd } 138168404Spjd mutex_exit(&db->db_mtx); 139168404Spjd } 140168404Spjd } 141168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 142168404Spjd return (NULL); 143168404Spjd} 144168404Spjd 145168404Spjd/* 146168404Spjd * Insert an entry into the hash table. If there is already an element 147168404Spjd * equal to elem in the hash table, then the already existing element 148168404Spjd * will be returned and the new element will not be inserted. 149168404Spjd * Otherwise returns NULL. 
150168404Spjd */ 151168404Spjdstatic dmu_buf_impl_t * 152168404Spjddbuf_hash_insert(dmu_buf_impl_t *db) 153168404Spjd{ 154168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 155219089Spjd objset_t *os = db->db_objset; 156168404Spjd uint64_t obj = db->db.db_object; 157168404Spjd int level = db->db_level; 158168404Spjd uint64_t blkid = db->db_blkid; 159168404Spjd uint64_t hv = DBUF_HASH(os, obj, level, blkid); 160168404Spjd uint64_t idx = hv & h->hash_table_mask; 161168404Spjd dmu_buf_impl_t *dbf; 162168404Spjd 163168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 164168404Spjd for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 165168404Spjd if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 166168404Spjd mutex_enter(&dbf->db_mtx); 167168404Spjd if (dbf->db_state != DB_EVICTING) { 168168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 169168404Spjd return (dbf); 170168404Spjd } 171168404Spjd mutex_exit(&dbf->db_mtx); 172168404Spjd } 173168404Spjd } 174168404Spjd 175168404Spjd mutex_enter(&db->db_mtx); 176168404Spjd db->db_hash_next = h->hash_table[idx]; 177168404Spjd h->hash_table[idx] = db; 178168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 179271001Sdelphij atomic_inc_64(&dbuf_hash_count); 180168404Spjd 181168404Spjd return (NULL); 182168404Spjd} 183168404Spjd 184168404Spjd/* 185269417Sdelphij * Remove an entry from the hash table. It must be in the EVICTING state. 186168404Spjd */ 187168404Spjdstatic void 188168404Spjddbuf_hash_remove(dmu_buf_impl_t *db) 189168404Spjd{ 190168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 191168404Spjd uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 192168404Spjd db->db_level, db->db_blkid); 193168404Spjd uint64_t idx = hv & h->hash_table_mask; 194168404Spjd dmu_buf_impl_t *dbf, **dbp; 195168404Spjd 196168404Spjd /* 197269417Sdelphij * We musn't hold db_mtx to maintain lock ordering: 198168404Spjd * DBUF_HASH_MUTEX > db_mtx. 
199168404Spjd */ 200168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 201168404Spjd ASSERT(db->db_state == DB_EVICTING); 202168404Spjd ASSERT(!MUTEX_HELD(&db->db_mtx)); 203168404Spjd 204168404Spjd mutex_enter(DBUF_HASH_MUTEX(h, idx)); 205168404Spjd dbp = &h->hash_table[idx]; 206168404Spjd while ((dbf = *dbp) != db) { 207168404Spjd dbp = &dbf->db_hash_next; 208168404Spjd ASSERT(dbf != NULL); 209168404Spjd } 210168404Spjd *dbp = db->db_hash_next; 211168404Spjd db->db_hash_next = NULL; 212168404Spjd mutex_exit(DBUF_HASH_MUTEX(h, idx)); 213271001Sdelphij atomic_dec_64(&dbuf_hash_count); 214168404Spjd} 215168404Spjd 216168404Spjdstatic arc_evict_func_t dbuf_do_evict; 217168404Spjd 218168404Spjdstatic void 219168404Spjddbuf_evict_user(dmu_buf_impl_t *db) 220168404Spjd{ 221168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 222168404Spjd 223168404Spjd if (db->db_level != 0 || db->db_evict_func == NULL) 224168404Spjd return; 225168404Spjd 226168404Spjd if (db->db_user_data_ptr_ptr) 227168404Spjd *db->db_user_data_ptr_ptr = db->db.db_data; 228168404Spjd db->db_evict_func(&db->db, db->db_user_ptr); 229168404Spjd db->db_user_ptr = NULL; 230168404Spjd db->db_user_data_ptr_ptr = NULL; 231168404Spjd db->db_evict_func = NULL; 232168404Spjd} 233168404Spjd 234219089Spjdboolean_t 235219089Spjddbuf_is_metadata(dmu_buf_impl_t *db) 236219089Spjd{ 237219089Spjd if (db->db_level > 0) { 238219089Spjd return (B_TRUE); 239219089Spjd } else { 240219089Spjd boolean_t is_metadata; 241219089Spjd 242219089Spjd DB_DNODE_ENTER(db); 243236884Smm is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); 244219089Spjd DB_DNODE_EXIT(db); 245219089Spjd 246219089Spjd return (is_metadata); 247219089Spjd } 248219089Spjd} 249219089Spjd 250168404Spjdvoid 251168404Spjddbuf_evict(dmu_buf_impl_t *db) 252168404Spjd{ 253168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 254168404Spjd ASSERT(db->db_buf == NULL); 255168404Spjd ASSERT(db->db_data_pending == NULL); 256168404Spjd 257168404Spjd dbuf_clear(db); 258168404Spjd 
dbuf_destroy(db); 259168404Spjd} 260168404Spjd 261168404Spjdvoid 262168404Spjddbuf_init(void) 263168404Spjd{ 264168404Spjd uint64_t hsize = 1ULL << 16; 265168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 266168404Spjd int i; 267168404Spjd 268168404Spjd /* 269168404Spjd * The hash table is big enough to fill all of physical memory 270168404Spjd * with an average 4K block size. The table will take up 271168404Spjd * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 272168404Spjd */ 273168696Spjd while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) 274168404Spjd hsize <<= 1; 275168404Spjd 276168404Spjdretry: 277168404Spjd h->hash_table_mask = hsize - 1; 278168404Spjd h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 279168404Spjd if (h->hash_table == NULL) { 280168404Spjd /* XXX - we should really return an error instead of assert */ 281168404Spjd ASSERT(hsize > (1ULL << 10)); 282168404Spjd hsize >>= 1; 283168404Spjd goto retry; 284168404Spjd } 285168404Spjd 286168404Spjd dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 287168404Spjd sizeof (dmu_buf_impl_t), 288168404Spjd 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 289168404Spjd 290168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 291168404Spjd mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 292168404Spjd} 293168404Spjd 294168404Spjdvoid 295168404Spjddbuf_fini(void) 296168404Spjd{ 297168404Spjd dbuf_hash_table_t *h = &dbuf_hash_table; 298168404Spjd int i; 299168404Spjd 300168404Spjd for (i = 0; i < DBUF_MUTEXES; i++) 301168404Spjd mutex_destroy(&h->hash_mutexes[i]); 302168404Spjd kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 303168404Spjd kmem_cache_destroy(dbuf_cache); 304168404Spjd} 305168404Spjd 306168404Spjd/* 307168404Spjd * Other stuff. 
 */

#ifdef ZFS_DEBUG
/*
 * Debug-only consistency checks on a dbuf (compiled under ZFS_DEBUG).
 * Returns immediately unless ZFS_DEBUG_DBUF_VERIFY is set in zfs_flags.
 * Every check is an ASSERT; nothing is modified.  Caller holds db_mtx.
 */
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !avl_is_empty(&dn->dn_dbufs));
	}
	/* db_offset must agree with the kind of block this is */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	/* every dirty record reachable from this dbuf must point back at it */
	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/*
	 * verify db->db_blkptr
	 * NOTE(review): dn is dereferenced below without the NULL guard
	 * used above -- presumably DB_DNODE() never yields NULL here; confirm.
	 */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			/* scan the buffer as 64-bit words (db_size / 8) */
			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

/*
 * For a level-0 dbuf that has a registered user data pointer location
 * (db_user_data_ptr_ptr), refresh that location to the dbuf's current
 * db_data.  Caller holds db_mtx.
 */
static void
dbuf_update_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
		ASSERT(!refcount_is_zero(&db->db_holds));
		*db->db_user_data_ptr_ptr = db->db.db_data;
	}
}

/*
 * Attach an ARC buffer to the dbuf, or detach the current one when buf
 * is NULL.  Caller holds db_mtx.
 *
 * buf != NULL: db_data is pointed at buf->b_data, dbuf_do_evict is
 * registered as the buffer's callback unless the buffer has been
 * released, and the user data pointer is refreshed.
 *
 * buf == NULL: the user eviction callback is run, db_data is cleared,
 * and the state becomes DB_UNCACHED unless it is DB_NOFILL.
 */
static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	db->db_buf = buf;
	if (buf != NULL) {
		ASSERT(buf->b_data != NULL);
		db->db.db_data = buf->b_data;
		if (!arc_released(buf))
			arc_set_callback(buf, dbuf_do_evict, db);
		dbuf_update_data(db);
	} else {
		dbuf_evict_user(db);
		db->db.db_data = NULL;
		if (db->db_state != DB_NOFILL)
			db->db_state = DB_UNCACHED;
	}
}

/*
 * Loan out an arc_buf for read.  Return the loaned arc_buf.
 *
 * If the dbuf's buffer has been released or the dbuf has more than one
 * hold, a new loaned buffer of db_size bytes is allocated and the dbuf's
 * data is copied into it.  Otherwise the dbuf's own buffer is loaned and
 * the dbuf is left without data (dbuf_set_data(db, NULL)).
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		int blksz = db->db.db_size;
		spa_t *spa = db->db_objset->os_spa;

		/* note: db_mtx is dropped before the copy below */
		mutex_exit(&db->db_mtx);
		abuf = arc_loan_buf(spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		dbuf_set_data(db, NULL);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}

/*
 * Return the id of the data block containing the given byte offset:
 * offset >> dn_datablkshift when the shift is nonzero, otherwise 0
 * (after asserting that offset lies within dn_datablksz).
 */
uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
	if (dn->dn_datablkshift) {
		return (offset >> dn->dn_datablkshift);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}

/*
 * Completion callback handed to arc_read() by dbuf_read_impl().
 * Moves the dbuf out of DB_READ: to DB_CACHED with buf attached on
 * success (or with a zeroed buf if the block was freed while the read
 * was in flight), or back to DB_UNCACHED on I/O error.  Wakes all
 * db_changed waiters and finishes with dbuf_rele_and_unlock()
 * (presumably dropping the hold taken by dbuf_add_ref() in
 * dbuf_read_impl() -- confirm).
 */
static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL);
}

/*
 * Fill a DB_UNCACHED dbuf.  Entered with db_mtx held (and the dnode's
 * struct_rwlock held); db_mtx is dropped on every path before return.
 *
 *  - Bonus buffer: copied from the dnode's on-disk bonus area; state
 *    becomes DB_CACHED.
 *  - No block pointer, a hole, or a freed level-0 block: a zero-filled
 *    buffer is attached; state becomes DB_CACHED and DB_RF_CACHED is
 *    set in *flags.
 *  - Otherwise: state becomes DB_READ and an arc_read() is issued under
 *    zio with dbuf_read_done() as its callback; DB_RF_CACHED is set in
 *    *flags if the ARC reports ARC_CACHED.
 */
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn;
	zbookmark_phys_t zb;
	uint32_t aflags = ARC_NOWAIT;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		/* zero the whole buffer when the bonus data won't fill it */
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		DB_DNODE_EXIT(db);
		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_L2CACHE;
	if (DBUF_IS_L2COMPRESSIBLE(db))
		aflags |= ARC_L2COMPRESS;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	/* extra reference for the in-flight read */
	dbuf_add_ref(db, NULL);

	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}

/*
 * Make the dbuf's data available, reading it if necessary.  The caller
 * must already have a hold on db.
 *
 * zio   - optional parent zio.  When NULL and a read has to be issued,
 *         a root zio is created here and waited on before returning;
 *         when non-NULL the read is issued under it and not waited for.
 * flags - DB_RF_* flags.  DB_RF_HAVESTRUCT: the caller already holds the
 *         dnode's struct_rwlock, so it is not taken here.
 *         DB_RF_NOPREFETCH: do not call dmu_zfetch().  DB_RF_NEVERWAIT:
 *         do not wait for a read or fill already in progress.
 *         DB_RF_CANFAIL is consumed by dbuf_read_impl().
 *
 * Returns 0 on success; EIO if the dbuf is DB_NOFILL or if, after
 * waiting for another thread's read or fill, it ended up DB_UNCACHED;
 * or the error from zio_wait() for a read issued here.
 */
int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	boolean_t havepzio = (zio != NULL);
	boolean_t prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (SET_ERROR(EIO));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	/* prefetch only cacheable level-0 data blocks, and only if allowed */
	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* only wait on a root zio that was created above */
		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/*
		 * Another reader came in while the dbuf was in flight
		 * between UNCACHED and CACHED.  Either a writer will finish
		 * writing the buffer (sending the dbuf to CACHED) or the
		 * first reader's request will reach the read_done callback
		 * and send the dbuf to CACHED.  Otherwise, a failure
		 * occurred and the dbuf went to UNCACHED.
		 */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* Skip the wait per the caller's request. */
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

/*
 * Get a non-bonus dbuf ready for use without reading its contents.
 * Waits for any read or fill in progress to finish, then:
 *  - DB_UNCACHED: attaches a newly allocated buffer and enters DB_FILL;
 *  - DB_NOFILL:   detaches any data (dbuf_set_data(db, NULL));
 *  - otherwise the dbuf must already be DB_CACHED.
 * The caller must have a hold on db.
 */
static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_set_data(db, NULL);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function.  It makes a copy of
 * buffers, that have been modified in a previous transaction
 * group, before we modify them in the current active group.
714168404Spjd * 715168404Spjd * This function is used in two places: when we are dirtying a 716168404Spjd * buffer for the first time in a txg, and when we are freeing 717168404Spjd * a range in a dnode that includes this buffer. 718168404Spjd * 719168404Spjd * Note that when we are called from dbuf_free_range() we do 720168404Spjd * not put a hold on the buffer, we just traverse the active 721168404Spjd * dbuf list for the dnode. 722168404Spjd */ 723168404Spjdstatic void 724168404Spjddbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 725168404Spjd{ 726168404Spjd dbuf_dirty_record_t *dr = db->db_last_dirty; 727168404Spjd 728168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 729168404Spjd ASSERT(db->db.db_data != NULL); 730168404Spjd ASSERT(db->db_level == 0); 731168404Spjd ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); 732168404Spjd 733168404Spjd if (dr == NULL || 734168404Spjd (dr->dt.dl.dr_data != 735219089Spjd ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) 736168404Spjd return; 737168404Spjd 738168404Spjd /* 739168404Spjd * If the last dirty record for this dbuf has not yet synced 740168404Spjd * and its referencing the dbuf data, either: 741219089Spjd * reset the reference to point to a new copy, 742168404Spjd * or (if there a no active holders) 743168404Spjd * just null out the current db_data pointer. 
744168404Spjd */ 745168404Spjd ASSERT(dr->dr_txg >= txg - 2); 746219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 747168404Spjd /* Note that the data bufs here are zio_bufs */ 748168404Spjd dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); 749208373Smm arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 750168404Spjd bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); 751168404Spjd } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 752168404Spjd int size = db->db.db_size; 753168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 754263397Sdelphij spa_t *spa = db->db_objset->os_spa; 755219089Spjd 756219089Spjd dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); 757168404Spjd bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); 758168404Spjd } else { 759168404Spjd dbuf_set_data(db, NULL); 760168404Spjd } 761168404Spjd} 762168404Spjd 763168404Spjdvoid 764168404Spjddbuf_unoverride(dbuf_dirty_record_t *dr) 765168404Spjd{ 766168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 767219089Spjd blkptr_t *bp = &dr->dt.dl.dr_overridden_by; 768168404Spjd uint64_t txg = dr->dr_txg; 769168404Spjd 770168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 771168404Spjd ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); 772168404Spjd ASSERT(db->db_level == 0); 773168404Spjd 774219089Spjd if (db->db_blkid == DMU_BONUS_BLKID || 775168404Spjd dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) 776168404Spjd return; 777168404Spjd 778219089Spjd ASSERT(db->db_data_pending != dr); 779219089Spjd 780168404Spjd /* free this block */ 781263397Sdelphij if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) 782263397Sdelphij zio_free(db->db_objset->os_spa, txg, bp); 783219089Spjd 784168404Spjd dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 785243524Smm dr->dt.dl.dr_nopwrite = B_FALSE; 786243524Smm 787168404Spjd /* 788168404Spjd * Release the already-written buffer, so we leave it in 789168404Spjd * a consistent dirty state. 
Note that all callers are 790168404Spjd * modifying the buffer, so they will immediately do 791168404Spjd * another (redundant) arc_release(). Therefore, leave 792168404Spjd * the buf thawed to save the effort of freezing & 793168404Spjd * immediately re-thawing it. 794168404Spjd */ 795168404Spjd arc_release(dr->dt.dl.dr_data, db); 796168404Spjd} 797168404Spjd 798185029Spjd/* 799185029Spjd * Evict (if it's unreferenced) or clear (if it's referenced) any level-0 800185029Spjd * data blocks in the free range, so that any future readers will find 801263397Sdelphij * empty blocks. 802253821Sdelphij * 803253821Sdelphij * During a receive the range is still searched unless the early return 804253821Sdelphij * applies; that slow path is counted in zfs_free_range_recv_miss (see comment below). 805185029Spjd */ 806168404Spjdvoid 807269845Sdelphijdbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, 808269845Sdelphij dmu_tx_t *tx) 809168404Spjd{ 810269845Sdelphij dmu_buf_impl_t *db, *db_next, db_search; 811168404Spjd uint64_t txg = tx->tx_txg; 812269845Sdelphij avl_index_t where; 813168404Spjd 814269845Sdelphij if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID)) 815269845Sdelphij end_blkid = dn->dn_maxblkid; 816269845Sdelphij dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); 817253821Sdelphij 818269845Sdelphij db_search.db_level = 0; 819269845Sdelphij db_search.db_blkid = start_blkid; 820270809Sdelphij db_search.db_state = DB_SEARCH; 821269845Sdelphij 822254753Sdelphij mutex_enter(&dn->dn_dbufs_mtx); 823269845Sdelphij if (start_blkid >= dn->dn_unlisted_l0_blkid) { 824254753Sdelphij /* There can't be any dbufs in this range; no need to search. 
*/ 825269845Sdelphij#ifdef DEBUG 826269845Sdelphij db = avl_find(&dn->dn_dbufs, &db_search, &where); 827269845Sdelphij ASSERT3P(db, ==, NULL); 828269845Sdelphij db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); 829269845Sdelphij ASSERT(db == NULL || db->db_level > 0); 830269845Sdelphij#endif 831254753Sdelphij mutex_exit(&dn->dn_dbufs_mtx); 832254753Sdelphij return; 833254753Sdelphij } else if (dmu_objset_is_receiving(dn->dn_objset)) { 834253821Sdelphij /* 835254753Sdelphij * If we are receiving, we expect there to be no dbufs in 836254753Sdelphij * the range to be freed, because receive modifies each 837254753Sdelphij * block at most once, and in offset order. If this is 838254753Sdelphij * not the case, it can lead to performance problems, 839254753Sdelphij * so note that we unexpectedly took the slow path. 840253821Sdelphij */ 841254753Sdelphij atomic_inc_64(&zfs_free_range_recv_miss); 842253821Sdelphij } 843253821Sdelphij 844269845Sdelphij db = avl_find(&dn->dn_dbufs, &db_search, &where); 845269845Sdelphij ASSERT3P(db, ==, NULL); 846269845Sdelphij db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); 847269845Sdelphij 848269845Sdelphij for (; db != NULL; db = db_next) { 849269845Sdelphij db_next = AVL_NEXT(&dn->dn_dbufs, db); 850219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 851185029Spjd 852269845Sdelphij if (db->db_level != 0 || db->db_blkid > end_blkid) { 853269845Sdelphij break; 854269845Sdelphij } 855269845Sdelphij ASSERT3U(db->db_blkid, >=, start_blkid); 856168404Spjd 857168404Spjd /* found a level 0 buffer in the range */ 858248571Smm mutex_enter(&db->db_mtx); 859248571Smm if (dbuf_undirty(db, tx)) { 860248571Smm /* mutex has been dropped and dbuf destroyed */ 861168404Spjd continue; 862248571Smm } 863168404Spjd 864168404Spjd if (db->db_state == DB_UNCACHED || 865219089Spjd db->db_state == DB_NOFILL || 866168404Spjd db->db_state == DB_EVICTING) { 867168404Spjd ASSERT(db->db.db_data == NULL); 868168404Spjd mutex_exit(&db->db_mtx); 869168404Spjd 
continue; 870168404Spjd } 871168404Spjd if (db->db_state == DB_READ || db->db_state == DB_FILL) { 872168404Spjd /* will be handled in dbuf_read_done or dbuf_rele */ 873168404Spjd db->db_freed_in_flight = TRUE; 874168404Spjd mutex_exit(&db->db_mtx); 875168404Spjd continue; 876168404Spjd } 877168404Spjd if (refcount_count(&db->db_holds) == 0) { 878168404Spjd ASSERT(db->db_buf); 879168404Spjd dbuf_clear(db); 880168404Spjd continue; 881168404Spjd } 882168404Spjd /* The dbuf is referenced */ 883168404Spjd 884168404Spjd if (db->db_last_dirty != NULL) { 885168404Spjd dbuf_dirty_record_t *dr = db->db_last_dirty; 886168404Spjd 887168404Spjd if (dr->dr_txg == txg) { 888168404Spjd /* 889168404Spjd * This buffer is "in-use", re-adjust the file 890168404Spjd * size to reflect that this buffer may 891168404Spjd * contain new data when we sync. 892168404Spjd */ 893219089Spjd if (db->db_blkid != DMU_SPILL_BLKID && 894219089Spjd db->db_blkid > dn->dn_maxblkid) 895168404Spjd dn->dn_maxblkid = db->db_blkid; 896168404Spjd dbuf_unoverride(dr); 897168404Spjd } else { 898168404Spjd /* 899168404Spjd * This dbuf is not dirty in the open context. 900168404Spjd * Either uncache it (if its not referenced in 901168404Spjd * the open context) or reset its contents to 902168404Spjd * empty. 
903168404Spjd */ 904168404Spjd dbuf_fix_old_data(db, txg); 905168404Spjd } 906168404Spjd } 907168404Spjd /* clear the contents if its cached */ 908168404Spjd if (db->db_state == DB_CACHED) { 909168404Spjd ASSERT(db->db.db_data != NULL); 910168404Spjd arc_release(db->db_buf, db); 911168404Spjd bzero(db->db.db_data, db->db.db_size); 912168404Spjd arc_buf_freeze(db->db_buf); 913168404Spjd } 914168404Spjd 915168404Spjd mutex_exit(&db->db_mtx); 916168404Spjd } 917168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 918168404Spjd} 919168404Spjd 920168404Spjdstatic int 921185029Spjddbuf_block_freeable(dmu_buf_impl_t *db) 922168404Spjd{ 923168404Spjd dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; 924168404Spjd uint64_t birth_txg = 0; 925168404Spjd 926168404Spjd /* 927168404Spjd * We don't need any locking to protect db_blkptr: 928168404Spjd * If it's syncing, then db_last_dirty will be set 929168404Spjd * so we'll ignore db_blkptr. 930263397Sdelphij * 931263397Sdelphij * This logic ensures that only block births for 932263397Sdelphij * filled blocks are considered. 933168404Spjd */ 934168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 935263397Sdelphij if (db->db_last_dirty && (db->db_blkptr == NULL || 936263397Sdelphij !BP_IS_HOLE(db->db_blkptr))) { 937168404Spjd birth_txg = db->db_last_dirty->dr_txg; 938263397Sdelphij } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { 939168404Spjd birth_txg = db->db_blkptr->blk_birth; 940263397Sdelphij } 941168404Spjd 942219089Spjd /* 943263397Sdelphij * If this block don't exist or is in a snapshot, it can't be freed. 944219089Spjd * Don't pass the bp to dsl_dataset_block_freeable() since we 945219089Spjd * are holding the db_mtx lock and might deadlock if we are 946219089Spjd * prefetching a dedup-ed block. 
947219089Spjd */ 948263397Sdelphij if (birth_txg != 0) 949185029Spjd return (ds == NULL || 950219089Spjd dsl_dataset_block_freeable(ds, NULL, birth_txg)); 951168404Spjd else 952263397Sdelphij return (B_FALSE); 953168404Spjd} 954168404Spjd 955168404Spjdvoid 956168404Spjddbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 957168404Spjd{ 958168404Spjd arc_buf_t *buf, *obuf; 959168404Spjd int osize = db->db.db_size; 960168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 961219089Spjd dnode_t *dn; 962168404Spjd 963219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 964168404Spjd 965219089Spjd DB_DNODE_ENTER(db); 966219089Spjd dn = DB_DNODE(db); 967219089Spjd 968168404Spjd /* XXX does *this* func really need the lock? */ 969219089Spjd ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 970168404Spjd 971168404Spjd /* 972263397Sdelphij * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held 973168404Spjd * is OK, because there can be no other references to the db 974168404Spjd * when we are changing its size, so no concurrent DB_FILL can 975168404Spjd * be happening. 
976168404Spjd */ 977168404Spjd /* 978168404Spjd * XXX we should be doing a dbuf_read, checking the return 979168404Spjd * value and returning that up to our callers 980168404Spjd */ 981263397Sdelphij dmu_buf_will_dirty(&db->db, tx); 982168404Spjd 983168404Spjd /* create the data buffer for the new block */ 984219089Spjd buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); 985168404Spjd 986168404Spjd /* copy old block data to the new block */ 987168404Spjd obuf = db->db_buf; 988168404Spjd bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); 989168404Spjd /* zero the remainder */ 990168404Spjd if (size > osize) 991168404Spjd bzero((uint8_t *)buf->b_data + osize, size - osize); 992168404Spjd 993168404Spjd mutex_enter(&db->db_mtx); 994168404Spjd dbuf_set_data(db, buf); 995248571Smm VERIFY(arc_buf_remove_ref(obuf, db)); 996168404Spjd db->db.db_size = size; 997168404Spjd 998168404Spjd if (db->db_level == 0) { 999168404Spjd ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 1000168404Spjd db->db_last_dirty->dt.dl.dr_data = buf; 1001168404Spjd } 1002168404Spjd mutex_exit(&db->db_mtx); 1003168404Spjd 1004219089Spjd dnode_willuse_space(dn, size-osize, tx); 1005219089Spjd DB_DNODE_EXIT(db); 1006168404Spjd} 1007168404Spjd 1008219089Spjdvoid 1009219089Spjddbuf_release_bp(dmu_buf_impl_t *db) 1010219089Spjd{ 1011263397Sdelphij objset_t *os = db->db_objset; 1012219089Spjd 1013219089Spjd ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); 1014219089Spjd ASSERT(arc_released(os->os_phys_buf) || 1015219089Spjd list_link_active(&os->os_dsl_dataset->ds_synced_link)); 1016219089Spjd ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); 1017219089Spjd 1018246666Smm (void) arc_release(db->db_buf, db); 1019219089Spjd} 1020219089Spjd 1021168404Spjddbuf_dirty_record_t * 1022168404Spjddbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1023168404Spjd{ 1024219089Spjd dnode_t *dn; 1025219089Spjd objset_t *os; 1026168404Spjd dbuf_dirty_record_t **drp, *dr; 1027168404Spjd int 
drop_struct_lock = FALSE; 1028185029Spjd boolean_t do_free_accounting = B_FALSE; 1029168404Spjd int txgoff = tx->tx_txg & TXG_MASK; 1030168404Spjd 1031168404Spjd ASSERT(tx->tx_txg != 0); 1032168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 1033168404Spjd DMU_TX_DIRTY_BUF(tx, db); 1034168404Spjd 1035219089Spjd DB_DNODE_ENTER(db); 1036219089Spjd dn = DB_DNODE(db); 1037168404Spjd /* 1038168404Spjd * Shouldn't dirty a regular buffer in syncing context. Private 1039168404Spjd * objects may be dirtied in syncing context, but only if they 1040168404Spjd * were already pre-dirtied in open context. 1041168404Spjd */ 1042168404Spjd ASSERT(!dmu_tx_is_syncing(tx) || 1043168404Spjd BP_IS_HOLE(dn->dn_objset->os_rootbp) || 1044209962Smm DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1045209962Smm dn->dn_objset->os_dsl_dataset == NULL); 1046168404Spjd /* 1047168404Spjd * We make this assert for private objects as well, but after we 1048168404Spjd * check if we're already dirty. They are allowed to re-dirty 1049168404Spjd * in syncing context. 1050168404Spjd */ 1051168404Spjd ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1052168404Spjd dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1053168404Spjd (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1054168404Spjd 1055168404Spjd mutex_enter(&db->db_mtx); 1056168404Spjd /* 1057168404Spjd * XXX make this true for indirects too? The problem is that 1058168404Spjd * transactions created with dmu_tx_create_assigned() from 1059168404Spjd * syncing context don't bother holding ahead. 1060168404Spjd */ 1061168404Spjd ASSERT(db->db_level != 0 || 1062219089Spjd db->db_state == DB_CACHED || db->db_state == DB_FILL || 1063219089Spjd db->db_state == DB_NOFILL); 1064168404Spjd 1065168404Spjd mutex_enter(&dn->dn_mtx); 1066168404Spjd /* 1067168404Spjd * Don't set dirtyctx to SYNC if we're just modifying this as we 1068168404Spjd * initialize the objset. 
1069168404Spjd */ 1070168404Spjd if (dn->dn_dirtyctx == DN_UNDIRTIED && 1071168404Spjd !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 1072168404Spjd dn->dn_dirtyctx = 1073168404Spjd (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); 1074168404Spjd ASSERT(dn->dn_dirtyctx_firstset == NULL); 1075168404Spjd dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 1076168404Spjd } 1077168404Spjd mutex_exit(&dn->dn_mtx); 1078168404Spjd 1079219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) 1080219089Spjd dn->dn_have_spill = B_TRUE; 1081219089Spjd 1082168404Spjd /* 1083168404Spjd * If this buffer is already dirty, we're done. 1084168404Spjd */ 1085168404Spjd drp = &db->db_last_dirty; 1086168404Spjd ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || 1087168404Spjd db->db.db_object == DMU_META_DNODE_OBJECT); 1088185029Spjd while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) 1089185029Spjd drp = &dr->dr_next; 1090185029Spjd if (dr && dr->dr_txg == tx->tx_txg) { 1091219089Spjd DB_DNODE_EXIT(db); 1092219089Spjd 1093219089Spjd if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { 1094168404Spjd /* 1095168404Spjd * If this buffer has already been written out, 1096168404Spjd * we now need to reset its state. 1097168404Spjd */ 1098185029Spjd dbuf_unoverride(dr); 1099219089Spjd if (db->db.db_object != DMU_META_DNODE_OBJECT && 1100219089Spjd db->db_state != DB_NOFILL) 1101168404Spjd arc_buf_thaw(db->db_buf); 1102168404Spjd } 1103168404Spjd mutex_exit(&db->db_mtx); 1104185029Spjd return (dr); 1105168404Spjd } 1106168404Spjd 1107168404Spjd /* 1108168404Spjd * Only valid if not already dirty. 1109168404Spjd */ 1110209962Smm ASSERT(dn->dn_object == 0 || 1111209962Smm dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1112168404Spjd (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1113168404Spjd 1114168404Spjd ASSERT3U(dn->dn_nlevels, >, db->db_level); 1115168404Spjd ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 1116168404Spjd dn->dn_phys->dn_nlevels > db->db_level || 1117168404Spjd dn->dn_next_nlevels[txgoff] > db->db_level || 1118168404Spjd dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 1119168404Spjd dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 1120168404Spjd 1121168404Spjd /* 1122168404Spjd * We should only be dirtying in syncing context if it's the 1123209962Smm * mos or we're initializing the os or it's a special object. 1124209962Smm * However, we are allowed to dirty in syncing context provided 1125209962Smm * we already dirtied it in open context. Hence we must make 1126209962Smm * this assertion only if we're not already dirty. 1127168404Spjd */ 1128219089Spjd os = dn->dn_objset; 1129209962Smm ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1130209962Smm os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); 1131168404Spjd ASSERT(db->db.db_size != 0); 1132168404Spjd 1133168404Spjd dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1134168404Spjd 1135219089Spjd if (db->db_blkid != DMU_BONUS_BLKID) { 1136185029Spjd /* 1137185029Spjd * Update the accounting. 1138185029Spjd * Note: we delay "free accounting" until after we drop 1139185029Spjd * the db_mtx. This keeps us from grabbing other locks 1140219089Spjd * (and possibly deadlocking) in bp_get_dsize() while 1141185029Spjd * also holding the db_mtx. 
1142185029Spjd */ 1143185029Spjd dnode_willuse_space(dn, db->db.db_size, tx); 1144185029Spjd do_free_accounting = dbuf_block_freeable(db); 1145185029Spjd } 1146185029Spjd 1147168404Spjd /* 1148168404Spjd * If this buffer is dirty in an old transaction group we need 1149168404Spjd * to make a copy of it so that the changes we make in this 1150168404Spjd * transaction group won't leak out when we sync the older txg. 1151168404Spjd */ 1152168404Spjd dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); 1153168404Spjd if (db->db_level == 0) { 1154168404Spjd void *data_old = db->db_buf; 1155168404Spjd 1156219089Spjd if (db->db_state != DB_NOFILL) { 1157219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 1158219089Spjd dbuf_fix_old_data(db, tx->tx_txg); 1159219089Spjd data_old = db->db.db_data; 1160219089Spjd } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1161219089Spjd /* 1162219089Spjd * Release the data buffer from the cache so 1163219089Spjd * that we can modify it without impacting 1164219089Spjd * possible other users of this cached data 1165219089Spjd * block. Note that indirect blocks and 1166219089Spjd * private objects are not released until the 1167219089Spjd * syncing state (since they are only modified 1168219089Spjd * then). 
1169219089Spjd */ 1170219089Spjd arc_release(db->db_buf, db); 1171219089Spjd dbuf_fix_old_data(db, tx->tx_txg); 1172219089Spjd data_old = db->db_buf; 1173219089Spjd } 1174219089Spjd ASSERT(data_old != NULL); 1175168404Spjd } 1176168404Spjd dr->dt.dl.dr_data = data_old; 1177168404Spjd } else { 1178168404Spjd mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); 1179168404Spjd list_create(&dr->dt.di.dr_children, 1180168404Spjd sizeof (dbuf_dirty_record_t), 1181168404Spjd offsetof(dbuf_dirty_record_t, dr_dirty_node)); 1182168404Spjd } 1183260763Savg if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) 1184260763Savg dr->dr_accounted = db->db.db_size; 1185168404Spjd dr->dr_dbuf = db; 1186168404Spjd dr->dr_txg = tx->tx_txg; 1187168404Spjd dr->dr_next = *drp; 1188168404Spjd *drp = dr; 1189168404Spjd 1190168404Spjd /* 1191168404Spjd * We could have been freed_in_flight between the dbuf_noread 1192168404Spjd * and dbuf_dirty. We win, as though the dbuf_noread() had 1193168404Spjd * happened after the free. 
1194168404Spjd */ 1195219089Spjd if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1196219089Spjd db->db_blkid != DMU_SPILL_BLKID) { 1197168404Spjd mutex_enter(&dn->dn_mtx); 1198265740Sdelphij if (dn->dn_free_ranges[txgoff] != NULL) { 1199265740Sdelphij range_tree_clear(dn->dn_free_ranges[txgoff], 1200265740Sdelphij db->db_blkid, 1); 1201265740Sdelphij } 1202168404Spjd mutex_exit(&dn->dn_mtx); 1203168404Spjd db->db_freed_in_flight = FALSE; 1204168404Spjd } 1205168404Spjd 1206168404Spjd /* 1207168404Spjd * This buffer is now part of this txg 1208168404Spjd */ 1209168404Spjd dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1210168404Spjd db->db_dirtycnt += 1; 1211168404Spjd ASSERT3U(db->db_dirtycnt, <=, 3); 1212168404Spjd 1213168404Spjd mutex_exit(&db->db_mtx); 1214168404Spjd 1215219089Spjd if (db->db_blkid == DMU_BONUS_BLKID || 1216219089Spjd db->db_blkid == DMU_SPILL_BLKID) { 1217168404Spjd mutex_enter(&dn->dn_mtx); 1218168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 1219168404Spjd list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1220168404Spjd mutex_exit(&dn->dn_mtx); 1221168404Spjd dnode_setdirty(dn, tx); 1222219089Spjd DB_DNODE_EXIT(db); 1223168404Spjd return (dr); 1224185029Spjd } else if (do_free_accounting) { 1225185029Spjd blkptr_t *bp = db->db_blkptr; 1226185029Spjd int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 1227219089Spjd bp_get_dsize(os->os_spa, bp) : db->db.db_size; 1228185029Spjd /* 1229185029Spjd * This is only a guess -- if the dbuf is dirty 1230185029Spjd * in a previous txg, we don't know how much 1231185029Spjd * space it will use on disk yet. We should 1232185029Spjd * really have the struct_rwlock to access 1233185029Spjd * db_blkptr, but since this is just a guess, 1234185029Spjd * it's OK if we get an odd answer. 
1235185029Spjd */ 1236219089Spjd ddt_prefetch(os->os_spa, bp); 1237185029Spjd dnode_willuse_space(dn, -willfree, tx); 1238168404Spjd } 1239168404Spjd 1240168404Spjd if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1241168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 1242168404Spjd drop_struct_lock = TRUE; 1243168404Spjd } 1244168404Spjd 1245185029Spjd if (db->db_level == 0) { 1246185029Spjd dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 1247185029Spjd ASSERT(dn->dn_maxblkid >= db->db_blkid); 1248185029Spjd } 1249185029Spjd 1250168404Spjd if (db->db_level+1 < dn->dn_nlevels) { 1251168404Spjd dmu_buf_impl_t *parent = db->db_parent; 1252168404Spjd dbuf_dirty_record_t *di; 1253168404Spjd int parent_held = FALSE; 1254168404Spjd 1255168404Spjd if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { 1256168404Spjd int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1257168404Spjd 1258168404Spjd parent = dbuf_hold_level(dn, db->db_level+1, 1259168404Spjd db->db_blkid >> epbs, FTAG); 1260219089Spjd ASSERT(parent != NULL); 1261168404Spjd parent_held = TRUE; 1262168404Spjd } 1263168404Spjd if (drop_struct_lock) 1264168404Spjd rw_exit(&dn->dn_struct_rwlock); 1265168404Spjd ASSERT3U(db->db_level+1, ==, parent->db_level); 1266168404Spjd di = dbuf_dirty(parent, tx); 1267168404Spjd if (parent_held) 1268168404Spjd dbuf_rele(parent, FTAG); 1269168404Spjd 1270168404Spjd mutex_enter(&db->db_mtx); 1271260763Savg /* 1272260763Savg * Since we've dropped the mutex, it's possible that 1273260763Savg * dbuf_undirty() might have changed this out from under us. 
1274260763Savg */ 1275168404Spjd if (db->db_last_dirty == dr || 1276168404Spjd dn->dn_object == DMU_META_DNODE_OBJECT) { 1277168404Spjd mutex_enter(&di->dt.di.dr_mtx); 1278168404Spjd ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1279168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 1280168404Spjd list_insert_tail(&di->dt.di.dr_children, dr); 1281168404Spjd mutex_exit(&di->dt.di.dr_mtx); 1282168404Spjd dr->dr_parent = di; 1283168404Spjd } 1284168404Spjd mutex_exit(&db->db_mtx); 1285168404Spjd } else { 1286168404Spjd ASSERT(db->db_level+1 == dn->dn_nlevels); 1287168404Spjd ASSERT(db->db_blkid < dn->dn_nblkptr); 1288219089Spjd ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); 1289168404Spjd mutex_enter(&dn->dn_mtx); 1290168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 1291168404Spjd list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1292168404Spjd mutex_exit(&dn->dn_mtx); 1293168404Spjd if (drop_struct_lock) 1294168404Spjd rw_exit(&dn->dn_struct_rwlock); 1295168404Spjd } 1296168404Spjd 1297168404Spjd dnode_setdirty(dn, tx); 1298219089Spjd DB_DNODE_EXIT(db); 1299168404Spjd return (dr); 1300168404Spjd} 1301168404Spjd 1302248571Smm/* 1303251629Sdelphij * Undirty a buffer in the transaction group referenced by the given 1304251629Sdelphij * transaction. Return whether this evicted the dbuf. 1305248571Smm */ 1306248571Smmstatic boolean_t 1307168404Spjddbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1308168404Spjd{ 1309219089Spjd dnode_t *dn; 1310168404Spjd uint64_t txg = tx->tx_txg; 1311185029Spjd dbuf_dirty_record_t *dr, **drp; 1312168404Spjd 1313168404Spjd ASSERT(txg != 0); 1314219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1315248571Smm ASSERT0(db->db_level); 1316248571Smm ASSERT(MUTEX_HELD(&db->db_mtx)); 1317168404Spjd 1318168404Spjd /* 1319168404Spjd * If this buffer is not dirty, we're done. 
1320168404Spjd */ 1321185029Spjd for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1322168404Spjd if (dr->dr_txg <= txg) 1323168404Spjd break; 1324248571Smm if (dr == NULL || dr->dr_txg < txg) 1325248571Smm return (B_FALSE); 1326168404Spjd ASSERT(dr->dr_txg == txg); 1327219089Spjd ASSERT(dr->dr_dbuf == db); 1328168404Spjd 1329219089Spjd DB_DNODE_ENTER(db); 1330219089Spjd dn = DB_DNODE(db); 1331219089Spjd 1332168404Spjd dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1333168404Spjd 1334168404Spjd ASSERT(db->db.db_size != 0); 1335168404Spjd 1336260763Savg /* 1337260763Savg * Any space we accounted for in dp_dirty_* will be cleaned up by 1338260763Savg * dsl_pool_sync(). This is relatively rare so the discrepancy 1339260763Savg * is not a big deal. 1340260763Savg */ 1341168404Spjd 1342185029Spjd *drp = dr->dr_next; 1343168404Spjd 1344219636Spjd /* 1345219636Spjd * Note that there are three places in dbuf_dirty() 1346219636Spjd * where this dirty record may be put on a list. 1347219636Spjd * Make sure to do a list_remove corresponding to 1348219636Spjd * every one of those list_insert calls. 
1349219636Spjd */ 1350168404Spjd if (dr->dr_parent) { 1351168404Spjd mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1352168404Spjd list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1353168404Spjd mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 1354219636Spjd } else if (db->db_blkid == DMU_SPILL_BLKID || 1355219636Spjd db->db_level+1 == dn->dn_nlevels) { 1356185029Spjd ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); 1357168404Spjd mutex_enter(&dn->dn_mtx); 1358168404Spjd list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1359168404Spjd mutex_exit(&dn->dn_mtx); 1360168404Spjd } 1361219089Spjd DB_DNODE_EXIT(db); 1362168404Spjd 1363248571Smm if (db->db_state != DB_NOFILL) { 1364248571Smm dbuf_unoverride(dr); 1365168404Spjd 1366168404Spjd ASSERT(db->db_buf != NULL); 1367248571Smm ASSERT(dr->dt.dl.dr_data != NULL); 1368248571Smm if (dr->dt.dl.dr_data != db->db_buf) 1369248571Smm VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); 1370168404Spjd } 1371269218Sdelphij 1372269218Sdelphij if (db->db_level != 0) { 1373269218Sdelphij mutex_destroy(&dr->dt.di.dr_mtx); 1374269218Sdelphij list_destroy(&dr->dt.di.dr_children); 1375269218Sdelphij } 1376269218Sdelphij 1377168404Spjd kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1378168404Spjd 1379168404Spjd ASSERT(db->db_dirtycnt > 0); 1380168404Spjd db->db_dirtycnt -= 1; 1381168404Spjd 1382168404Spjd if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { 1383168404Spjd arc_buf_t *buf = db->db_buf; 1384168404Spjd 1385219089Spjd ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); 1386168404Spjd dbuf_set_data(db, NULL); 1387248571Smm VERIFY(arc_buf_remove_ref(buf, db)); 1388168404Spjd dbuf_evict(db); 1389248571Smm return (B_TRUE); 1390168404Spjd } 1391168404Spjd 1392248571Smm return (B_FALSE); 1393168404Spjd} 1394168404Spjd 1395168404Spjdvoid 1396263397Sdelphijdmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) 1397168404Spjd{ 1398263397Sdelphij dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1399185029Spjd 
int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; 1400168404Spjd 1401168404Spjd ASSERT(tx->tx_txg != 0); 1402168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 1403168404Spjd 1404219089Spjd DB_DNODE_ENTER(db); 1405219089Spjd if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) 1406168404Spjd rf |= DB_RF_HAVESTRUCT; 1407219089Spjd DB_DNODE_EXIT(db); 1408168404Spjd (void) dbuf_read(db, NULL, rf); 1409168404Spjd (void) dbuf_dirty(db, tx); 1410168404Spjd} 1411168404Spjd 1412168404Spjdvoid 1413219089Spjddmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1414219089Spjd{ 1415219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1416219089Spjd 1417219089Spjd db->db_state = DB_NOFILL; 1418219089Spjd 1419219089Spjd dmu_buf_will_fill(db_fake, tx); 1420219089Spjd} 1421219089Spjd 1422219089Spjdvoid 1423168404Spjddmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1424168404Spjd{ 1425168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1426168404Spjd 1427219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1428168404Spjd ASSERT(tx->tx_txg != 0); 1429168404Spjd ASSERT(db->db_level == 0); 1430168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 1431168404Spjd 1432168404Spjd ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1433168404Spjd dmu_tx_private_ok(tx)); 1434168404Spjd 1435168404Spjd dbuf_noread(db); 1436168404Spjd (void) dbuf_dirty(db, tx); 1437168404Spjd} 1438168404Spjd 1439168404Spjd#pragma weak dmu_buf_fill_done = dbuf_fill_done 1440168404Spjd/* ARGSUSED */ 1441168404Spjdvoid 1442168404Spjddbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1443168404Spjd{ 1444168404Spjd mutex_enter(&db->db_mtx); 1445168404Spjd DBUF_VERIFY(db); 1446168404Spjd 1447168404Spjd if (db->db_state == DB_FILL) { 1448168404Spjd if (db->db_level == 0 && db->db_freed_in_flight) { 1449219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1450168404Spjd /* we were freed while filling */ 1451168404Spjd /* XXX dbuf_undirty? 
*/ 1452168404Spjd bzero(db->db.db_data, db->db.db_size); 1453168404Spjd db->db_freed_in_flight = FALSE; 1454168404Spjd } 1455168404Spjd db->db_state = DB_CACHED; 1456168404Spjd cv_broadcast(&db->db_changed); 1457168404Spjd } 1458168404Spjd mutex_exit(&db->db_mtx); 1459168404Spjd} 1460168404Spjd 1461268649Sdelphijvoid 1462268649Sdelphijdmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, 1463268649Sdelphij bp_embedded_type_t etype, enum zio_compress comp, 1464268649Sdelphij int uncompressed_size, int compressed_size, int byteorder, 1465268649Sdelphij dmu_tx_t *tx) 1466268649Sdelphij{ 1467268649Sdelphij dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 1468268649Sdelphij struct dirty_leaf *dl; 1469268649Sdelphij dmu_object_type_t type; 1470268649Sdelphij 1471268649Sdelphij DB_DNODE_ENTER(db); 1472268649Sdelphij type = DB_DNODE(db)->dn_type; 1473268649Sdelphij DB_DNODE_EXIT(db); 1474268649Sdelphij 1475268649Sdelphij ASSERT0(db->db_level); 1476268649Sdelphij ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1477268649Sdelphij 1478268649Sdelphij dmu_buf_will_not_fill(dbuf, tx); 1479268649Sdelphij 1480268649Sdelphij ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 1481268649Sdelphij dl = &db->db_last_dirty->dt.dl; 1482268649Sdelphij encode_embedded_bp_compressed(&dl->dr_overridden_by, 1483268649Sdelphij data, comp, uncompressed_size, compressed_size); 1484268649Sdelphij BPE_SET_ETYPE(&dl->dr_overridden_by, etype); 1485268649Sdelphij BP_SET_TYPE(&dl->dr_overridden_by, type); 1486268649Sdelphij BP_SET_LEVEL(&dl->dr_overridden_by, 0); 1487268649Sdelphij BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); 1488268649Sdelphij 1489268649Sdelphij dl->dr_override_state = DR_OVERRIDDEN; 1490268649Sdelphij dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; 1491268649Sdelphij} 1492268649Sdelphij 1493168404Spjd/* 1494209962Smm * Directly assign a provided arc buf to a given dbuf if it's not referenced 1495209962Smm * by anybody except our caller. 
Otherwise copy arcbuf's contents to dbuf. 1496209962Smm */ 1497209962Smmvoid 1498209962Smmdbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) 1499209962Smm{ 1500209962Smm ASSERT(!refcount_is_zero(&db->db_holds)); 1501219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1502209962Smm ASSERT(db->db_level == 0); 1503209962Smm ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); 1504209962Smm ASSERT(buf != NULL); 1505209962Smm ASSERT(arc_buf_size(buf) == db->db.db_size); 1506209962Smm ASSERT(tx->tx_txg != 0); 1507209962Smm 1508209962Smm arc_return_buf(buf, db); 1509209962Smm ASSERT(arc_released(buf)); 1510209962Smm 1511209962Smm mutex_enter(&db->db_mtx); 1512209962Smm 1513209962Smm while (db->db_state == DB_READ || db->db_state == DB_FILL) 1514209962Smm cv_wait(&db->db_changed, &db->db_mtx); 1515209962Smm 1516209962Smm ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); 1517209962Smm 1518209962Smm if (db->db_state == DB_CACHED && 1519209962Smm refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { 1520209962Smm mutex_exit(&db->db_mtx); 1521209962Smm (void) dbuf_dirty(db, tx); 1522209962Smm bcopy(buf->b_data, db->db.db_data, db->db.db_size); 1523248571Smm VERIFY(arc_buf_remove_ref(buf, db)); 1524219089Spjd xuio_stat_wbuf_copied(); 1525209962Smm return; 1526209962Smm } 1527209962Smm 1528219089Spjd xuio_stat_wbuf_nocopy(); 1529209962Smm if (db->db_state == DB_CACHED) { 1530209962Smm dbuf_dirty_record_t *dr = db->db_last_dirty; 1531209962Smm 1532209962Smm ASSERT(db->db_buf != NULL); 1533209962Smm if (dr != NULL && dr->dr_txg == tx->tx_txg) { 1534209962Smm ASSERT(dr->dt.dl.dr_data == db->db_buf); 1535209962Smm if (!arc_released(db->db_buf)) { 1536209962Smm ASSERT(dr->dt.dl.dr_override_state == 1537209962Smm DR_OVERRIDDEN); 1538209962Smm arc_release(db->db_buf, db); 1539209962Smm } 1540209962Smm dr->dt.dl.dr_data = buf; 1541248571Smm VERIFY(arc_buf_remove_ref(db->db_buf, db)); 1542209962Smm } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { 
1543209962Smm arc_release(db->db_buf, db); 1544248571Smm VERIFY(arc_buf_remove_ref(db->db_buf, db)); 1545209962Smm } 1546209962Smm db->db_buf = NULL; 1547209962Smm } 1548209962Smm ASSERT(db->db_buf == NULL); 1549209962Smm dbuf_set_data(db, buf); 1550209962Smm db->db_state = DB_FILL; 1551209962Smm mutex_exit(&db->db_mtx); 1552209962Smm (void) dbuf_dirty(db, tx); 1553263397Sdelphij dmu_buf_fill_done(&db->db, tx); 1554209962Smm} 1555209962Smm 1556209962Smm/* 1557168404Spjd * "Clear" the contents of this dbuf. This will mark the dbuf 1558260763Savg * EVICTING and clear *most* of its references. Unfortunately, 1559168404Spjd * when we are not holding the dn_dbufs_mtx, we can't clear the 1560168404Spjd * entry in the dn_dbufs list. We have to wait until dbuf_destroy() 1561168404Spjd * in this case. For callers from the DMU we will usually see: 1562269417Sdelphij * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy() 1563168404Spjd * For the arc callback, we will usually see: 1564219089Spjd * dbuf_do_evict()->dbuf_clear();dbuf_destroy() 1565168404Spjd * Sometimes, though, we will get a mix of these two: 1566269417Sdelphij * DMU: dbuf_clear()->arc_clear_callback() 1567168404Spjd * ARC: dbuf_do_evict()->dbuf_destroy() 1568269417Sdelphij * 1569269417Sdelphij * This routine will dissociate the dbuf from the arc, by calling 1570269417Sdelphij * arc_clear_callback(), but will not evict the data from the ARC. 
1571168404Spjd */ 1572168404Spjdvoid 1573168404Spjddbuf_clear(dmu_buf_impl_t *db) 1574168404Spjd{ 1575219089Spjd dnode_t *dn; 1576168404Spjd dmu_buf_impl_t *parent = db->db_parent; 1577219089Spjd dmu_buf_impl_t *dndb; 1578269417Sdelphij boolean_t dbuf_gone = B_FALSE; 1579168404Spjd 1580168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 1581168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 1582168404Spjd 1583168404Spjd dbuf_evict_user(db); 1584168404Spjd 1585168404Spjd if (db->db_state == DB_CACHED) { 1586168404Spjd ASSERT(db->db.db_data != NULL); 1587219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 1588168404Spjd zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 1589208373Smm arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 1590185029Spjd } 1591168404Spjd db->db.db_data = NULL; 1592168404Spjd db->db_state = DB_UNCACHED; 1593168404Spjd } 1594168404Spjd 1595219089Spjd ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); 1596168404Spjd ASSERT(db->db_data_pending == NULL); 1597168404Spjd 1598168404Spjd db->db_state = DB_EVICTING; 1599168404Spjd db->db_blkptr = NULL; 1600168404Spjd 1601219089Spjd DB_DNODE_ENTER(db); 1602219089Spjd dn = DB_DNODE(db); 1603219089Spjd dndb = dn->dn_dbuf; 1604219089Spjd if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { 1605269845Sdelphij avl_remove(&dn->dn_dbufs, db); 1606271002Sdelphij atomic_dec_32(&dn->dn_dbufs_count); 1607219089Spjd membar_producer(); 1608219089Spjd DB_DNODE_EXIT(db); 1609219089Spjd /* 1610219089Spjd * Decrementing the dbuf count means that the hold corresponding 1611219089Spjd * to the removed dbuf is no longer discounted in dnode_move(), 1612219089Spjd * so the dnode cannot be moved until after we release the hold. 1613219089Spjd * The membar_producer() ensures visibility of the decremented 1614219089Spjd * value in dnode_move(), since DB_DNODE_EXIT doesn't actually 1615219089Spjd * release any lock. 
1616219089Spjd */ 1617168404Spjd dnode_rele(dn, db); 1618219089Spjd db->db_dnode_handle = NULL; 1619219089Spjd } else { 1620219089Spjd DB_DNODE_EXIT(db); 1621168404Spjd } 1622168404Spjd 1623168404Spjd if (db->db_buf) 1624269417Sdelphij dbuf_gone = arc_clear_callback(db->db_buf); 1625168404Spjd 1626168404Spjd if (!dbuf_gone) 1627168404Spjd mutex_exit(&db->db_mtx); 1628168404Spjd 1629168404Spjd /* 1630219089Spjd * If this dbuf is referenced from an indirect dbuf, 1631168404Spjd * decrement the ref count on the indirect dbuf. 1632168404Spjd */ 1633168404Spjd if (parent && parent != dndb) 1634168404Spjd dbuf_rele(parent, db); 1635168404Spjd} 1636168404Spjd 1637168404Spjdstatic int 1638168404Spjddbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1639168404Spjd dmu_buf_impl_t **parentp, blkptr_t **bpp) 1640168404Spjd{ 1641168404Spjd int nlevels, epbs; 1642168404Spjd 1643168404Spjd *parentp = NULL; 1644168404Spjd *bpp = NULL; 1645168404Spjd 1646219089Spjd ASSERT(blkid != DMU_BONUS_BLKID); 1647168404Spjd 1648219089Spjd if (blkid == DMU_SPILL_BLKID) { 1649219089Spjd mutex_enter(&dn->dn_mtx); 1650219089Spjd if (dn->dn_have_spill && 1651219089Spjd (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 1652219089Spjd *bpp = &dn->dn_phys->dn_spill; 1653219089Spjd else 1654219089Spjd *bpp = NULL; 1655219089Spjd dbuf_add_ref(dn->dn_dbuf, NULL); 1656219089Spjd *parentp = dn->dn_dbuf; 1657219089Spjd mutex_exit(&dn->dn_mtx); 1658219089Spjd return (0); 1659219089Spjd } 1660219089Spjd 1661168404Spjd if (dn->dn_phys->dn_nlevels == 0) 1662168404Spjd nlevels = 1; 1663168404Spjd else 1664168404Spjd nlevels = dn->dn_phys->dn_nlevels; 1665168404Spjd 1666168404Spjd epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1667168404Spjd 1668168404Spjd ASSERT3U(level * epbs, <, 64); 1669168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1670168404Spjd if (level >= nlevels || 1671168404Spjd (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 1672168404Spjd /* the buffer has no 
parent yet */ 1673249195Smm return (SET_ERROR(ENOENT)); 1674168404Spjd } else if (level < nlevels-1) { 1675168404Spjd /* this block is referenced from an indirect block */ 1676168404Spjd int err = dbuf_hold_impl(dn, level+1, 1677168404Spjd blkid >> epbs, fail_sparse, NULL, parentp); 1678168404Spjd if (err) 1679168404Spjd return (err); 1680168404Spjd err = dbuf_read(*parentp, NULL, 1681168404Spjd (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); 1682168404Spjd if (err) { 1683168404Spjd dbuf_rele(*parentp, NULL); 1684168404Spjd *parentp = NULL; 1685168404Spjd return (err); 1686168404Spjd } 1687168404Spjd *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 1688168404Spjd (blkid & ((1ULL << epbs) - 1)); 1689168404Spjd return (0); 1690168404Spjd } else { 1691168404Spjd /* the block is referenced from the dnode */ 1692168404Spjd ASSERT3U(level, ==, nlevels-1); 1693168404Spjd ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1694168404Spjd blkid < dn->dn_phys->dn_nblkptr); 1695168404Spjd if (dn->dn_dbuf) { 1696168404Spjd dbuf_add_ref(dn->dn_dbuf, NULL); 1697168404Spjd *parentp = dn->dn_dbuf; 1698168404Spjd } 1699168404Spjd *bpp = &dn->dn_phys->dn_blkptr[blkid]; 1700168404Spjd return (0); 1701168404Spjd } 1702168404Spjd} 1703168404Spjd 1704168404Spjdstatic dmu_buf_impl_t * 1705168404Spjddbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1706168404Spjd dmu_buf_impl_t *parent, blkptr_t *blkptr) 1707168404Spjd{ 1708219089Spjd objset_t *os = dn->dn_objset; 1709168404Spjd dmu_buf_impl_t *db, *odb; 1710168404Spjd 1711168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1712168404Spjd ASSERT(dn->dn_type != DMU_OT_NONE); 1713168404Spjd 1714168404Spjd db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); 1715168404Spjd 1716168404Spjd db->db_objset = os; 1717168404Spjd db->db.db_object = dn->dn_object; 1718168404Spjd db->db_level = level; 1719168404Spjd db->db_blkid = blkid; 1720168404Spjd db->db_last_dirty = NULL; 1721168404Spjd db->db_dirtycnt = 0; 1722219089Spjd db->db_dnode_handle = 
dn->dn_handle; 1723168404Spjd db->db_parent = parent; 1724168404Spjd db->db_blkptr = blkptr; 1725168404Spjd 1726168404Spjd db->db_user_ptr = NULL; 1727168404Spjd db->db_user_data_ptr_ptr = NULL; 1728168404Spjd db->db_evict_func = NULL; 1729168404Spjd db->db_immediate_evict = 0; 1730168404Spjd db->db_freed_in_flight = 0; 1731168404Spjd 1732219089Spjd if (blkid == DMU_BONUS_BLKID) { 1733168404Spjd ASSERT3P(parent, ==, dn->dn_dbuf); 1734185029Spjd db->db.db_size = DN_MAX_BONUSLEN - 1735185029Spjd (dn->dn_nblkptr-1) * sizeof (blkptr_t); 1736185029Spjd ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 1737219089Spjd db->db.db_offset = DMU_BONUS_BLKID; 1738168404Spjd db->db_state = DB_UNCACHED; 1739168404Spjd /* the bonus dbuf is not placed in the hash table */ 1740208373Smm arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1741168404Spjd return (db); 1742219089Spjd } else if (blkid == DMU_SPILL_BLKID) { 1743219089Spjd db->db.db_size = (blkptr != NULL) ? 1744219089Spjd BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; 1745219089Spjd db->db.db_offset = 0; 1746168404Spjd } else { 1747168404Spjd int blocksize = 1748260763Savg db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; 1749168404Spjd db->db.db_size = blocksize; 1750168404Spjd db->db.db_offset = db->db_blkid * blocksize; 1751168404Spjd } 1752168404Spjd 1753168404Spjd /* 1754168404Spjd * Hold the dn_dbufs_mtx while we get the new dbuf 1755168404Spjd * in the hash table *and* added to the dbufs list. 1756168404Spjd * This prevents a possible deadlock with someone 1757168404Spjd * trying to look up this dbuf before its added to the 1758168404Spjd * dn_dbufs list. 
1759168404Spjd */ 1760168404Spjd mutex_enter(&dn->dn_dbufs_mtx); 1761168404Spjd db->db_state = DB_EVICTING; 1762168404Spjd if ((odb = dbuf_hash_insert(db)) != NULL) { 1763168404Spjd /* someone else inserted it first */ 1764168404Spjd kmem_cache_free(dbuf_cache, db); 1765168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 1766168404Spjd return (odb); 1767168404Spjd } 1768269845Sdelphij avl_add(&dn->dn_dbufs, db); 1769254753Sdelphij if (db->db_level == 0 && db->db_blkid >= 1770254753Sdelphij dn->dn_unlisted_l0_blkid) 1771254753Sdelphij dn->dn_unlisted_l0_blkid = db->db_blkid + 1; 1772168404Spjd db->db_state = DB_UNCACHED; 1773168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 1774208373Smm arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1775168404Spjd 1776168404Spjd if (parent && parent != dn->dn_dbuf) 1777168404Spjd dbuf_add_ref(parent, db); 1778168404Spjd 1779168404Spjd ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1780168404Spjd refcount_count(&dn->dn_holds) > 0); 1781168404Spjd (void) refcount_add(&dn->dn_holds, db); 1782271002Sdelphij atomic_inc_32(&dn->dn_dbufs_count); 1783168404Spjd 1784168404Spjd dprintf_dbuf(db, "db=%p\n", db); 1785168404Spjd 1786168404Spjd return (db); 1787168404Spjd} 1788168404Spjd 1789168404Spjdstatic int 1790168404Spjddbuf_do_evict(void *private) 1791168404Spjd{ 1792269417Sdelphij dmu_buf_impl_t *db = private; 1793168404Spjd 1794168404Spjd if (!MUTEX_HELD(&db->db_mtx)) 1795168404Spjd mutex_enter(&db->db_mtx); 1796168404Spjd 1797168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 1798168404Spjd 1799168404Spjd if (db->db_state != DB_EVICTING) { 1800168404Spjd ASSERT(db->db_state == DB_CACHED); 1801168404Spjd DBUF_VERIFY(db); 1802168404Spjd db->db_buf = NULL; 1803168404Spjd dbuf_evict(db); 1804168404Spjd } else { 1805168404Spjd mutex_exit(&db->db_mtx); 1806168404Spjd dbuf_destroy(db); 1807168404Spjd } 1808168404Spjd return (0); 1809168404Spjd} 1810168404Spjd 1811168404Spjdstatic void 1812168404Spjddbuf_destroy(dmu_buf_impl_t *db) 
1813168404Spjd{ 1814168404Spjd ASSERT(refcount_is_zero(&db->db_holds)); 1815168404Spjd 1816219089Spjd if (db->db_blkid != DMU_BONUS_BLKID) { 1817168404Spjd /* 1818168404Spjd * If this dbuf is still on the dn_dbufs list, 1819168404Spjd * remove it from that list. 1820168404Spjd */ 1821219089Spjd if (db->db_dnode_handle != NULL) { 1822219089Spjd dnode_t *dn; 1823185029Spjd 1824219089Spjd DB_DNODE_ENTER(db); 1825219089Spjd dn = DB_DNODE(db); 1826168404Spjd mutex_enter(&dn->dn_dbufs_mtx); 1827269845Sdelphij avl_remove(&dn->dn_dbufs, db); 1828271002Sdelphij atomic_dec_32(&dn->dn_dbufs_count); 1829168404Spjd mutex_exit(&dn->dn_dbufs_mtx); 1830219089Spjd DB_DNODE_EXIT(db); 1831219089Spjd /* 1832219089Spjd * Decrementing the dbuf count means that the hold 1833219089Spjd * corresponding to the removed dbuf is no longer 1834219089Spjd * discounted in dnode_move(), so the dnode cannot be 1835219089Spjd * moved until after we release the hold. 1836219089Spjd */ 1837168404Spjd dnode_rele(dn, db); 1838219089Spjd db->db_dnode_handle = NULL; 1839168404Spjd } 1840168404Spjd dbuf_hash_remove(db); 1841168404Spjd } 1842168404Spjd db->db_parent = NULL; 1843168404Spjd db->db_buf = NULL; 1844168404Spjd 1845168404Spjd ASSERT(db->db.db_data == NULL); 1846168404Spjd ASSERT(db->db_hash_next == NULL); 1847168404Spjd ASSERT(db->db_blkptr == NULL); 1848168404Spjd ASSERT(db->db_data_pending == NULL); 1849168404Spjd 1850168404Spjd kmem_cache_free(dbuf_cache, db); 1851208373Smm arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1852168404Spjd} 1853168404Spjd 1854168404Spjdvoid 1855260763Savgdbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) 1856168404Spjd{ 1857168404Spjd dmu_buf_impl_t *db = NULL; 1858168404Spjd blkptr_t *bp = NULL; 1859168404Spjd 1860219089Spjd ASSERT(blkid != DMU_BONUS_BLKID); 1861168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1862168404Spjd 1863168404Spjd if (dnode_block_freed(dn, blkid)) 1864168404Spjd return; 1865168404Spjd 1866168404Spjd /* 
dbuf_find() returns with db_mtx held */ 1867168404Spjd if (db = dbuf_find(dn, 0, blkid)) { 1868219089Spjd /* 1869219089Spjd * This dbuf is already in the cache. We assume that 1870219089Spjd * it is already CACHED, or else about to be either 1871219089Spjd * read or filled. 1872219089Spjd */ 1873168404Spjd mutex_exit(&db->db_mtx); 1874219089Spjd return; 1875168404Spjd } 1876168404Spjd 1877168404Spjd if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { 1878268649Sdelphij if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { 1879219089Spjd dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 1880168404Spjd uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; 1881268657Sdelphij zbookmark_phys_t zb; 1882168404Spjd 1883219089Spjd SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 1884219089Spjd dn->dn_object, 0, blkid); 1885219089Spjd 1886246666Smm (void) arc_read(NULL, dn->dn_objset->os_spa, 1887260763Savg bp, NULL, NULL, prio, 1888168404Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1889168404Spjd &aflags, &zb); 1890168404Spjd } 1891168404Spjd if (db) 1892168404Spjd dbuf_rele(db, NULL); 1893168404Spjd } 1894168404Spjd} 1895168404Spjd 1896168404Spjd/* 1897168404Spjd * Returns with db_holds incremented, and db_mtx not held. 1898168404Spjd * Note: dn_struct_rwlock must be held. 
1899168404Spjd */ 1900168404Spjdint 1901168404Spjddbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, 1902168404Spjd void *tag, dmu_buf_impl_t **dbp) 1903168404Spjd{ 1904168404Spjd dmu_buf_impl_t *db, *parent = NULL; 1905168404Spjd 1906219089Spjd ASSERT(blkid != DMU_BONUS_BLKID); 1907168404Spjd ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1908168404Spjd ASSERT3U(dn->dn_nlevels, >, level); 1909168404Spjd 1910168404Spjd *dbp = NULL; 1911168404Spjdtop: 1912168404Spjd /* dbuf_find() returns with db_mtx held */ 1913168404Spjd db = dbuf_find(dn, level, blkid); 1914168404Spjd 1915168404Spjd if (db == NULL) { 1916168404Spjd blkptr_t *bp = NULL; 1917168404Spjd int err; 1918168404Spjd 1919168404Spjd ASSERT3P(parent, ==, NULL); 1920168404Spjd err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 1921168404Spjd if (fail_sparse) { 1922168404Spjd if (err == 0 && bp && BP_IS_HOLE(bp)) 1923249195Smm err = SET_ERROR(ENOENT); 1924168404Spjd if (err) { 1925168404Spjd if (parent) 1926168404Spjd dbuf_rele(parent, NULL); 1927168404Spjd return (err); 1928168404Spjd } 1929168404Spjd } 1930168404Spjd if (err && err != ENOENT) 1931168404Spjd return (err); 1932168404Spjd db = dbuf_create(dn, level, blkid, parent, bp); 1933168404Spjd } 1934168404Spjd 1935168404Spjd if (db->db_buf && refcount_is_zero(&db->db_holds)) { 1936168404Spjd arc_buf_add_ref(db->db_buf, db); 1937168404Spjd if (db->db_buf->b_data == NULL) { 1938168404Spjd dbuf_clear(db); 1939168404Spjd if (parent) { 1940168404Spjd dbuf_rele(parent, NULL); 1941168404Spjd parent = NULL; 1942168404Spjd } 1943168404Spjd goto top; 1944168404Spjd } 1945168404Spjd ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 1946168404Spjd } 1947168404Spjd 1948168404Spjd ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 1949168404Spjd 1950168404Spjd /* 1951168404Spjd * If this buffer is currently syncing out, and we are are 1952168404Spjd * still referencing it from db_data, we need to make a copy 1953168404Spjd * 
of it in case we decide we want to dirty it again in this txg. 1954168404Spjd */ 1955219089Spjd if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1956168404Spjd dn->dn_object != DMU_META_DNODE_OBJECT && 1957168404Spjd db->db_state == DB_CACHED && db->db_data_pending) { 1958168404Spjd dbuf_dirty_record_t *dr = db->db_data_pending; 1959168404Spjd 1960168404Spjd if (dr->dt.dl.dr_data == db->db_buf) { 1961168404Spjd arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1962168404Spjd 1963168404Spjd dbuf_set_data(db, 1964219089Spjd arc_buf_alloc(dn->dn_objset->os_spa, 1965168404Spjd db->db.db_size, db, type)); 1966168404Spjd bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 1967168404Spjd db->db.db_size); 1968168404Spjd } 1969168404Spjd } 1970168404Spjd 1971168404Spjd (void) refcount_add(&db->db_holds, tag); 1972168404Spjd dbuf_update_data(db); 1973168404Spjd DBUF_VERIFY(db); 1974168404Spjd mutex_exit(&db->db_mtx); 1975168404Spjd 1976168404Spjd /* NOTE: we can't rele the parent until after we drop the db_mtx */ 1977168404Spjd if (parent) 1978168404Spjd dbuf_rele(parent, NULL); 1979168404Spjd 1980219089Spjd ASSERT3P(DB_DNODE(db), ==, dn); 1981168404Spjd ASSERT3U(db->db_blkid, ==, blkid); 1982168404Spjd ASSERT3U(db->db_level, ==, level); 1983168404Spjd *dbp = db; 1984168404Spjd 1985168404Spjd return (0); 1986168404Spjd} 1987168404Spjd 1988168404Spjddmu_buf_impl_t * 1989168404Spjddbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 1990168404Spjd{ 1991168404Spjd dmu_buf_impl_t *db; 1992168404Spjd int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); 1993168404Spjd return (err ? NULL : db); 1994168404Spjd} 1995168404Spjd 1996168404Spjddmu_buf_impl_t * 1997168404Spjddbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 1998168404Spjd{ 1999168404Spjd dmu_buf_impl_t *db; 2000168404Spjd int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 2001168404Spjd return (err ? 
NULL : db); 2002168404Spjd} 2003168404Spjd 2004185029Spjdvoid 2005168404Spjddbuf_create_bonus(dnode_t *dn) 2006168404Spjd{ 2007168404Spjd ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 2008168404Spjd 2009168404Spjd ASSERT(dn->dn_bonus == NULL); 2010219089Spjd dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 2011168404Spjd} 2012168404Spjd 2013219089Spjdint 2014219089Spjddbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) 2015219089Spjd{ 2016219089Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2017219089Spjd dnode_t *dn; 2018219089Spjd 2019219089Spjd if (db->db_blkid != DMU_SPILL_BLKID) 2020249195Smm return (SET_ERROR(ENOTSUP)); 2021219089Spjd if (blksz == 0) 2022219089Spjd blksz = SPA_MINBLOCKSIZE; 2023219089Spjd if (blksz > SPA_MAXBLOCKSIZE) 2024219089Spjd blksz = SPA_MAXBLOCKSIZE; 2025219089Spjd else 2026219089Spjd blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 2027219089Spjd 2028219089Spjd DB_DNODE_ENTER(db); 2029219089Spjd dn = DB_DNODE(db); 2030219089Spjd rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2031219089Spjd dbuf_new_size(db, blksz, tx); 2032219089Spjd rw_exit(&dn->dn_struct_rwlock); 2033219089Spjd DB_DNODE_EXIT(db); 2034219089Spjd 2035219089Spjd return (0); 2036219089Spjd} 2037219089Spjd 2038219089Spjdvoid 2039219089Spjddbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) 2040219089Spjd{ 2041219089Spjd dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 2042219089Spjd} 2043219089Spjd 2044168404Spjd#pragma weak dmu_buf_add_ref = dbuf_add_ref 2045168404Spjdvoid 2046168404Spjddbuf_add_ref(dmu_buf_impl_t *db, void *tag) 2047168404Spjd{ 2048168404Spjd int64_t holds = refcount_add(&db->db_holds, tag); 2049168404Spjd ASSERT(holds > 1); 2050168404Spjd} 2051168404Spjd 2052219089Spjd/* 2053219089Spjd * If you call dbuf_rele() you had better not be referencing the dnode handle 2054219089Spjd * unless you have some other direct or indirect hold on the dnode. 
(An indirect 2055219089Spjd * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 2056219089Spjd * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 2057219089Spjd * dnode's parent dbuf evicting its dnode handles. 2058219089Spjd */ 2059168404Spjdvoid 2060168404Spjddbuf_rele(dmu_buf_impl_t *db, void *tag) 2061168404Spjd{ 2062219089Spjd mutex_enter(&db->db_mtx); 2063219089Spjd dbuf_rele_and_unlock(db, tag); 2064219089Spjd} 2065219089Spjd 2066263397Sdelphijvoid 2067263397Sdelphijdmu_buf_rele(dmu_buf_t *db, void *tag) 2068263397Sdelphij{ 2069263397Sdelphij dbuf_rele((dmu_buf_impl_t *)db, tag); 2070263397Sdelphij} 2071263397Sdelphij 2072219089Spjd/* 2073219089Spjd * dbuf_rele() for an already-locked dbuf. This is necessary to allow 2074219089Spjd * db_dirtycnt and db_holds to be updated atomically. 2075219089Spjd */ 2076219089Spjdvoid 2077219089Spjddbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 2078219089Spjd{ 2079168404Spjd int64_t holds; 2080168404Spjd 2081219089Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 2082168404Spjd DBUF_VERIFY(db); 2083168404Spjd 2084219089Spjd /* 2085219089Spjd * Remove the reference to the dbuf before removing its hold on the 2086219089Spjd * dnode so we can guarantee in dnode_move() that a referenced bonus 2087219089Spjd * buffer has a corresponding dnode hold. 2088219089Spjd */ 2089168404Spjd holds = refcount_remove(&db->db_holds, tag); 2090168404Spjd ASSERT(holds >= 0); 2091168404Spjd 2092168404Spjd /* 2093168404Spjd * We can't freeze indirects if there is a possibility that they 2094168404Spjd * may be modified in the current syncing context. 2095168404Spjd */ 2096168404Spjd if (db->db_buf && holds == (db->db_level == 0 ? 
db->db_dirtycnt : 0)) 2097168404Spjd arc_buf_freeze(db->db_buf); 2098168404Spjd 2099168404Spjd if (holds == db->db_dirtycnt && 2100168404Spjd db->db_level == 0 && db->db_immediate_evict) 2101168404Spjd dbuf_evict_user(db); 2102168404Spjd 2103168404Spjd if (holds == 0) { 2104219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 2105168404Spjd mutex_exit(&db->db_mtx); 2106219089Spjd 2107219089Spjd /* 2108219089Spjd * If the dnode moves here, we cannot cross this barrier 2109219089Spjd * until the move completes. 2110219089Spjd */ 2111219089Spjd DB_DNODE_ENTER(db); 2112271002Sdelphij atomic_dec_32(&DB_DNODE(db)->dn_dbufs_count); 2113219089Spjd DB_DNODE_EXIT(db); 2114219089Spjd /* 2115219089Spjd * The bonus buffer's dnode hold is no longer discounted 2116219089Spjd * in dnode_move(). The dnode cannot move until after 2117219089Spjd * the dnode_rele(). 2118219089Spjd */ 2119219089Spjd dnode_rele(DB_DNODE(db), db); 2120168404Spjd } else if (db->db_buf == NULL) { 2121168404Spjd /* 2122168404Spjd * This is a special case: we never associated this 2123168404Spjd * dbuf with any data allocated from the ARC. 2124168404Spjd */ 2125219089Spjd ASSERT(db->db_state == DB_UNCACHED || 2126219089Spjd db->db_state == DB_NOFILL); 2127168404Spjd dbuf_evict(db); 2128168404Spjd } else if (arc_released(db->db_buf)) { 2129168404Spjd arc_buf_t *buf = db->db_buf; 2130168404Spjd /* 2131168404Spjd * This dbuf has anonymous data associated with it. 2132168404Spjd */ 2133168404Spjd dbuf_set_data(db, NULL); 2134248571Smm VERIFY(arc_buf_remove_ref(buf, db)); 2135168404Spjd dbuf_evict(db); 2136168404Spjd } else { 2137248571Smm VERIFY(!arc_buf_remove_ref(db->db_buf, db)); 2138242845Sdelphij 2139242845Sdelphij /* 2140242845Sdelphij * A dbuf will be eligible for eviction if either the 2141242845Sdelphij * 'primarycache' property is set or a duplicate 2142242845Sdelphij * copy of this buffer is already cached in the arc. 
2143242845Sdelphij * 2144242845Sdelphij * In the case of the 'primarycache' a buffer 2145242845Sdelphij * is considered for eviction if it matches the 2146242845Sdelphij * criteria set in the property. 2147242845Sdelphij * 2148242845Sdelphij * To decide if our buffer is considered a 2149242845Sdelphij * duplicate, we must call into the arc to determine 2150242845Sdelphij * if multiple buffers are referencing the same 2151242845Sdelphij * block on-disk. If so, then we simply evict 2152242845Sdelphij * ourselves. 2153242845Sdelphij */ 2154269417Sdelphij if (!DBUF_IS_CACHEABLE(db)) { 2155269417Sdelphij if (db->db_blkptr != NULL && 2156269417Sdelphij !BP_IS_HOLE(db->db_blkptr) && 2157269417Sdelphij !BP_IS_EMBEDDED(db->db_blkptr)) { 2158269417Sdelphij spa_t *spa = 2159269417Sdelphij dmu_objset_spa(db->db_objset); 2160269417Sdelphij blkptr_t bp = *db->db_blkptr; 2161269417Sdelphij dbuf_clear(db); 2162269417Sdelphij arc_freed(spa, &bp); 2163269417Sdelphij } else { 2164269417Sdelphij dbuf_clear(db); 2165269417Sdelphij } 2166269417Sdelphij } else if (arc_buf_eviction_needed(db->db_buf)) { 2167185029Spjd dbuf_clear(db); 2168269417Sdelphij } else { 2169185029Spjd mutex_exit(&db->db_mtx); 2170269417Sdelphij } 2171168404Spjd } 2172168404Spjd } else { 2173168404Spjd mutex_exit(&db->db_mtx); 2174168404Spjd } 2175168404Spjd} 2176168404Spjd 2177168404Spjd#pragma weak dmu_buf_refcount = dbuf_refcount 2178168404Spjduint64_t 2179168404Spjddbuf_refcount(dmu_buf_impl_t *db) 2180168404Spjd{ 2181168404Spjd return (refcount_count(&db->db_holds)); 2182168404Spjd} 2183168404Spjd 2184168404Spjdvoid * 2185168404Spjddmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 2186168404Spjd dmu_buf_evict_func_t *evict_func) 2187168404Spjd{ 2188168404Spjd return (dmu_buf_update_user(db_fake, NULL, user_ptr, 2189168404Spjd user_data_ptr_ptr, evict_func)); 2190168404Spjd} 2191168404Spjd 2192168404Spjdvoid * 2193168404Spjddmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void 
*user_data_ptr_ptr, 2194168404Spjd dmu_buf_evict_func_t *evict_func) 2195168404Spjd{ 2196168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2197168404Spjd 2198168404Spjd db->db_immediate_evict = TRUE; 2199168404Spjd return (dmu_buf_update_user(db_fake, NULL, user_ptr, 2200168404Spjd user_data_ptr_ptr, evict_func)); 2201168404Spjd} 2202168404Spjd 2203168404Spjdvoid * 2204168404Spjddmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 2205168404Spjd void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) 2206168404Spjd{ 2207168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2208168404Spjd ASSERT(db->db_level == 0); 2209168404Spjd 2210168404Spjd ASSERT((user_ptr == NULL) == (evict_func == NULL)); 2211168404Spjd 2212168404Spjd mutex_enter(&db->db_mtx); 2213168404Spjd 2214168404Spjd if (db->db_user_ptr == old_user_ptr) { 2215168404Spjd db->db_user_ptr = user_ptr; 2216168404Spjd db->db_user_data_ptr_ptr = user_data_ptr_ptr; 2217168404Spjd db->db_evict_func = evict_func; 2218168404Spjd 2219168404Spjd dbuf_update_data(db); 2220168404Spjd } else { 2221168404Spjd old_user_ptr = db->db_user_ptr; 2222168404Spjd } 2223168404Spjd 2224168404Spjd mutex_exit(&db->db_mtx); 2225168404Spjd return (old_user_ptr); 2226168404Spjd} 2227168404Spjd 2228168404Spjdvoid * 2229168404Spjddmu_buf_get_user(dmu_buf_t *db_fake) 2230168404Spjd{ 2231168404Spjd dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2232168404Spjd ASSERT(!refcount_is_zero(&db->db_holds)); 2233168404Spjd 2234168404Spjd return (db->db_user_ptr); 2235168404Spjd} 2236168404Spjd 2237209962Smmboolean_t 2238209962Smmdmu_buf_freeable(dmu_buf_t *dbuf) 2239209962Smm{ 2240209962Smm boolean_t res = B_FALSE; 2241209962Smm dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 2242209962Smm 2243209962Smm if (db->db_blkptr) 2244209962Smm res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, 2245219089Spjd db->db_blkptr, db->db_blkptr->blk_birth); 2246209962Smm 2247209962Smm return (res); 
2248209962Smm} 2249209962Smm 2250243524Smmblkptr_t * 2251243524Smmdmu_buf_get_blkptr(dmu_buf_t *db) 2252243524Smm{ 2253243524Smm dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2254243524Smm return (dbi->db_blkptr); 2255243524Smm} 2256243524Smm 2257168404Spjdstatic void 2258168404Spjddbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) 2259168404Spjd{ 2260168404Spjd /* ASSERT(dmu_tx_is_syncing(tx) */ 2261168404Spjd ASSERT(MUTEX_HELD(&db->db_mtx)); 2262168404Spjd 2263168404Spjd if (db->db_blkptr != NULL) 2264168404Spjd return; 2265168404Spjd 2266219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 2267219089Spjd db->db_blkptr = &dn->dn_phys->dn_spill; 2268219089Spjd BP_ZERO(db->db_blkptr); 2269219089Spjd return; 2270219089Spjd } 2271168404Spjd if (db->db_level == dn->dn_phys->dn_nlevels-1) { 2272168404Spjd /* 2273168404Spjd * This buffer was allocated at a time when there was 2274168404Spjd * no available blkptrs from the dnode, or it was 2275168404Spjd * inappropriate to hook it in (i.e., nlevels mis-match). 
2276168404Spjd */ 2277168404Spjd ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 2278168404Spjd ASSERT(db->db_parent == NULL); 2279168404Spjd db->db_parent = dn->dn_dbuf; 2280168404Spjd db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 2281168404Spjd DBUF_VERIFY(db); 2282168404Spjd } else { 2283168404Spjd dmu_buf_impl_t *parent = db->db_parent; 2284168404Spjd int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2285168404Spjd 2286168404Spjd ASSERT(dn->dn_phys->dn_nlevels > 1); 2287168404Spjd if (parent == NULL) { 2288168404Spjd mutex_exit(&db->db_mtx); 2289168404Spjd rw_enter(&dn->dn_struct_rwlock, RW_READER); 2290168404Spjd (void) dbuf_hold_impl(dn, db->db_level+1, 2291168404Spjd db->db_blkid >> epbs, FALSE, db, &parent); 2292168404Spjd rw_exit(&dn->dn_struct_rwlock); 2293168404Spjd mutex_enter(&db->db_mtx); 2294168404Spjd db->db_parent = parent; 2295168404Spjd } 2296168404Spjd db->db_blkptr = (blkptr_t *)parent->db.db_data + 2297168404Spjd (db->db_blkid & ((1ULL << epbs) - 1)); 2298168404Spjd DBUF_VERIFY(db); 2299168404Spjd } 2300168404Spjd} 2301168404Spjd 2302168404Spjdstatic void 2303168404Spjddbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2304168404Spjd{ 2305168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2306219089Spjd dnode_t *dn; 2307168404Spjd zio_t *zio; 2308168404Spjd 2309168404Spjd ASSERT(dmu_tx_is_syncing(tx)); 2310168404Spjd 2311168404Spjd dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2312168404Spjd 2313168404Spjd mutex_enter(&db->db_mtx); 2314168404Spjd 2315168404Spjd ASSERT(db->db_level > 0); 2316168404Spjd DBUF_VERIFY(db); 2317168404Spjd 2318251629Sdelphij /* Read the block if it hasn't been read yet. 
*/ 2319168404Spjd if (db->db_buf == NULL) { 2320168404Spjd mutex_exit(&db->db_mtx); 2321168404Spjd (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 2322168404Spjd mutex_enter(&db->db_mtx); 2323168404Spjd } 2324168404Spjd ASSERT3U(db->db_state, ==, DB_CACHED); 2325168404Spjd ASSERT(db->db_buf != NULL); 2326168404Spjd 2327219089Spjd DB_DNODE_ENTER(db); 2328219089Spjd dn = DB_DNODE(db); 2329251629Sdelphij /* Indirect block size must match what the dnode thinks it is. */ 2330219089Spjd ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2331168404Spjd dbuf_check_blkptr(dn, db); 2332219089Spjd DB_DNODE_EXIT(db); 2333168404Spjd 2334251629Sdelphij /* Provide the pending dirty record to child dbufs */ 2335168404Spjd db->db_data_pending = dr; 2336168404Spjd 2337168404Spjd mutex_exit(&db->db_mtx); 2338185029Spjd dbuf_write(dr, db->db_buf, tx); 2339168404Spjd 2340168404Spjd zio = dr->dr_zio; 2341168404Spjd mutex_enter(&dr->dt.di.dr_mtx); 2342168404Spjd dbuf_sync_list(&dr->dt.di.dr_children, tx); 2343168404Spjd ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2344168404Spjd mutex_exit(&dr->dt.di.dr_mtx); 2345168404Spjd zio_nowait(zio); 2346168404Spjd} 2347168404Spjd 2348168404Spjdstatic void 2349168404Spjddbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2350168404Spjd{ 2351168404Spjd arc_buf_t **datap = &dr->dt.dl.dr_data; 2352168404Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2353219089Spjd dnode_t *dn; 2354219089Spjd objset_t *os; 2355168404Spjd uint64_t txg = tx->tx_txg; 2356168404Spjd 2357168404Spjd ASSERT(dmu_tx_is_syncing(tx)); 2358168404Spjd 2359168404Spjd dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2360168404Spjd 2361168404Spjd mutex_enter(&db->db_mtx); 2362168404Spjd /* 2363168404Spjd * To be synced, we must be dirtied. But we 2364168404Spjd * might have been freed after the dirty. 
2365168404Spjd */ 2366168404Spjd if (db->db_state == DB_UNCACHED) { 2367168404Spjd /* This buffer has been freed since it was dirtied */ 2368168404Spjd ASSERT(db->db.db_data == NULL); 2369168404Spjd } else if (db->db_state == DB_FILL) { 2370168404Spjd /* This buffer was freed and is now being re-filled */ 2371168404Spjd ASSERT(db->db.db_data != dr->dt.dl.dr_data); 2372168404Spjd } else { 2373219089Spjd ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); 2374168404Spjd } 2375168404Spjd DBUF_VERIFY(db); 2376168404Spjd 2377219089Spjd DB_DNODE_ENTER(db); 2378219089Spjd dn = DB_DNODE(db); 2379219089Spjd 2380219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 2381219089Spjd mutex_enter(&dn->dn_mtx); 2382219089Spjd dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; 2383219089Spjd mutex_exit(&dn->dn_mtx); 2384219089Spjd } 2385219089Spjd 2386168404Spjd /* 2387168404Spjd * If this is a bonus buffer, simply copy the bonus data into the 2388168404Spjd * dnode. It will be written out when the dnode is synced (and it 2389168404Spjd * will be synced, since it must have been dirty for dbuf_sync to 2390168404Spjd * be called). 
2391168404Spjd */ 2392219089Spjd if (db->db_blkid == DMU_BONUS_BLKID) { 2393168404Spjd dbuf_dirty_record_t **drp; 2394185029Spjd 2395168404Spjd ASSERT(*datap != NULL); 2396240415Smm ASSERT0(db->db_level); 2397168404Spjd ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 2398168404Spjd bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 2399219089Spjd DB_DNODE_EXIT(db); 2400219089Spjd 2401185029Spjd if (*datap != db->db.db_data) { 2402168404Spjd zio_buf_free(*datap, DN_MAX_BONUSLEN); 2403208373Smm arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 2404185029Spjd } 2405168404Spjd db->db_data_pending = NULL; 2406168404Spjd drp = &db->db_last_dirty; 2407168404Spjd while (*drp != dr) 2408168404Spjd drp = &(*drp)->dr_next; 2409185029Spjd ASSERT(dr->dr_next == NULL); 2410219089Spjd ASSERT(dr->dr_dbuf == db); 2411185029Spjd *drp = dr->dr_next; 2412169325Spjd if (dr->dr_dbuf->db_level != 0) { 2413169325Spjd list_destroy(&dr->dt.di.dr_children); 2414169325Spjd mutex_destroy(&dr->dt.di.dr_mtx); 2415169325Spjd } 2416168404Spjd kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2417168404Spjd ASSERT(db->db_dirtycnt > 0); 2418168404Spjd db->db_dirtycnt -= 1; 2419219089Spjd dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); 2420168404Spjd return; 2421168404Spjd } 2422168404Spjd 2423219089Spjd os = dn->dn_objset; 2424219089Spjd 2425168404Spjd /* 2426185029Spjd * This function may have dropped the db_mtx lock allowing a dmu_sync 2427185029Spjd * operation to sneak in. As a result, we need to ensure that we 2428185029Spjd * don't check the dr_override_state until we have returned from 2429185029Spjd * dbuf_check_blkptr. 2430185029Spjd */ 2431185029Spjd dbuf_check_blkptr(dn, db); 2432185029Spjd 2433185029Spjd /* 2434219089Spjd * If this buffer is in the middle of an immediate write, 2435168404Spjd * wait for the synchronous IO to complete. 
2436168404Spjd */ 2437168404Spjd while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 2438168404Spjd ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 2439168404Spjd cv_wait(&db->db_changed, &db->db_mtx); 2440168404Spjd ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 2441168404Spjd } 2442168404Spjd 2443219089Spjd if (db->db_state != DB_NOFILL && 2444219089Spjd dn->dn_object != DMU_META_DNODE_OBJECT && 2445208050Smm refcount_count(&db->db_holds) > 1 && 2446219089Spjd dr->dt.dl.dr_override_state != DR_OVERRIDDEN && 2447208050Smm *datap == db->db_buf) { 2448168404Spjd /* 2449208050Smm * If this buffer is currently "in use" (i.e., there 2450208050Smm * are active holds and db_data still references it), 2451208050Smm * then make a copy before we start the write so that 2452208050Smm * any modifications from the open txg will not leak 2453208050Smm * into this write. 2454168404Spjd * 2455208050Smm * NOTE: this copy does not need to be made for 2456208050Smm * objects only modified in the syncing context (e.g. 2457208050Smm * DNONE_DNODE blocks). 2458168404Spjd */ 2459208050Smm int blksz = arc_buf_size(*datap); 2460208050Smm arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2461208050Smm *datap = arc_buf_alloc(os->os_spa, blksz, db, type); 2462208050Smm bcopy(db->db.db_data, (*datap)->b_data, blksz); 2463168404Spjd } 2464168404Spjd db->db_data_pending = dr; 2465168404Spjd 2466168404Spjd mutex_exit(&db->db_mtx); 2467168404Spjd 2468185029Spjd dbuf_write(dr, *datap, tx); 2469168404Spjd 2470168404Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 2471219089Spjd if (dn->dn_object == DMU_META_DNODE_OBJECT) { 2472168404Spjd list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 2473219089Spjd DB_DNODE_EXIT(db); 2474219089Spjd } else { 2475219089Spjd /* 2476219089Spjd * Although zio_nowait() does not "wait for an IO", it does 2477219089Spjd * initiate the IO. 
If this is an empty write it seems plausible 2478219089Spjd * that the IO could actually be completed before the nowait 2479219089Spjd * returns. We need to DB_DNODE_EXIT() first in case 2480219089Spjd * zio_nowait() invalidates the dbuf. 2481219089Spjd */ 2482219089Spjd DB_DNODE_EXIT(db); 2483168404Spjd zio_nowait(dr->dr_zio); 2484219089Spjd } 2485168404Spjd} 2486168404Spjd 2487168404Spjdvoid 2488168404Spjddbuf_sync_list(list_t *list, dmu_tx_t *tx) 2489168404Spjd{ 2490168404Spjd dbuf_dirty_record_t *dr; 2491168404Spjd 2492168404Spjd while (dr = list_head(list)) { 2493168404Spjd if (dr->dr_zio != NULL) { 2494168404Spjd /* 2495168404Spjd * If we find an already initialized zio then we 2496168404Spjd * are processing the meta-dnode, and we have finished. 2497168404Spjd * The dbufs for all dnodes are put back on the list 2498168404Spjd * during processing, so that we can zio_wait() 2499168404Spjd * these IOs after initiating all child IOs. 2500168404Spjd */ 2501168404Spjd ASSERT3U(dr->dr_dbuf->db.db_object, ==, 2502168404Spjd DMU_META_DNODE_OBJECT); 2503168404Spjd break; 2504168404Spjd } 2505168404Spjd list_remove(list, dr); 2506168404Spjd if (dr->dr_dbuf->db_level > 0) 2507168404Spjd dbuf_sync_indirect(dr, tx); 2508168404Spjd else 2509168404Spjd dbuf_sync_leaf(dr, tx); 2510168404Spjd } 2511168404Spjd} 2512168404Spjd 2513168404Spjd/* ARGSUSED */ 2514168404Spjdstatic void 2515168404Spjddbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 2516168404Spjd{ 2517168404Spjd dmu_buf_impl_t *db = vdb; 2518219089Spjd dnode_t *dn; 2519185029Spjd blkptr_t *bp = zio->io_bp; 2520168404Spjd blkptr_t *bp_orig = &zio->io_bp_orig; 2521219089Spjd spa_t *spa = zio->io_spa; 2522219089Spjd int64_t delta; 2523168404Spjd uint64_t fill = 0; 2524219089Spjd int i; 2525168404Spjd 2526268649Sdelphij ASSERT3P(db->db_blkptr, ==, bp); 2527185029Spjd 2528219089Spjd DB_DNODE_ENTER(db); 2529219089Spjd dn = DB_DNODE(db); 2530219089Spjd delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, 
bp_orig); 2531219089Spjd dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 2532219089Spjd zio->io_prev_space_delta = delta; 2533168404Spjd 2534263397Sdelphij if (bp->blk_birth != 0) { 2535263397Sdelphij ASSERT((db->db_blkid != DMU_SPILL_BLKID && 2536263397Sdelphij BP_GET_TYPE(bp) == dn->dn_type) || 2537263397Sdelphij (db->db_blkid == DMU_SPILL_BLKID && 2538268649Sdelphij BP_GET_TYPE(bp) == dn->dn_bonustype) || 2539268649Sdelphij BP_IS_EMBEDDED(bp)); 2540263397Sdelphij ASSERT(BP_GET_LEVEL(bp) == db->db_level); 2541168404Spjd } 2542168404Spjd 2543168404Spjd mutex_enter(&db->db_mtx); 2544168404Spjd 2545219089Spjd#ifdef ZFS_DEBUG 2546219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 2547219089Spjd ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2548219089Spjd ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2549219089Spjd db->db_blkptr == &dn->dn_phys->dn_spill); 2550219089Spjd } 2551219089Spjd#endif 2552219089Spjd 2553168404Spjd if (db->db_level == 0) { 2554168404Spjd mutex_enter(&dn->dn_mtx); 2555219089Spjd if (db->db_blkid > dn->dn_phys->dn_maxblkid && 2556219089Spjd db->db_blkid != DMU_SPILL_BLKID) 2557168404Spjd dn->dn_phys->dn_maxblkid = db->db_blkid; 2558168404Spjd mutex_exit(&dn->dn_mtx); 2559168404Spjd 2560168404Spjd if (dn->dn_type == DMU_OT_DNODE) { 2561168404Spjd dnode_phys_t *dnp = db->db.db_data; 2562168404Spjd for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 2563168404Spjd i--, dnp++) { 2564168404Spjd if (dnp->dn_type != DMU_OT_NONE) 2565168404Spjd fill++; 2566168404Spjd } 2567168404Spjd } else { 2568263397Sdelphij if (BP_IS_HOLE(bp)) { 2569263397Sdelphij fill = 0; 2570263397Sdelphij } else { 2571263397Sdelphij fill = 1; 2572263397Sdelphij } 2573168404Spjd } 2574168404Spjd } else { 2575185029Spjd blkptr_t *ibp = db->db.db_data; 2576168404Spjd ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2577185029Spjd for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 2578185029Spjd if (BP_IS_HOLE(ibp)) 2579168404Spjd continue; 
2580268649Sdelphij fill += BP_GET_FILL(ibp); 2581168404Spjd } 2582168404Spjd } 2583219089Spjd DB_DNODE_EXIT(db); 2584168404Spjd 2585268649Sdelphij if (!BP_IS_EMBEDDED(bp)) 2586268649Sdelphij bp->blk_fill = fill; 2587168404Spjd 2588168404Spjd mutex_exit(&db->db_mtx); 2589168404Spjd} 2590168404Spjd 2591260763Savg/* 2592260763Savg * The SPA will call this callback several times for each zio - once 2593260763Savg * for every physical child i/o (zio->io_phys_children times). This 2594260763Savg * allows the DMU to monitor the progress of each logical i/o. For example, 2595260763Savg * there may be 2 copies of an indirect block, or many fragments of a RAID-Z 2596260763Savg * block. There may be a long delay before all copies/fragments are completed, 2597260763Savg * so this callback allows us to retire dirty space gradually, as the physical 2598260763Savg * i/os complete. 2599260763Savg */ 2600168404Spjd/* ARGSUSED */ 2601168404Spjdstatic void 2602260763Savgdbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) 2603260763Savg{ 2604260763Savg dmu_buf_impl_t *db = arg; 2605260763Savg objset_t *os = db->db_objset; 2606260763Savg dsl_pool_t *dp = dmu_objset_pool(os); 2607260763Savg dbuf_dirty_record_t *dr; 2608260763Savg int delta = 0; 2609260763Savg 2610260763Savg dr = db->db_data_pending; 2611260763Savg ASSERT3U(dr->dr_txg, ==, zio->io_txg); 2612260763Savg 2613260763Savg /* 2614260763Savg * The callback will be called io_phys_children times. Retire one 2615260763Savg * portion of our dirty space each time we are called. Any rounding 2616260763Savg * error will be cleaned up by dsl_pool_sync()'s call to 2617260763Savg * dsl_pool_undirty_space(). 
2618260763Savg */ 2619260763Savg delta = dr->dr_accounted / zio->io_phys_children; 2620260763Savg dsl_pool_undirty_space(dp, delta, zio->io_txg); 2621260763Savg} 2622260763Savg 2623260763Savg/* ARGSUSED */ 2624260763Savgstatic void 2625168404Spjddbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 2626168404Spjd{ 2627168404Spjd dmu_buf_impl_t *db = vdb; 2628219089Spjd blkptr_t *bp_orig = &zio->io_bp_orig; 2629263397Sdelphij blkptr_t *bp = db->db_blkptr; 2630263397Sdelphij objset_t *os = db->db_objset; 2631263397Sdelphij dmu_tx_t *tx = os->os_synctx; 2632168404Spjd dbuf_dirty_record_t **drp, *dr; 2633168404Spjd 2634240415Smm ASSERT0(zio->io_error); 2635219089Spjd ASSERT(db->db_blkptr == bp); 2636168404Spjd 2637243524Smm /* 2638243524Smm * For nopwrites and rewrites we ensure that the bp matches our 2639243524Smm * original and bypass all the accounting. 2640243524Smm */ 2641243524Smm if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { 2642219089Spjd ASSERT(BP_EQUAL(bp, bp_orig)); 2643219089Spjd } else { 2644263397Sdelphij dsl_dataset_t *ds = os->os_dsl_dataset; 2645219089Spjd (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 2646219089Spjd dsl_dataset_block_born(ds, bp, tx); 2647219089Spjd } 2648219089Spjd 2649168404Spjd mutex_enter(&db->db_mtx); 2650168404Spjd 2651219089Spjd DBUF_VERIFY(db); 2652219089Spjd 2653168404Spjd drp = &db->db_last_dirty; 2654185029Spjd while ((dr = *drp) != db->db_data_pending) 2655185029Spjd drp = &dr->dr_next; 2656185029Spjd ASSERT(!list_link_active(&dr->dr_dirty_node)); 2657219089Spjd ASSERT(dr->dr_dbuf == db); 2658185029Spjd ASSERT(dr->dr_next == NULL); 2659185029Spjd *drp = dr->dr_next; 2660168404Spjd 2661219089Spjd#ifdef ZFS_DEBUG 2662219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) { 2663219089Spjd dnode_t *dn; 2664219089Spjd 2665219089Spjd DB_DNODE_ENTER(db); 2666219089Spjd dn = DB_DNODE(db); 2667219089Spjd ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2668219089Spjd 
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2669219089Spjd db->db_blkptr == &dn->dn_phys->dn_spill); 2670219089Spjd DB_DNODE_EXIT(db); 2671219089Spjd } 2672219089Spjd#endif 2673219089Spjd 2674168404Spjd if (db->db_level == 0) { 2675219089Spjd ASSERT(db->db_blkid != DMU_BONUS_BLKID); 2676168404Spjd ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 2677219089Spjd if (db->db_state != DB_NOFILL) { 2678219089Spjd if (dr->dt.dl.dr_data != db->db_buf) 2679219089Spjd VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 2680248571Smm db)); 2681219089Spjd else if (!arc_released(db->db_buf)) 2682219089Spjd arc_set_callback(db->db_buf, dbuf_do_evict, db); 2683219089Spjd } 2684168404Spjd } else { 2685219089Spjd dnode_t *dn; 2686168404Spjd 2687219089Spjd DB_DNODE_ENTER(db); 2688219089Spjd dn = DB_DNODE(db); 2689168404Spjd ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2690263397Sdelphij ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); 2691168404Spjd if (!BP_IS_HOLE(db->db_blkptr)) { 2692168404Spjd int epbs = 2693168404Spjd dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2694263397Sdelphij ASSERT3U(db->db_blkid, <=, 2695263397Sdelphij dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); 2696168404Spjd ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 2697168404Spjd db->db.db_size); 2698268649Sdelphij if (!arc_released(db->db_buf)) 2699268649Sdelphij arc_set_callback(db->db_buf, dbuf_do_evict, db); 2700168404Spjd } 2701219089Spjd DB_DNODE_EXIT(db); 2702185029Spjd mutex_destroy(&dr->dt.di.dr_mtx); 2703169325Spjd list_destroy(&dr->dt.di.dr_children); 2704168404Spjd } 2705168404Spjd kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2706168404Spjd 2707168404Spjd cv_broadcast(&db->db_changed); 2708168404Spjd ASSERT(db->db_dirtycnt > 0); 2709168404Spjd db->db_dirtycnt -= 1; 2710168404Spjd db->db_data_pending = NULL; 2711263397Sdelphij dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); 2712219089Spjd} 2713219089Spjd 2714219089Spjdstatic void 
2715219089Spjddbuf_write_nofill_ready(zio_t *zio) 2716219089Spjd{ 2717219089Spjd dbuf_write_ready(zio, NULL, zio->io_private); 2718219089Spjd} 2719219089Spjd 2720219089Spjdstatic void 2721219089Spjddbuf_write_nofill_done(zio_t *zio) 2722219089Spjd{ 2723219089Spjd dbuf_write_done(zio, NULL, zio->io_private); 2724219089Spjd} 2725219089Spjd 2726219089Spjdstatic void 2727219089Spjddbuf_write_override_ready(zio_t *zio) 2728219089Spjd{ 2729219089Spjd dbuf_dirty_record_t *dr = zio->io_private; 2730219089Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2731219089Spjd 2732219089Spjd dbuf_write_ready(zio, NULL, db); 2733219089Spjd} 2734219089Spjd 2735219089Spjdstatic void 2736219089Spjddbuf_write_override_done(zio_t *zio) 2737219089Spjd{ 2738219089Spjd dbuf_dirty_record_t *dr = zio->io_private; 2739219089Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2740219089Spjd blkptr_t *obp = &dr->dt.dl.dr_overridden_by; 2741219089Spjd 2742219089Spjd mutex_enter(&db->db_mtx); 2743219089Spjd if (!BP_EQUAL(zio->io_bp, obp)) { 2744219089Spjd if (!BP_IS_HOLE(obp)) 2745219089Spjd dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 2746219089Spjd arc_release(dr->dt.dl.dr_data, db); 2747219089Spjd } 2748168404Spjd mutex_exit(&db->db_mtx); 2749168404Spjd 2750219089Spjd dbuf_write_done(zio, NULL, db); 2751219089Spjd} 2752168404Spjd 2753251629Sdelphij/* Issue I/O to commit a dirty buffer to disk. 
*/ 2754219089Spjdstatic void 2755219089Spjddbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) 2756219089Spjd{ 2757219089Spjd dmu_buf_impl_t *db = dr->dr_dbuf; 2758219089Spjd dnode_t *dn; 2759219089Spjd objset_t *os; 2760219089Spjd dmu_buf_impl_t *parent = db->db_parent; 2761219089Spjd uint64_t txg = tx->tx_txg; 2762268657Sdelphij zbookmark_phys_t zb; 2763219089Spjd zio_prop_t zp; 2764219089Spjd zio_t *zio; 2765219089Spjd int wp_flag = 0; 2766219089Spjd 2767219089Spjd DB_DNODE_ENTER(db); 2768219089Spjd dn = DB_DNODE(db); 2769219089Spjd os = dn->dn_objset; 2770219089Spjd 2771219089Spjd if (db->db_state != DB_NOFILL) { 2772219089Spjd if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { 2773219089Spjd /* 2774219089Spjd * Private object buffers are released here rather 2775219089Spjd * than in dbuf_dirty() since they are only modified 2776219089Spjd * in the syncing context and we don't want the 2777219089Spjd * overhead of making multiple copies of the data. 2778219089Spjd */ 2779219089Spjd if (BP_IS_HOLE(db->db_blkptr)) { 2780219089Spjd arc_buf_thaw(data); 2781219089Spjd } else { 2782219089Spjd dbuf_release_bp(db); 2783219089Spjd } 2784219089Spjd } 2785219089Spjd } 2786219089Spjd 2787219089Spjd if (parent != dn->dn_dbuf) { 2788251629Sdelphij /* Our parent is an indirect block. */ 2789251629Sdelphij /* We have a dirty parent that has been scheduled for write. */ 2790219089Spjd ASSERT(parent && parent->db_data_pending); 2791251629Sdelphij /* Our parent's buffer is one level closer to the dnode. */ 2792219089Spjd ASSERT(db->db_level == parent->db_level-1); 2793251629Sdelphij /* 2794251629Sdelphij * We're about to modify our parent's db_data by modifying 2795251629Sdelphij * our block pointer, so the parent must be released. 2796251629Sdelphij */ 2797219089Spjd ASSERT(arc_released(parent->db_buf)); 2798219089Spjd zio = parent->db_data_pending->dr_zio; 2799219089Spjd } else { 2800251629Sdelphij /* Our parent is the dnode itself. 
*/ 2801219089Spjd ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && 2802219089Spjd db->db_blkid != DMU_SPILL_BLKID) || 2803219089Spjd (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); 2804219089Spjd if (db->db_blkid != DMU_SPILL_BLKID) 2805219089Spjd ASSERT3P(db->db_blkptr, ==, 2806219089Spjd &dn->dn_phys->dn_blkptr[db->db_blkid]); 2807219089Spjd zio = dn->dn_zio; 2808219089Spjd } 2809219089Spjd 2810219089Spjd ASSERT(db->db_level == 0 || data == db->db_buf); 2811219089Spjd ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 2812219089Spjd ASSERT(zio); 2813219089Spjd 2814219089Spjd SET_BOOKMARK(&zb, os->os_dsl_dataset ? 2815219089Spjd os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 2816219089Spjd db->db.db_object, db->db_level, db->db_blkid); 2817219089Spjd 2818219089Spjd if (db->db_blkid == DMU_SPILL_BLKID) 2819219089Spjd wp_flag = WP_SPILL; 2820219089Spjd wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; 2821219089Spjd 2822219089Spjd dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); 2823219089Spjd DB_DNODE_EXIT(db); 2824219089Spjd 2825268649Sdelphij if (db->db_level == 0 && 2826268649Sdelphij dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 2827268649Sdelphij /* 2828268649Sdelphij * The BP for this block has been provided by open context 2829268649Sdelphij * (by dmu_sync() or dmu_buf_write_embedded()). 2830268649Sdelphij */ 2831268649Sdelphij void *contents = (data != NULL) ? 
data->b_data : NULL; 2832268649Sdelphij 2833219089Spjd dr->dr_zio = zio_write(zio, os->os_spa, txg, 2834268649Sdelphij db->db_blkptr, contents, db->db.db_size, &zp, 2835260763Savg dbuf_write_override_ready, NULL, dbuf_write_override_done, 2836260763Savg dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2837219089Spjd mutex_enter(&db->db_mtx); 2838219089Spjd dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 2839219089Spjd zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 2840243524Smm dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); 2841219089Spjd mutex_exit(&db->db_mtx); 2842219089Spjd } else if (db->db_state == DB_NOFILL) { 2843255750Sdelphij ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || 2844255750Sdelphij zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); 2845219089Spjd dr->dr_zio = zio_write(zio, os->os_spa, txg, 2846219089Spjd db->db_blkptr, NULL, db->db.db_size, &zp, 2847260763Savg dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, 2848219089Spjd ZIO_PRIORITY_ASYNC_WRITE, 2849219089Spjd ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); 2850219089Spjd } else { 2851219089Spjd ASSERT(arc_released(data)); 2852219089Spjd dr->dr_zio = arc_write(zio, os->os_spa, txg, 2853251478Sdelphij db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), 2854251478Sdelphij DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, 2855260763Savg dbuf_write_physdone, dbuf_write_done, db, 2856260763Savg ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2857219089Spjd } 2858168404Spjd} 2859