dbuf.c revision 269845

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_send.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/blkptr.h>
#include <sys/range_tree.h>

/*
 * Number of times that zfs_free_range() took the slow path while doing
 * a zfs receive.  A nonzero value indicates a potential performance problem.
 */
uint64_t zfs_free_range_recv_miss;

static void dbuf_destroy(dmu_buf_impl_t *db);
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_cache;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);

#if defined(illumos) || !defined(_KERNEL)
	db->db_creation = gethrtime();
#else
	db->db_creation = cpu_ticks() ^ ((uint64_t)CPU_SEQID << 48);
#endif

	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}

/*
 * dbuf hash table routines
 */
static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	crc ^= (osv >> 14) ^ (obj >> 16) ^ (blkid >> 16);

	return (crc);
}

/* Note: no trailing semicolon here; it would expand into callers. */
#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))

dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}
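
/*
 * Illustrative sketch (not part of the upstream source): a lookup maps
 * the (objset, object, level, blkid) tuple to a single hash bucket and
 * walks only that bucket's chain, e.g.:
 *
 *	uint64_t hv = DBUF_HASH(dn->dn_objset, dn->dn_object, 0, blkid);
 *	uint64_t idx = hv & dbuf_hash_table.hash_table_mask;
 *	(walk hash_table[idx] via db_hash_next under DBUF_HASH_MUTEX(h, idx))
 *
 * so contention is limited to dbufs that collide in the same bucket.
 */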

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

	return (NULL);
}

/*
 * Remove an entry from the hash table.  It must be in the EVICTING state.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, -1);
}

static arc_evict_func_t dbuf_do_evict;

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level != 0 || db->db_evict_func == NULL)
		return;

	if (db->db_user_data_ptr_ptr)
		*db->db_user_data_ptr_ptr = db->db.db_data;
	db->db_evict_func(&db->db, db->db_user_ptr);
	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
}

boolean_t
dbuf_is_metadata(dmu_buf_impl_t *db)
{
	if (db->db_level > 0) {
		return (B_TRUE);
	} else {
		boolean_t is_metadata;

		DB_DNODE_ENTER(db);
		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
		DB_DNODE_EXIT(db);

		return (is_metadata);
	}
}

void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}

void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size.  The table will take up
	 * totalmem * sizeof (void *) / 4K (i.e. 2MB/GB with 8-byte
	 * pointers).
	 */
	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
		hsize <<= 1;

retry:
	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		hsize >>= 1;
		goto retry;
	}

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
}

void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_cache);
}
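
/*
 * Worked example of the sizing loop in dbuf_init() (illustration only):
 * with 16 GB of physical memory the loop stops once hsize * 4096 reaches
 * 16 GB, i.e. hsize = 1 << 22 (4M buckets).  With 8-byte pointers the
 * table then occupies 4M * 8 = 32 MB, matching the 2MB-per-GB figure
 * quoted in the comment above.
 */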

/*
 * Other stuff.
 */

#ifdef ZFS_DEBUG
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !avl_is_empty(&dn->dn_dbufs));
	}
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

static void
dbuf_update_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
		ASSERT(!refcount_is_zero(&db->db_holds));
		*db->db_user_data_ptr_ptr = db->db.db_data;
	}
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	db->db_buf = buf;
	if (buf != NULL) {
		ASSERT(buf->b_data != NULL);
		db->db.db_data = buf->b_data;
		if (!arc_released(buf))
			arc_set_callback(buf, dbuf_do_evict, db);
		dbuf_update_data(db);
	} else {
		dbuf_evict_user(db);
		db->db.db_data = NULL;
		if (db->db_state != DB_NOFILL)
			db->db_state = DB_UNCACHED;
	}
}

/*
 * Loan out an arc_buf for read.  Return the loaned arc_buf.
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		int blksz = db->db.db_size;
		spa_t *spa = db->db_objset->os_spa;

		mutex_exit(&db->db_mtx);
		abuf = arc_loan_buf(spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		dbuf_set_data(db, NULL);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}

uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
	if (dn->dn_datablkshift) {
		return (offset >> dn->dn_datablkshift);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}
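
/*
 * Example for dbuf_whichblock() (illustration only): for a dnode with
 * 128K data blocks, dn_datablkshift == 17, so offset 1 MB maps to block
 * 1048576 >> 17 == 8.  Objects whose block size is not a power of two
 * keep dn_datablkshift == 0 and must fit in a single block, which is
 * what the ASSERT in the else branch checks.
 */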

static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL);
}

static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn;
	zbookmark_phys_t zb;
	uint32_t aflags = ARC_NOWAIT;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		DB_DNODE_EXIT(db);
		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_L2CACHE;
	if (DBUF_IS_L2COMPRESSIBLE(db))
		aflags |= ARC_L2COMPRESS;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	dbuf_add_ref(db, NULL);

	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}

int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	boolean_t havepzio = (zio != NULL);
	boolean_t prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (SET_ERROR(EIO));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/*
		 * Another reader came in while the dbuf was in flight
		 * between UNCACHED and CACHED.  Either a writer will finish
		 * writing the buffer (sending the dbuf to CACHED) or the
		 * first reader's request will reach the read_done callback
		 * and send the dbuf to CACHED.  Otherwise, a failure
		 * occurred and the dbuf went to UNCACHED.
		 */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* Skip the wait per the caller's request. */
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_set_data(db, NULL);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function.  It makes a copy of buffers
 * that have been modified in a previous transaction group before we
 * modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a buffer
 * for the first time in a txg, and when we are freeing a range in a
 * dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do not put a
 * hold on the buffer, we just traverse the active dbuf list for the
 * dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}
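
/*
 * Worked example for dbuf_fix_old_data() (illustration only): suppose
 * txg 8 is syncing a dirty record whose dr_data still aliases db_buf,
 * and open context dirties the same dbuf again in txg 9.  The
 * refcount_count() branch above then gives txg 8 a private copy:
 *
 *	dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
 *	bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
 *
 * so nothing written in txg 9 can leak into the block image that txg 8
 * is still writing out.
 */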

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
		zio_free(db->db_objset->os_spa, txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.
 *
 * This is a no-op if the dataset is in the middle of an incremental
 * receive; see comment below for details.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next, db_search;
	uint64_t txg = tx->tx_txg;
	avl_index_t where;

	if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
		end_blkid = dn->dn_maxblkid;
	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);

	db_search.db_level = 0;
	db_search.db_blkid = start_blkid;
	db_search.db_creation = 0;

	mutex_enter(&dn->dn_dbufs_mtx);
	if (start_blkid >= dn->dn_unlisted_l0_blkid) {
		/* There can't be any dbufs in this range; no need to search. */
#ifdef DEBUG
		db = avl_find(&dn->dn_dbufs, &db_search, &where);
		ASSERT3P(db, ==, NULL);
		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
		ASSERT(db == NULL || db->db_level > 0);
#endif
		mutex_exit(&dn->dn_dbufs_mtx);
		return;
	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
		/*
		 * If we are receiving, we expect there to be no dbufs in
		 * the range to be freed, because receive modifies each
		 * block at most once, and in offset order.  If this is
		 * not the case, it can lead to performance problems,
		 * so note that we unexpectedly took the slow path.
		 */
		atomic_inc_64(&zfs_free_range_recv_miss);
	}

	db = avl_find(&dn->dn_dbufs, &db_search, &where);
	ASSERT3P(db, ==, NULL);
	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

	for (; db != NULL; db = db_next) {
		db_next = AVL_NEXT(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		if (db->db_level != 0 || db->db_blkid > end_blkid) {
			break;
		}
		ASSERT3U(db->db_blkid, >=, start_blkid);

		/* found a level 0 buffer in the range */
		mutex_enter(&db->db_mtx);
		if (dbuf_undirty(db, tx)) {
			/* mutex has been dropped and dbuf destroyed */
			continue;
		}

		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if it's cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}
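
/*
 * Usage sketch (illustration only): dbuf_rm_spill(), later in this file,
 * frees just the spill block by passing DMU_SPILL_BLKID as both
 * endpoints of the range:
 *
 *	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
 */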

static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 *
	 * This logic ensures that only block births for
	 * filled blocks are considered.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty && (db->db_blkptr == NULL ||
	    !BP_IS_HOLE(db->db_blkptr))) {
		birth_txg = db->db_last_dirty->dr_txg;
	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
		birth_txg = db->db_blkptr->blk_birth;
	}

	/*
	 * If this block doesn't exist or is in a snapshot, it can't be freed.
	 * Don't pass the bp to dsl_dataset_block_freeable() since we
	 * are holding the db_mtx lock and might deadlock if we are
	 * prefetching a dedup-ed block.
	 */
	if (birth_txg != 0)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
	else
		return (B_FALSE);
}
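
/*
 * Worked example (illustration only): if the dataset's most recent
 * snapshot was taken at txg 7, a block with birth_txg 5 belongs to that
 * snapshot and dsl_dataset_block_freeable() reports it as not freeable,
 * while a block with birth_txg 9 was born after the snapshot and can be
 * reclaimed immediately.
 */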

void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dmu_buf_will_dirty(&db->db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db));
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(dn, size - osize, tx);
	DB_DNODE_EXIT(db);
}

void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os = db->db_objset;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	(void) arc_release(db->db_buf, db);
}
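
/*
 * Caller sketch for dbuf_new_size() (illustration only):
 * dbuf_spill_set_blksz(), later in this file, is the typical consumer;
 * it takes dn_struct_rwlock as writer before resizing:
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 *	dbuf_new_size(db, blksz, tx);
 *	rw_exit(&dn->dn_struct_rwlock);
 */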

dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	objset_t *os;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    dn->dn_objset->os_dsl_dataset == NULL);
	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
	    db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	if (db->db_blkid == DMU_SPILL_BLKID)
		dn->dn_have_spill = B_TRUE;

	/*
	 * If this buffer is already dirty, we're done.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		DB_DNODE_EXIT(db);

		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(dr);
			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
			    db->db_state != DB_NOFILL)
				arc_buf_thaw(db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		return (dr);
	}
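
	/*
	 * Illustrative note (not upstream commentary): db_last_dirty is
	 * kept sorted by descending txg, e.g. with open txg 12 and a
	 * still-syncing txg 11:
	 *
	 *	db_last_dirty -> dr(txg 12) -> dr(txg 11) -> NULL
	 *
	 * The walk above stops at the first record with
	 * dr_txg <= tx->tx_txg; the early return above handles a
	 * re-dirty of the same txg, and the code below creates a new
	 * record otherwise.
	 */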

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_object == 0 ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos or we're initializing the os or it's a special object.
	 * However, we are allowed to dirty in syncing context provided
	 * we already dirtied it in open context.  Hence we must make
	 * this assertion only if we're not already dirty.
	 */
	os = dn->dn_objset;
	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx.  This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dsize() while
		 * also holding the db_mtx.
		 */
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_state != DB_NOFILL) {
			if (db->db_blkid == DMU_BONUS_BLKID) {
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db.db_data;
			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
				/*
				 * Release the data buffer from the cache so
				 * that we can modify it without impacting
				 * possible other users of this cached data
				 * block.  Note that indirect blocks and
				 * private objects are not released until the
				 * syncing state (since they are only modified
				 * then).
				 */
				arc_release(db->db_buf, db);
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db_buf;
			}
			ASSERT(data_old != NULL);
		}
		dr->dt.dl.dr_data = data_old;
	} else {
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
		dr->dr_accounted = db->db.db_size;
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_blkid != DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_free_ranges[txgoff] != NULL) {
			range_tree_clear(dn->dn_free_ranges[txgoff],
			    db->db_blkid, 1);
		}
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		DB_DNODE_EXIT(db);
		return (dr);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
		/*
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet.  We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
		 */
		ddt_prefetch(os->os_spa, bp);
		dnode_willuse_space(dn, -willfree, tx);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}
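
	/*
	 * Sketch of the recursion that follows (illustration only): with
	 * 16K indirect blocks, epbs == 14 - SPA_BLKPTRSHIFT == 7, so
	 * dirtying L0 blkid 137 dirties the L1 parent at blkid
	 * 137 >> 7 == 1, which recursively dirties its own parent, up to
	 * the dnode itself.
	 */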

	if (db->db_level+1 < dn->dn_nlevels) {
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			ASSERT(parent != NULL);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/*
		 * Since we've dropped the mutex, it's possible that
		 * dbuf_undirty() might have changed this out from under us.
		 */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}
		mutex_exit(&db->db_mtx);
	} else {
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
	DB_DNODE_EXIT(db);
	return (dr);
}
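
/*
 * Typical call path (illustration only): open-context writers rarely
 * call dbuf_dirty() directly; dmu_buf_will_dirty() below performs the
 * read-then-dirty sequence:
 *
 *	(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH);
 *	(void) dbuf_dirty(db, tx);
 */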

/*
 * Undirty a buffer in the transaction group referenced by the given
 * transaction.  Return whether this evicted the dbuf.
 */
static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(txg != 0);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT0(db->db_level);
	ASSERT(MUTEX_HELD(&db->db_mtx));

	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg)
		return (B_FALSE);
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/*
	 * Any space we accounted for in dp_dirty_* will be cleaned up by
	 * dsl_pool_sync().  This is relatively rare so the discrepancy
	 * is not a big deal.
	 */

	*drp = dr->dr_next;

	/*
	 * Note that there are three places in dbuf_dirty()
	 * where this dirty record may be put on a list.
	 * Make sure to do a list_remove corresponding to
	 * every one of those list_insert calls.
	 */
	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_blkid == DMU_SPILL_BLKID ||
	    db->db_level+1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}
	DB_DNODE_EXIT(db);

	if (db->db_state != DB_NOFILL) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
	}

	if (db->db_level != 0) {
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}

	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		dbuf_evict(db);
		return (B_TRUE);
	}

	return (B_FALSE);
}
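
/*
 * Note on the hold accounting above (illustration only): each dirty txg
 * owns one tagged hold on the dbuf, taken in dbuf_dirty() as
 *
 *	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
 *
 * and dropped here via refcount_remove() with the same txg tag, so a
 * result of zero means no dirty txg or user reference keeps the dbuf
 * alive and it can be evicted.
 */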

void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	DB_DNODE_ENTER(db);
	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	DB_DNODE_EXIT(db);
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty(db, tx);
}

void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_state = DB_NOFILL;

	dmu_buf_will_fill(db_fake, tx);
}

void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}

#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}
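
/*
 * Typical fill cycle (sketch, assuming a caller that overwrites the
 * whole block, as dmu_write() does):
 *
 *	dmu_buf_will_fill(db_fake, tx);
 *	bcopy(src, db->db.db_data, db->db.db_size);
 *	dmu_buf_fill_done(db_fake, tx);
 *
 * dmu_buf_will_fill() skips the read since every byte is about to be
 * overwritten; dmu_buf_fill_done() moves the dbuf to DB_CACHED and
 * wakes any waiters.
 */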

void
dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
    bp_embedded_type_t etype, enum zio_compress comp,
    int uncompressed_size, int compressed_size, int byteorder,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
	struct dirty_leaf *dl;
	dmu_object_type_t type;

	DB_DNODE_ENTER(db);
	type = DB_DNODE(db)->dn_type;
	DB_DNODE_EXIT(db);

	ASSERT0(db->db_level);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	dmu_buf_will_not_fill(dbuf, tx);

	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
	dl = &db->db_last_dirty->dt.dl;
	encode_embedded_bp_compressed(&dl->dr_overridden_by,
	    data, comp, uncompressed_size, compressed_size);
	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
	BP_SET_TYPE(&dl->dr_overridden_by, type);
	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);

	dl->dr_override_state = DR_OVERRIDDEN;
	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
}

/*
 * Directly assign a provided arc buf to a given dbuf if it's not referenced
 * by anybody except our caller.  Otherwise copy arcbuf's contents to dbuf.
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db));
		xuio_stat_wbuf_copied();
		return;
	}

	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dmu_buf_fill_done(&db->db, tx);
}
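
/*
 * Zero-copy write sketch (illustration, in the style of a
 * dmu_assign_arcbuf()-type caller): the buffer is loaned from the ARC,
 * filled, then handed to the dbuf without a bcopy in the common case:
 *
 *	arc_buf_t *buf = arc_loan_buf(spa, blksz);
 *	... fill buf->b_data ...
 *	dbuf_assign_arcbuf(db, buf, tx);
 */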

/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.  For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_clear_callback()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 *
 * This routine will dissociate the dbuf from the arc, by calling
 * arc_clear_callback(), but will not evict the data from the ARC.
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb;
	boolean_t dbuf_gone = B_FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DMU_BONUS_BLKID) {
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dndb = dn->dn_dbuf;
	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		avl_remove(&dn->dn_dbufs, db);
		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
		membar_producer();
		DB_DNODE_EXIT(db);
		/*
		 * Decrementing the dbuf count means that the hold corresponding
		 * to the removed dbuf is no longer discounted in dnode_move(),
		 * so the dnode cannot be moved until after we release the hold.
		 * The membar_producer() ensures visibility of the decremented
		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
		 * release any lock.
		 */
		dnode_rele(dn, db);
		db->db_dnode_handle = NULL;
	} else {
		DB_DNODE_EXIT(db);
	}

	if (db->db_buf)
		dbuf_gone = arc_clear_callback(db->db_buf);

	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);

	if (blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_have_spill &&
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
			*bpp = &dn->dn_phys->dn_spill;
		else
			*bpp = NULL;
		dbuf_add_ref(dn->dn_dbuf, NULL);
		*parentp = dn->dn_dbuf;
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (SET_ERROR(ENOENT));
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}
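
/*
 * Worked example for dbuf_findbp() (illustration only): with 16K
 * indirect blocks, epbs == 7 (128 block pointers per indirect block),
 * so the parent of L0 blkid 1000 is the L1 dbuf at blkid
 * 1000 >> 7 == 7, and the bp of interest is entry 1000 & 127 == 104
 * within that indirect block.
 */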

static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before it's added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	avl_add(&dn->dn_dbufs, db);
	if (db->db_level == 0 && db->db_blkid >=
	    dn->dn_unlisted_l0_blkid)
		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);
	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}
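
/*
 * Worked example of the bonus sizing above (illustration only): with
 * DN_MAX_BONUSLEN == 320 and 128-byte block pointers, a dnode with
 * dn_nblkptr == 3 gets a bonus dbuf of 320 - 2 * 128 == 64 bytes, since
 * the second and third block pointers overlap the bonus area.
 */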
1878 */ 1879 mutex_exit(&db->db_mtx); 1880 return; 1881 } 1882 1883 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { 1884 if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { 1885 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 1886 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; 1887 zbookmark_phys_t zb; 1888 1889 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 1890 dn->dn_object, 0, blkid); 1891 1892 (void) arc_read(NULL, dn->dn_objset->os_spa, 1893 bp, NULL, NULL, prio, 1894 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1895 &aflags, &zb); 1896 } 1897 if (db) 1898 dbuf_rele(db, NULL); 1899 } 1900} 1901 1902/* 1903 * Returns with db_holds incremented, and db_mtx not held. 1904 * Note: dn_struct_rwlock must be held. 1905 */ 1906int 1907dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, 1908 void *tag, dmu_buf_impl_t **dbp) 1909{ 1910 dmu_buf_impl_t *db, *parent = NULL; 1911 1912 ASSERT(blkid != DMU_BONUS_BLKID); 1913 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1914 ASSERT3U(dn->dn_nlevels, >, level); 1915 1916 *dbp = NULL; 1917top: 1918 /* dbuf_find() returns with db_mtx held */ 1919 db = dbuf_find(dn, level, blkid); 1920 1921 if (db == NULL) { 1922 blkptr_t *bp = NULL; 1923 int err; 1924 1925 ASSERT3P(parent, ==, NULL); 1926 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 1927 if (fail_sparse) { 1928 if (err == 0 && bp && BP_IS_HOLE(bp)) 1929 err = SET_ERROR(ENOENT); 1930 if (err) { 1931 if (parent) 1932 dbuf_rele(parent, NULL); 1933 return (err); 1934 } 1935 } 1936 if (err && err != ENOENT) 1937 return (err); 1938 db = dbuf_create(dn, level, blkid, parent, bp); 1939 } 1940 1941 if (db->db_buf && refcount_is_zero(&db->db_holds)) { 1942 arc_buf_add_ref(db->db_buf, db); 1943 if (db->db_buf->b_data == NULL) { 1944 dbuf_clear(db); 1945 if (parent) { 1946 dbuf_rele(parent, NULL); 1947 parent = NULL; 1948 } 1949 goto top; 1950 } 1951 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 1952 } 1953 1954 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 1955 1956 /* 1957 * If this buffer is currently syncing out, and we are 1958 * still referencing it from db_data, we need to make a copy 1959 * of it in case we decide we want to dirty it again in this txg. 1960 */ 1961 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1962 dn->dn_object != DMU_META_DNODE_OBJECT && 1963 db->db_state == DB_CACHED && db->db_data_pending) { 1964 dbuf_dirty_record_t *dr = db->db_data_pending; 1965 1966 if (dr->dt.dl.dr_data == db->db_buf) { 1967 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1968 1969 dbuf_set_data(db, 1970 arc_buf_alloc(dn->dn_objset->os_spa, 1971 db->db.db_size, db, type)); 1972 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 1973 db->db.db_size); 1974 } 1975 } 1976 1977 (void) refcount_add(&db->db_holds, tag); 1978 dbuf_update_data(db); 1979 DBUF_VERIFY(db); 1980 mutex_exit(&db->db_mtx); 1981 1982 /* NOTE: we can't rele the parent until after we drop the db_mtx */ 1983 if (parent) 1984 dbuf_rele(parent, NULL); 1985 1986 ASSERT3P(DB_DNODE(db), ==, dn); 1987 ASSERT3U(db->db_blkid, ==, blkid); 1988 ASSERT3U(db->db_level, ==, level); 1989 *dbp = db; 1990 1991 return (0); 1992} 1993 1994dmu_buf_impl_t * 1995dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 1996{ 1997 dmu_buf_impl_t *db; 1998 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); 1999 return (err ? 
NULL : db); 2000} 2001 2002dmu_buf_impl_t * 2003dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 2004{ 2005 dmu_buf_impl_t *db; 2006 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 2007 return (err ? NULL : db); 2008} 2009 2010void 2011dbuf_create_bonus(dnode_t *dn) 2012{ 2013 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 2014 2015 ASSERT(dn->dn_bonus == NULL); 2016 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 2017} 2018 2019int 2020dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) 2021{ 2022 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2023 dnode_t *dn; 2024 2025 if (db->db_blkid != DMU_SPILL_BLKID) 2026 return (SET_ERROR(ENOTSUP)); 2027 if (blksz == 0) 2028 blksz = SPA_MINBLOCKSIZE; 2029 if (blksz > SPA_MAXBLOCKSIZE) 2030 blksz = SPA_MAXBLOCKSIZE; 2031 else 2032 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 2033 2034 DB_DNODE_ENTER(db); 2035 dn = DB_DNODE(db); 2036 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2037 dbuf_new_size(db, blksz, tx); 2038 rw_exit(&dn->dn_struct_rwlock); 2039 DB_DNODE_EXIT(db); 2040 2041 return (0); 2042} 2043 2044void 2045dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) 2046{ 2047 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 2048} 2049 2050#pragma weak dmu_buf_add_ref = dbuf_add_ref 2051void 2052dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 2053{ 2054 int64_t holds = refcount_add(&db->db_holds, tag); 2055 ASSERT(holds > 1); 2056} 2057 2058/* 2059 * If you call dbuf_rele() you had better not be referencing the dnode handle 2060 * unless you have some other direct or indirect hold on the dnode. (An indirect 2061 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 2062 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 2063 * dnode's parent dbuf evicting its dnode handles. 2064 */ 2065void 2066dbuf_rele(dmu_buf_impl_t *db, void *tag) 2067{ 2068 mutex_enter(&db->db_mtx); 2069 dbuf_rele_and_unlock(db, tag); 2070} 2071 2072void 2073dmu_buf_rele(dmu_buf_t *db, void *tag) 2074{ 2075 dbuf_rele((dmu_buf_impl_t *)db, tag); 2076} 2077 2078/* 2079 * dbuf_rele() for an already-locked dbuf. This is necessary to allow 2080 * db_dirtycnt and db_holds to be updated atomically. 2081 */ 2082void 2083dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 2084{ 2085 int64_t holds; 2086 2087 ASSERT(MUTEX_HELD(&db->db_mtx)); 2088 DBUF_VERIFY(db); 2089 2090 /* 2091 * Remove the reference to the dbuf before removing its hold on the 2092 * dnode so we can guarantee in dnode_move() that a referenced bonus 2093 * buffer has a corresponding dnode hold. 2094 */ 2095 holds = refcount_remove(&db->db_holds, tag); 2096 ASSERT(holds >= 0); 2097 2098 /* 2099 * We can't freeze indirects if there is a possibility that they 2100 * may be modified in the current syncing context. 2101 */ 2102 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) 2103 arc_buf_freeze(db->db_buf); 2104 2105 if (holds == db->db_dirtycnt && 2106 db->db_level == 0 && db->db_immediate_evict) 2107 dbuf_evict_user(db); 2108 2109 if (holds == 0) { 2110 if (db->db_blkid == DMU_BONUS_BLKID) { 2111 mutex_exit(&db->db_mtx); 2112 2113 /* 2114 * If the dnode moves here, we cannot cross this barrier 2115 * until the move completes. 2116 */ 2117 DB_DNODE_ENTER(db); 2118 (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count); 2119 DB_DNODE_EXIT(db); 2120 /* 2121 * The bonus buffer's dnode hold is no longer discounted 2122 * in dnode_move(). 
The dnode cannot move until after 2123 * the dnode_rele(). 2124 */ 2125 dnode_rele(DB_DNODE(db), db); 2126 } else if (db->db_buf == NULL) { 2127 /* 2128 * This is a special case: we never associated this 2129 * dbuf with any data allocated from the ARC. 2130 */ 2131 ASSERT(db->db_state == DB_UNCACHED || 2132 db->db_state == DB_NOFILL); 2133 dbuf_evict(db); 2134 } else if (arc_released(db->db_buf)) { 2135 arc_buf_t *buf = db->db_buf; 2136 /* 2137 * This dbuf has anonymous data associated with it. 2138 */ 2139 dbuf_set_data(db, NULL); 2140 VERIFY(arc_buf_remove_ref(buf, db)); 2141 dbuf_evict(db); 2142 } else { 2143 VERIFY(!arc_buf_remove_ref(db->db_buf, db)); 2144 2145 /* 2146 * A dbuf will be eligible for eviction if either the 2147 * 'primarycache' property is set or a duplicate 2148 * copy of this buffer is already cached in the arc. 2149 * 2150 * In the case of the 'primarycache' property, a buffer 2151 * is considered for eviction if it matches the 2152 * criteria set in the property. 2153 * 2154 * To decide if our buffer is considered a 2155 * duplicate, we must call into the arc to determine 2156 * if multiple buffers are referencing the same 2157 * block on-disk. If so, then we simply evict 2158 * ourselves. 2159 */ 2160 if (!DBUF_IS_CACHEABLE(db)) { 2161 if (db->db_blkptr != NULL && 2162 !BP_IS_HOLE(db->db_blkptr) && 2163 !BP_IS_EMBEDDED(db->db_blkptr)) { 2164 spa_t *spa = 2165 dmu_objset_spa(db->db_objset); 2166 blkptr_t bp = *db->db_blkptr; 2167 dbuf_clear(db); 2168 arc_freed(spa, &bp); 2169 } else { 2170 dbuf_clear(db); 2171 } 2172 } else if (arc_buf_eviction_needed(db->db_buf)) { 2173 dbuf_clear(db); 2174 } else { 2175 mutex_exit(&db->db_mtx); 2176 } 2177 } 2178 } else { 2179 mutex_exit(&db->db_mtx); 2180 } 2181} 2182 2183#pragma weak dmu_buf_refcount = dbuf_refcount 2184uint64_t 2185dbuf_refcount(dmu_buf_impl_t *db) 2186{ 2187 return (refcount_count(&db->db_holds)); 2188} 2189 2190void * 2191dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 2192 dmu_buf_evict_func_t *evict_func) 2193{ 2194 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 2195 user_data_ptr_ptr, evict_func)); 2196} 2197 2198void * 2199dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 2200 dmu_buf_evict_func_t *evict_func) 2201{ 2202 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2203 2204 db->db_immediate_evict = TRUE; 2205 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 2206 user_data_ptr_ptr, evict_func)); 2207} 2208 2209void * 2210dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 2211 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) 2212{ 2213 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2214 ASSERT(db->db_level == 0); 2215 2216 ASSERT((user_ptr == NULL) == (evict_func == NULL)); 2217 2218 mutex_enter(&db->db_mtx); 2219 2220 if (db->db_user_ptr == old_user_ptr) { 2221 db->db_user_ptr = user_ptr; 2222 db->db_user_data_ptr_ptr = user_data_ptr_ptr; 2223 db->db_evict_func = evict_func; 2224 2225 dbuf_update_data(db); 2226 } else { 2227 old_user_ptr = db->db_user_ptr; 2228 } 2229 2230 mutex_exit(&db->db_mtx); 2231 return (old_user_ptr); 2232} 2233 2234void * 2235dmu_buf_get_user(dmu_buf_t *db_fake) 2236{ 2237 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2238 ASSERT(!refcount_is_zero(&db->db_holds)); 2239 2240 return (db->db_user_ptr); 2241} 2242 2243boolean_t 2244dmu_buf_freeable(dmu_buf_t *dbuf) 2245{ 2246 boolean_t res = B_FALSE; 2247 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 2248 2249 if (db->db_blkptr) 2250 
res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, 2251 db->db_blkptr, db->db_blkptr->blk_birth); 2252 2253 return (res); 2254} 2255 2256blkptr_t * 2257dmu_buf_get_blkptr(dmu_buf_t *db) 2258{ 2259 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2260 return (dbi->db_blkptr); 2261} 2262 2263static void 2264dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) 2265{ 2266 /* ASSERT(dmu_tx_is_syncing(tx)) */ 2267 ASSERT(MUTEX_HELD(&db->db_mtx)); 2268 2269 if (db->db_blkptr != NULL) 2270 return; 2271 2272 if (db->db_blkid == DMU_SPILL_BLKID) { 2273 db->db_blkptr = &dn->dn_phys->dn_spill; 2274 BP_ZERO(db->db_blkptr); 2275 return; 2276 } 2277 if (db->db_level == dn->dn_phys->dn_nlevels-1) { 2278 /* 2279 * This buffer was allocated at a time when there were 2280 * no blkptrs available from the dnode, or it was 2281 * inappropriate to hook it in (i.e., an nlevels mismatch). 2282 */ 2283 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 2284 ASSERT(db->db_parent == NULL); 2285 db->db_parent = dn->dn_dbuf; 2286 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 2287 DBUF_VERIFY(db); 2288 } else { 2289 dmu_buf_impl_t *parent = db->db_parent; 2290 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2291 2292 ASSERT(dn->dn_phys->dn_nlevels > 1); 2293 if (parent == NULL) { 2294 mutex_exit(&db->db_mtx); 2295 rw_enter(&dn->dn_struct_rwlock, RW_READER); 2296 (void) dbuf_hold_impl(dn, db->db_level+1, 2297 db->db_blkid >> epbs, FALSE, db, &parent); 2298 rw_exit(&dn->dn_struct_rwlock); 2299 mutex_enter(&db->db_mtx); 2300 db->db_parent = parent; 2301 } 2302 db->db_blkptr = (blkptr_t *)parent->db.db_data + 2303 (db->db_blkid & ((1ULL << epbs) - 1)); 2304 DBUF_VERIFY(db); 2305 } 2306} 2307 2308static void 2309dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2310{ 2311 dmu_buf_impl_t *db = dr->dr_dbuf; 2312 dnode_t *dn; 2313 zio_t *zio; 2314 2315 ASSERT(dmu_tx_is_syncing(tx)); 2316 2317 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2318 2319 mutex_enter(&db->db_mtx); 2320 2321 ASSERT(db->db_level > 0); 2322 DBUF_VERIFY(db); 2323 2324 /* Read the block if it hasn't been read yet. */ 2325 if (db->db_buf == NULL) { 2326 mutex_exit(&db->db_mtx); 2327 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 2328 mutex_enter(&db->db_mtx); 2329 } 2330 ASSERT3U(db->db_state, ==, DB_CACHED); 2331 ASSERT(db->db_buf != NULL); 2332 2333 DB_DNODE_ENTER(db); 2334 dn = DB_DNODE(db); 2335 /* Indirect block size must match what the dnode thinks it is. */ 2336 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2337 dbuf_check_blkptr(dn, db); 2338 DB_DNODE_EXIT(db); 2339 2340 /* Provide the pending dirty record to child dbufs */ 2341 db->db_data_pending = dr; 2342 2343 mutex_exit(&db->db_mtx); 2344 dbuf_write(dr, db->db_buf, tx); 2345 2346 zio = dr->dr_zio; 2347 mutex_enter(&dr->dt.di.dr_mtx); 2348 dbuf_sync_list(&dr->dt.di.dr_children, tx); 2349 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2350 mutex_exit(&dr->dt.di.dr_mtx); 2351 zio_nowait(zio); 2352} 2353 2354static void 2355dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2356{ 2357 arc_buf_t **datap = &dr->dt.dl.dr_data; 2358 dmu_buf_impl_t *db = dr->dr_dbuf; 2359 dnode_t *dn; 2360 objset_t *os; 2361 uint64_t txg = tx->tx_txg; 2362 2363 ASSERT(dmu_tx_is_syncing(tx)); 2364 2365 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2366 2367 mutex_enter(&db->db_mtx); 2368 /* 2369 * To be synced, we must be dirtied. But we 2370 * might have been freed after being dirtied. 
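 * The state checks below tell these cases apart: DB_UNCACHED means
 * the buffer was freed, DB_FILL means it was freed and is being
 * re-filled, and DB_CACHED/DB_NOFILL are the ordinary dirty cases.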
2371 */ 2372 if (db->db_state == DB_UNCACHED) { 2373 /* This buffer has been freed since it was dirtied */ 2374 ASSERT(db->db.db_data == NULL); 2375 } else if (db->db_state == DB_FILL) { 2376 /* This buffer was freed and is now being re-filled */ 2377 ASSERT(db->db.db_data != dr->dt.dl.dr_data); 2378 } else { 2379 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); 2380 } 2381 DBUF_VERIFY(db); 2382 2383 DB_DNODE_ENTER(db); 2384 dn = DB_DNODE(db); 2385 2386 if (db->db_blkid == DMU_SPILL_BLKID) { 2387 mutex_enter(&dn->dn_mtx); 2388 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; 2389 mutex_exit(&dn->dn_mtx); 2390 } 2391 2392 /* 2393 * If this is a bonus buffer, simply copy the bonus data into the 2394 * dnode. It will be written out when the dnode is synced (and it 2395 * will be synced, since it must have been dirty for dbuf_sync to 2396 * be called). 2397 */ 2398 if (db->db_blkid == DMU_BONUS_BLKID) { 2399 dbuf_dirty_record_t **drp; 2400 2401 ASSERT(*datap != NULL); 2402 ASSERT0(db->db_level); 2403 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 2404 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 2405 DB_DNODE_EXIT(db); 2406 2407 if (*datap != db->db.db_data) { 2408 zio_buf_free(*datap, DN_MAX_BONUSLEN); 2409 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 2410 } 2411 db->db_data_pending = NULL; 2412 drp = &db->db_last_dirty; 2413 while (*drp != dr) 2414 drp = &(*drp)->dr_next; 2415 ASSERT(dr->dr_next == NULL); 2416 ASSERT(dr->dr_dbuf == db); 2417 *drp = dr->dr_next; 2418 if (dr->dr_dbuf->db_level != 0) { 2419 list_destroy(&dr->dt.di.dr_children); 2420 mutex_destroy(&dr->dt.di.dr_mtx); 2421 } 2422 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2423 ASSERT(db->db_dirtycnt > 0); 2424 db->db_dirtycnt -= 1; 2425 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); 2426 return; 2427 } 2428 2429 os = dn->dn_objset; 2430 2431 /* 2432 * dbuf_check_blkptr() may drop and reacquire the db_mtx lock, 2433 * allowing a dmu_sync operation to sneak in. As a result, we must 2434 * not check dr_override_state until dbuf_check_blkptr() has 2435 * returned. 2436 */ 2437 dbuf_check_blkptr(dn, db); 2438 2439 /* 2440 * If this buffer is in the middle of an immediate write, 2441 * wait for the synchronous IO to complete. 2442 */ 2443 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 2444 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 2445 cv_wait(&db->db_changed, &db->db_mtx); 2446 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 2447 } 2448 2449 if (db->db_state != DB_NOFILL && 2450 dn->dn_object != DMU_META_DNODE_OBJECT && 2451 refcount_count(&db->db_holds) > 1 && 2452 dr->dt.dl.dr_override_state != DR_OVERRIDDEN && 2453 *datap == db->db_buf) { 2454 /* 2455 * If this buffer is currently "in use" (i.e., there 2456 * are active holds and db_data still references it), 2457 * then make a copy before we start the write so that 2458 * any modifications from the open txg will not leak 2459 * into this write. 2460 * 2461 * NOTE: this copy does not need to be made for 2462 * objects only modified in the syncing context (e.g. 2463 * DMU_OT_DNODE blocks). 
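 * Such objects are never dirtied in open context, so there
 * are no open-txg modifications to leak.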
2464 */ 2465 int blksz = arc_buf_size(*datap); 2466 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2467 *datap = arc_buf_alloc(os->os_spa, blksz, db, type); 2468 bcopy(db->db.db_data, (*datap)->b_data, blksz); 2469 } 2470 db->db_data_pending = dr; 2471 2472 mutex_exit(&db->db_mtx); 2473 2474 dbuf_write(dr, *datap, tx); 2475 2476 ASSERT(!list_link_active(&dr->dr_dirty_node)); 2477 if (dn->dn_object == DMU_META_DNODE_OBJECT) { 2478 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 2479 DB_DNODE_EXIT(db); 2480 } else { 2481 /* 2482 * Although zio_nowait() does not "wait for an IO", it does 2483 * initiate the IO. If this is an empty write it seems plausible 2484 * that the IO could actually be completed before the nowait 2485 * returns. We need to DB_DNODE_EXIT() first in case 2486 * zio_nowait() invalidates the dbuf. 2487 */ 2488 DB_DNODE_EXIT(db); 2489 zio_nowait(dr->dr_zio); 2490 } 2491} 2492 2493void 2494dbuf_sync_list(list_t *list, dmu_tx_t *tx) 2495{ 2496 dbuf_dirty_record_t *dr; 2497 2498 while (dr = list_head(list)) { 2499 if (dr->dr_zio != NULL) { 2500 /* 2501 * If we find an already initialized zio then we 2502 * are processing the meta-dnode, and we have finished. 2503 * The dbufs for all dnodes are put back on the list 2504 * during processing, so that we can zio_wait() 2505 * these IOs after initiating all child IOs. 2506 */ 2507 ASSERT3U(dr->dr_dbuf->db.db_object, ==, 2508 DMU_META_DNODE_OBJECT); 2509 break; 2510 } 2511 list_remove(list, dr); 2512 if (dr->dr_dbuf->db_level > 0) 2513 dbuf_sync_indirect(dr, tx); 2514 else 2515 dbuf_sync_leaf(dr, tx); 2516 } 2517} 2518 2519/* ARGSUSED */ 2520static void 2521dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 2522{ 2523 dmu_buf_impl_t *db = vdb; 2524 dnode_t *dn; 2525 blkptr_t *bp = zio->io_bp; 2526 blkptr_t *bp_orig = &zio->io_bp_orig; 2527 spa_t *spa = zio->io_spa; 2528 int64_t delta; 2529 uint64_t fill = 0; 2530 int i; 2531 2532 ASSERT3P(db->db_blkptr, ==, bp); 2533 2534 DB_DNODE_ENTER(db); 2535 dn = DB_DNODE(db); 2536 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); 2537 dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 2538 zio->io_prev_space_delta = delta; 2539 2540 if (bp->blk_birth != 0) { 2541 ASSERT((db->db_blkid != DMU_SPILL_BLKID && 2542 BP_GET_TYPE(bp) == dn->dn_type) || 2543 (db->db_blkid == DMU_SPILL_BLKID && 2544 BP_GET_TYPE(bp) == dn->dn_bonustype) || 2545 BP_IS_EMBEDDED(bp)); 2546 ASSERT(BP_GET_LEVEL(bp) == db->db_level); 2547 } 2548 2549 mutex_enter(&db->db_mtx); 2550 2551#ifdef ZFS_DEBUG 2552 if (db->db_blkid == DMU_SPILL_BLKID) { 2553 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2554 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2555 db->db_blkptr == &dn->dn_phys->dn_spill); 2556 } 2557#endif 2558 2559 if (db->db_level == 0) { 2560 mutex_enter(&dn->dn_mtx); 2561 if (db->db_blkid > dn->dn_phys->dn_maxblkid && 2562 db->db_blkid != DMU_SPILL_BLKID) 2563 dn->dn_phys->dn_maxblkid = db->db_blkid; 2564 mutex_exit(&dn->dn_mtx); 2565 2566 if (dn->dn_type == DMU_OT_DNODE) { 2567 dnode_phys_t *dnp = db->db.db_data; 2568 for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 2569 i--, dnp++) { 2570 if (dnp->dn_type != DMU_OT_NONE) 2571 fill++; 2572 } 2573 } else { 2574 if (BP_IS_HOLE(bp)) { 2575 fill = 0; 2576 } else { 2577 fill = 1; 2578 } 2579 } 2580 } else { 2581 blkptr_t *ibp = db->db.db_data; 2582 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2583 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 2584 if (BP_IS_HOLE(ibp)) 2585 continue; 2586 fill 
+= BP_GET_FILL(ibp); 2587 } 2588 } 2589 DB_DNODE_EXIT(db); 2590 2591 if (!BP_IS_EMBEDDED(bp)) 2592 bp->blk_fill = fill; 2593 2594 mutex_exit(&db->db_mtx); 2595} 2596 2597/* 2598 * The SPA will call this callback several times for each zio - once 2599 * for every physical child i/o (zio->io_phys_children times). This 2600 * allows the DMU to monitor the progress of each logical i/o. For example, 2601 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z 2602 * block. There may be a long delay before all copies/fragments are completed, 2603 * so this callback allows us to retire dirty space gradually, as the physical 2604 * i/os complete. 2605 */ 2606/* ARGSUSED */ 2607static void 2608dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) 2609{ 2610 dmu_buf_impl_t *db = arg; 2611 objset_t *os = db->db_objset; 2612 dsl_pool_t *dp = dmu_objset_pool(os); 2613 dbuf_dirty_record_t *dr; 2614 int delta = 0; 2615 2616 dr = db->db_data_pending; 2617 ASSERT3U(dr->dr_txg, ==, zio->io_txg); 2618 2619 /* 2620 * The callback will be called io_phys_children times. Retire one 2621 * portion of our dirty space each time we are called. Any rounding 2622 * error will be cleaned up by dsl_pool_sync()'s call to 2623 * dsl_pool_undirty_space(). 2624 */ 2625 delta = dr->dr_accounted / zio->io_phys_children; 2626 dsl_pool_undirty_space(dp, delta, zio->io_txg); 2627} 2628 2629/* ARGSUSED */ 2630static void 2631dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 2632{ 2633 dmu_buf_impl_t *db = vdb; 2634 blkptr_t *bp_orig = &zio->io_bp_orig; 2635 blkptr_t *bp = db->db_blkptr; 2636 objset_t *os = db->db_objset; 2637 dmu_tx_t *tx = os->os_synctx; 2638 dbuf_dirty_record_t **drp, *dr; 2639 2640 ASSERT0(zio->io_error); 2641 ASSERT(db->db_blkptr == bp); 2642 2643 /* 2644 * For nopwrites and rewrites we ensure that the bp matches our 2645 * original and bypass all the accounting. 
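 * (A nopwrite means the data being written matched the block
 * already on disk, so no new block was born.)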
2646 */ 2647 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { 2648 ASSERT(BP_EQUAL(bp, bp_orig)); 2649 } else { 2650 dsl_dataset_t *ds = os->os_dsl_dataset; 2651 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 2652 dsl_dataset_block_born(ds, bp, tx); 2653 } 2654 2655 mutex_enter(&db->db_mtx); 2656 2657 DBUF_VERIFY(db); 2658 2659 drp = &db->db_last_dirty; 2660 while ((dr = *drp) != db->db_data_pending) 2661 drp = &dr->dr_next; 2662 ASSERT(!list_link_active(&dr->dr_dirty_node)); 2663 ASSERT(dr->dr_dbuf == db); 2664 ASSERT(dr->dr_next == NULL); 2665 *drp = dr->dr_next; 2666 2667#ifdef ZFS_DEBUG 2668 if (db->db_blkid == DMU_SPILL_BLKID) { 2669 dnode_t *dn; 2670 2671 DB_DNODE_ENTER(db); 2672 dn = DB_DNODE(db); 2673 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2674 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2675 db->db_blkptr == &dn->dn_phys->dn_spill); 2676 DB_DNODE_EXIT(db); 2677 } 2678#endif 2679 2680 if (db->db_level == 0) { 2681 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 2682 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 2683 if (db->db_state != DB_NOFILL) { 2684 if (dr->dt.dl.dr_data != db->db_buf) 2685 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 2686 db)); 2687 else if (!arc_released(db->db_buf)) 2688 arc_set_callback(db->db_buf, dbuf_do_evict, db); 2689 } 2690 } else { 2691 dnode_t *dn; 2692 2693 DB_DNODE_ENTER(db); 2694 dn = DB_DNODE(db); 2695 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2696 ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); 2697 if (!BP_IS_HOLE(db->db_blkptr)) { 2698 int epbs = 2699 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2700 ASSERT3U(db->db_blkid, <=, 2701 dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); 2702 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 2703 db->db.db_size); 2704 if (!arc_released(db->db_buf)) 2705 arc_set_callback(db->db_buf, dbuf_do_evict, db); 2706 } 2707 DB_DNODE_EXIT(db); 2708 mutex_destroy(&dr->dt.di.dr_mtx); 2709 list_destroy(&dr->dt.di.dr_children); 2710 } 2711 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2712 2713 cv_broadcast(&db->db_changed); 2714 ASSERT(db->db_dirtycnt > 0); 2715 db->db_dirtycnt -= 1; 2716 db->db_data_pending = NULL; 2717 dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); 2718} 2719 2720static void 2721dbuf_write_nofill_ready(zio_t *zio) 2722{ 2723 dbuf_write_ready(zio, NULL, zio->io_private); 2724} 2725 2726static void 2727dbuf_write_nofill_done(zio_t *zio) 2728{ 2729 dbuf_write_done(zio, NULL, zio->io_private); 2730} 2731 2732static void 2733dbuf_write_override_ready(zio_t *zio) 2734{ 2735 dbuf_dirty_record_t *dr = zio->io_private; 2736 dmu_buf_impl_t *db = dr->dr_dbuf; 2737 2738 dbuf_write_ready(zio, NULL, db); 2739} 2740 2741static void 2742dbuf_write_override_done(zio_t *zio) 2743{ 2744 dbuf_dirty_record_t *dr = zio->io_private; 2745 dmu_buf_impl_t *db = dr->dr_dbuf; 2746 blkptr_t *obp = &dr->dt.dl.dr_overridden_by; 2747 2748 mutex_enter(&db->db_mtx); 2749 if (!BP_EQUAL(zio->io_bp, obp)) { 2750 if (!BP_IS_HOLE(obp)) 2751 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 2752 arc_release(dr->dt.dl.dr_data, db); 2753 } 2754 mutex_exit(&db->db_mtx); 2755 2756 dbuf_write_done(zio, NULL, db); 2757} 2758 2759/* Issue I/O to commit a dirty buffer to disk. 
*/ 2760static void 2761dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) 2762{ 2763 dmu_buf_impl_t *db = dr->dr_dbuf; 2764 dnode_t *dn; 2765 objset_t *os; 2766 dmu_buf_impl_t *parent = db->db_parent; 2767 uint64_t txg = tx->tx_txg; 2768 zbookmark_phys_t zb; 2769 zio_prop_t zp; 2770 zio_t *zio; 2771 int wp_flag = 0; 2772 2773 DB_DNODE_ENTER(db); 2774 dn = DB_DNODE(db); 2775 os = dn->dn_objset; 2776 2777 if (db->db_state != DB_NOFILL) { 2778 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { 2779 /* 2780 * Private object buffers are released here rather 2781 * than in dbuf_dirty() since they are only modified 2782 * in the syncing context and we don't want the 2783 * overhead of making multiple copies of the data. 2784 */ 2785 if (BP_IS_HOLE(db->db_blkptr)) { 2786 arc_buf_thaw(data); 2787 } else { 2788 dbuf_release_bp(db); 2789 } 2790 } 2791 } 2792 2793 if (parent != dn->dn_dbuf) { 2794 /* Our parent is an indirect block. */ 2795 /* We have a dirty parent that has been scheduled for write. */ 2796 ASSERT(parent && parent->db_data_pending); 2797 /* Our parent's buffer is one level closer to the dnode. */ 2798 ASSERT(db->db_level == parent->db_level-1); 2799 /* 2800 * We're about to modify our parent's db_data by modifying 2801 * our block pointer, so the parent must be released. 2802 */ 2803 ASSERT(arc_released(parent->db_buf)); 2804 zio = parent->db_data_pending->dr_zio; 2805 } else { 2806 /* Our parent is the dnode itself. */ 2807 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && 2808 db->db_blkid != DMU_SPILL_BLKID) || 2809 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); 2810 if (db->db_blkid != DMU_SPILL_BLKID) 2811 ASSERT3P(db->db_blkptr, ==, 2812 &dn->dn_phys->dn_blkptr[db->db_blkid]); 2813 zio = dn->dn_zio; 2814 } 2815 2816 ASSERT(db->db_level == 0 || data == db->db_buf); 2817 ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 2818 ASSERT(zio); 2819 2820 SET_BOOKMARK(&zb, os->os_dsl_dataset ? 2821 os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 2822 db->db.db_object, db->db_level, db->db_blkid); 2823 2824 if (db->db_blkid == DMU_SPILL_BLKID) 2825 wp_flag = WP_SPILL; 2826 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; 2827 2828 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); 2829 DB_DNODE_EXIT(db); 2830 2831 if (db->db_level == 0 && 2832 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 2833 /* 2834 * The BP for this block has been provided by open context 2835 * (by dmu_sync() or dmu_buf_write_embedded()). 2836 */ 2837 void *contents = (data != NULL) ? 
data->b_data : NULL; 2838 2839 dr->dr_zio = zio_write(zio, os->os_spa, txg, 2840 db->db_blkptr, contents, db->db.db_size, &zp, 2841 dbuf_write_override_ready, NULL, dbuf_write_override_done, 2842 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2843 mutex_enter(&db->db_mtx); 2844 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 2845 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 2846 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); 2847 mutex_exit(&db->db_mtx); 2848 } else if (db->db_state == DB_NOFILL) { 2849 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || 2850 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); 2851 dr->dr_zio = zio_write(zio, os->os_spa, txg, 2852 db->db_blkptr, NULL, db->db.db_size, &zp, 2853 dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, 2854 ZIO_PRIORITY_ASYNC_WRITE, 2855 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); 2856 } else { 2857 ASSERT(arc_released(data)); 2858 dr->dr_zio = arc_write(zio, os->os_spa, txg, 2859 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), 2860 DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, 2861 dbuf_write_physdone, dbuf_write_done, db, 2862 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2863 } 2864} 2865
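
/*
 * Illustrative usage sketch (editor's addition, not part of the
 * original source): a typical consumer takes a hold on a level-0
 * dbuf, reads it, and drops the hold. Per the assertions in
 * dbuf_hold_impl(), the dnode's dn_struct_rwlock must be held across
 * the hold; "dn" and "blkid" below are assumed to be a valid dnode
 * pointer and block id.
 *
 *	dmu_buf_impl_t *db;
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	db = dbuf_hold(dn, blkid, FTAG);
 *	rw_exit(&dn->dn_struct_rwlock);
 *	if (db != NULL) {
 *		if (dbuf_read(db, NULL, DB_RF_CANFAIL) == 0) {
 *			db->db.db_data is now valid for
 *			db->db.db_size bytes while the hold is held
 *		}
 *		dbuf_rele(db, FTAG);
 *	}
 */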