dbuf.c revision 269845
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 */
28
29#include <sys/zfs_context.h>
30#include <sys/dmu.h>
31#include <sys/dmu_send.h>
32#include <sys/dmu_impl.h>
33#include <sys/dbuf.h>
34#include <sys/dmu_objset.h>
35#include <sys/dsl_dataset.h>
36#include <sys/dsl_dir.h>
37#include <sys/dmu_tx.h>
38#include <sys/spa.h>
39#include <sys/zio.h>
40#include <sys/dmu_zfetch.h>
41#include <sys/sa.h>
42#include <sys/sa_impl.h>
43#include <sys/zfeature.h>
44#include <sys/blkptr.h>
45#include <sys/range_tree.h>
46
47/*
48 * Number of times that zfs_free_range() took the slow path while doing
49 * a zfs receive.  A nonzero value indicates a potential performance problem.
50 */
51uint64_t zfs_free_range_recv_miss;
52
53static void dbuf_destroy(dmu_buf_impl_t *db);
54static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
55static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
56
57/*
58 * Global data structures and functions for the dbuf cache.
59 */
60static kmem_cache_t *dbuf_cache;
61
62/* ARGSUSED */
63static int
64dbuf_cons(void *vdb, void *unused, int kmflag)
65{
66	dmu_buf_impl_t *db = vdb;
67	bzero(db, sizeof (dmu_buf_impl_t));
68
69	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
70	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
71	refcount_create(&db->db_holds);
72
73#if defined(illumos) || !defined(_KERNEL)
74	db->db_creation = gethrtime();
75#else
76	db->db_creation = cpu_ticks() ^ ((uint64_t)CPU_SEQID << 48);
77#endif
78
79	return (0);
80}
81
82/* ARGSUSED */
83static void
84dbuf_dest(void *vdb, void *unused)
85{
86	dmu_buf_impl_t *db = vdb;
87	mutex_destroy(&db->db_mtx);
88	cv_destroy(&db->db_changed);
89	refcount_destroy(&db->db_holds);
90}
91
92/*
93 * dbuf hash table routines
94 */
95static dbuf_hash_table_t dbuf_hash_table;
96
97static uint64_t dbuf_hash_count;
98
99static uint64_t
100dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
101{
102	uintptr_t osv = (uintptr_t)os;
103	uint64_t crc = -1ULL;
104
105	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
106	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
107	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
108	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
109	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
110	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
111	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
112
113	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
114
115	return (crc);
116}
117
/*
 * Hash of a dbuf identity; thin wrapper around dbuf_hash().
 *
 * The expansion deliberately carries no trailing semicolon, so the macro
 * behaves like an ordinary function call: it can be used inside a larger
 * expression, and "x = DBUF_HASH(...);" does not expand to an extra empty
 * statement.
 */
#define	DBUF_HASH(os, obj, level, blkid)		\
	dbuf_hash((os), (obj), (level), (blkid))

/*
 * True when dbuf identifies the same block as the given
 * (objset, object, level, blkid) tuple.
 */
#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))
125
126dmu_buf_impl_t *
127dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
128{
129	dbuf_hash_table_t *h = &dbuf_hash_table;
130	objset_t *os = dn->dn_objset;
131	uint64_t obj = dn->dn_object;
132	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
133	uint64_t idx = hv & h->hash_table_mask;
134	dmu_buf_impl_t *db;
135
136	mutex_enter(DBUF_HASH_MUTEX(h, idx));
137	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
138		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
139			mutex_enter(&db->db_mtx);
140			if (db->db_state != DB_EVICTING) {
141				mutex_exit(DBUF_HASH_MUTEX(h, idx));
142				return (db);
143			}
144			mutex_exit(&db->db_mtx);
145		}
146	}
147	mutex_exit(DBUF_HASH_MUTEX(h, idx));
148	return (NULL);
149}
150
151/*
152 * Insert an entry into the hash table.  If there is already an element
153 * equal to elem in the hash table, then the already existing element
154 * will be returned and the new element will not be inserted.
155 * Otherwise returns NULL.
156 */
157static dmu_buf_impl_t *
158dbuf_hash_insert(dmu_buf_impl_t *db)
159{
160	dbuf_hash_table_t *h = &dbuf_hash_table;
161	objset_t *os = db->db_objset;
162	uint64_t obj = db->db.db_object;
163	int level = db->db_level;
164	uint64_t blkid = db->db_blkid;
165	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
166	uint64_t idx = hv & h->hash_table_mask;
167	dmu_buf_impl_t *dbf;
168
169	mutex_enter(DBUF_HASH_MUTEX(h, idx));
170	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
171		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
172			mutex_enter(&dbf->db_mtx);
173			if (dbf->db_state != DB_EVICTING) {
174				mutex_exit(DBUF_HASH_MUTEX(h, idx));
175				return (dbf);
176			}
177			mutex_exit(&dbf->db_mtx);
178		}
179	}
180
181	mutex_enter(&db->db_mtx);
182	db->db_hash_next = h->hash_table[idx];
183	h->hash_table[idx] = db;
184	mutex_exit(DBUF_HASH_MUTEX(h, idx));
185	atomic_add_64(&dbuf_hash_count, 1);
186
187	return (NULL);
188}
189
190/*
191 * Remove an entry from the hash table.  It must be in the EVICTING state.
192 */
193static void
194dbuf_hash_remove(dmu_buf_impl_t *db)
195{
196	dbuf_hash_table_t *h = &dbuf_hash_table;
197	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
198	    db->db_level, db->db_blkid);
199	uint64_t idx = hv & h->hash_table_mask;
200	dmu_buf_impl_t *dbf, **dbp;
201
202	/*
203	 * We musn't hold db_mtx to maintain lock ordering:
204	 * DBUF_HASH_MUTEX > db_mtx.
205	 */
206	ASSERT(refcount_is_zero(&db->db_holds));
207	ASSERT(db->db_state == DB_EVICTING);
208	ASSERT(!MUTEX_HELD(&db->db_mtx));
209
210	mutex_enter(DBUF_HASH_MUTEX(h, idx));
211	dbp = &h->hash_table[idx];
212	while ((dbf = *dbp) != db) {
213		dbp = &dbf->db_hash_next;
214		ASSERT(dbf != NULL);
215	}
216	*dbp = db->db_hash_next;
217	db->db_hash_next = NULL;
218	mutex_exit(DBUF_HASH_MUTEX(h, idx));
219	atomic_add_64(&dbuf_hash_count, -1);
220}
221
222static arc_evict_func_t dbuf_do_evict;
223
224static void
225dbuf_evict_user(dmu_buf_impl_t *db)
226{
227	ASSERT(MUTEX_HELD(&db->db_mtx));
228
229	if (db->db_level != 0 || db->db_evict_func == NULL)
230		return;
231
232	if (db->db_user_data_ptr_ptr)
233		*db->db_user_data_ptr_ptr = db->db.db_data;
234	db->db_evict_func(&db->db, db->db_user_ptr);
235	db->db_user_ptr = NULL;
236	db->db_user_data_ptr_ptr = NULL;
237	db->db_evict_func = NULL;
238}
239
240boolean_t
241dbuf_is_metadata(dmu_buf_impl_t *db)
242{
243	if (db->db_level > 0) {
244		return (B_TRUE);
245	} else {
246		boolean_t is_metadata;
247
248		DB_DNODE_ENTER(db);
249		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
250		DB_DNODE_EXIT(db);
251
252		return (is_metadata);
253	}
254}
255
256void
257dbuf_evict(dmu_buf_impl_t *db)
258{
259	ASSERT(MUTEX_HELD(&db->db_mtx));
260	ASSERT(db->db_buf == NULL);
261	ASSERT(db->db_data_pending == NULL);
262
263	dbuf_clear(db);
264	dbuf_destroy(db);
265}
266
267void
268dbuf_init(void)
269{
270	uint64_t hsize = 1ULL << 16;
271	dbuf_hash_table_t *h = &dbuf_hash_table;
272	int i;
273
274	/*
275	 * The hash table is big enough to fill all of physical memory
276	 * with an average 4K block size.  The table will take up
277	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
278	 */
279	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
280		hsize <<= 1;
281
282retry:
283	h->hash_table_mask = hsize - 1;
284	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
285	if (h->hash_table == NULL) {
286		/* XXX - we should really return an error instead of assert */
287		ASSERT(hsize > (1ULL << 10));
288		hsize >>= 1;
289		goto retry;
290	}
291
292	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
293	    sizeof (dmu_buf_impl_t),
294	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
295
296	for (i = 0; i < DBUF_MUTEXES; i++)
297		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
298}
299
300void
301dbuf_fini(void)
302{
303	dbuf_hash_table_t *h = &dbuf_hash_table;
304	int i;
305
306	for (i = 0; i < DBUF_MUTEXES; i++)
307		mutex_destroy(&h->hash_mutexes[i]);
308	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
309	kmem_cache_destroy(dbuf_cache);
310}
311
312/*
313 * Other stuff.
314 */
315
#ifdef ZFS_DEBUG
/*
 * Debug-only sanity check of a dbuf against its dnode, its parent and its
 * dirty records.  Does nothing unless ZFS_DEBUG_DBUF_VERIFY is set in
 * zfs_flags.  The caller must hold db_mtx.
 */
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *rec;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if ((zfs_flags & ZFS_DEBUG_DBUF_VERIFY) == 0)
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* Identity: the dbuf must agree with the dnode it belongs to. */
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !avl_is_empty(&dn->dn_dbufs));
	}

	/* Size and offset, which depend on the kind of block. */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	/* Every dirty record on either list must point back at this dbuf. */
	for (rec = db->db_data_pending; rec != NULL; rec = rec->dr_next)
		ASSERT(rec->dr_dbuf == db);

	for (rec = db->db_last_dirty; rec != NULL; rec = rec->dr_next)
		ASSERT(rec->dr_dbuf == db);

	/*
	 * db_size is not compared against dn_datablksz: another thread in
	 * dnode_set_blksz() can make the two differ for a moment.
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		rec = db->db_data_pending;
		/*
		 * The meta-dnode should only be modified in syncing
		 * context, so there must be just one copy of the data.
		 */
		ASSERT(rec == NULL || rec->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* The dnode itself points at this block. */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* An indirect block points at this block. */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}

	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * There is no block on disk, yet the dbuf has data.  If
		 * that data is nonzero the dbuf had better be dirty, or
		 * the data would be lost when this buffer is evicted.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *words = db->db.db_data;
			int w;

			for (w = 0; w < db->db.db_size >> 3; w++) {
				ASSERT(words[w] == 0);
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif
425
426static void
427dbuf_update_data(dmu_buf_impl_t *db)
428{
429	ASSERT(MUTEX_HELD(&db->db_mtx));
430	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
431		ASSERT(!refcount_is_zero(&db->db_holds));
432		*db->db_user_data_ptr_ptr = db->db.db_data;
433	}
434}
435
436static void
437dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
438{
439	ASSERT(MUTEX_HELD(&db->db_mtx));
440	db->db_buf = buf;
441	if (buf != NULL) {
442		ASSERT(buf->b_data != NULL);
443		db->db.db_data = buf->b_data;
444		if (!arc_released(buf))
445			arc_set_callback(buf, dbuf_do_evict, db);
446		dbuf_update_data(db);
447	} else {
448		dbuf_evict_user(db);
449		db->db.db_data = NULL;
450		if (db->db_state != DB_NOFILL)
451			db->db_state = DB_UNCACHED;
452	}
453}
454
455/*
456 * Loan out an arc_buf for read.  Return the loaned arc_buf.
457 */
458arc_buf_t *
459dbuf_loan_arcbuf(dmu_buf_impl_t *db)
460{
461	arc_buf_t *abuf;
462
463	mutex_enter(&db->db_mtx);
464	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
465		int blksz = db->db.db_size;
466		spa_t *spa = db->db_objset->os_spa;
467
468		mutex_exit(&db->db_mtx);
469		abuf = arc_loan_buf(spa, blksz);
470		bcopy(db->db.db_data, abuf->b_data, blksz);
471	} else {
472		abuf = db->db_buf;
473		arc_loan_inuse_buf(abuf, db);
474		dbuf_set_data(db, NULL);
475		mutex_exit(&db->db_mtx);
476	}
477	return (abuf);
478}
479
480uint64_t
481dbuf_whichblock(dnode_t *dn, uint64_t offset)
482{
483	if (dn->dn_datablkshift) {
484		return (offset >> dn->dn_datablkshift);
485	} else {
486		ASSERT3U(offset, <, dn->dn_datablksz);
487		return (0);
488	}
489}
490
491static void
492dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
493{
494	dmu_buf_impl_t *db = vdb;
495
496	mutex_enter(&db->db_mtx);
497	ASSERT3U(db->db_state, ==, DB_READ);
498	/*
499	 * All reads are synchronous, so we must have a hold on the dbuf
500	 */
501	ASSERT(refcount_count(&db->db_holds) > 0);
502	ASSERT(db->db_buf == NULL);
503	ASSERT(db->db.db_data == NULL);
504	if (db->db_level == 0 && db->db_freed_in_flight) {
505		/* we were freed in flight; disregard any error */
506		arc_release(buf, db);
507		bzero(buf->b_data, db->db.db_size);
508		arc_buf_freeze(buf);
509		db->db_freed_in_flight = FALSE;
510		dbuf_set_data(db, buf);
511		db->db_state = DB_CACHED;
512	} else if (zio == NULL || zio->io_error == 0) {
513		dbuf_set_data(db, buf);
514		db->db_state = DB_CACHED;
515	} else {
516		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
517		ASSERT3P(db->db_buf, ==, NULL);
518		VERIFY(arc_buf_remove_ref(buf, db));
519		db->db_state = DB_UNCACHED;
520	}
521	cv_broadcast(&db->db_changed);
522	dbuf_rele_and_unlock(db, NULL);
523}
524
525static void
526dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
527{
528	dnode_t *dn;
529	zbookmark_phys_t zb;
530	uint32_t aflags = ARC_NOWAIT;
531
532	DB_DNODE_ENTER(db);
533	dn = DB_DNODE(db);
534	ASSERT(!refcount_is_zero(&db->db_holds));
535	/* We need the struct_rwlock to prevent db_blkptr from changing. */
536	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
537	ASSERT(MUTEX_HELD(&db->db_mtx));
538	ASSERT(db->db_state == DB_UNCACHED);
539	ASSERT(db->db_buf == NULL);
540
541	if (db->db_blkid == DMU_BONUS_BLKID) {
542		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
543
544		ASSERT3U(bonuslen, <=, db->db.db_size);
545		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
546		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
547		if (bonuslen < DN_MAX_BONUSLEN)
548			bzero(db->db.db_data, DN_MAX_BONUSLEN);
549		if (bonuslen)
550			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
551		DB_DNODE_EXIT(db);
552		dbuf_update_data(db);
553		db->db_state = DB_CACHED;
554		mutex_exit(&db->db_mtx);
555		return;
556	}
557
558	/*
559	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
560	 * processes the delete record and clears the bp while we are waiting
561	 * for the dn_mtx (resulting in a "no" from block_freed).
562	 */
563	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
564	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
565	    BP_IS_HOLE(db->db_blkptr)))) {
566		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
567
568		DB_DNODE_EXIT(db);
569		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
570		    db->db.db_size, db, type));
571		bzero(db->db.db_data, db->db.db_size);
572		db->db_state = DB_CACHED;
573		*flags |= DB_RF_CACHED;
574		mutex_exit(&db->db_mtx);
575		return;
576	}
577
578	DB_DNODE_EXIT(db);
579
580	db->db_state = DB_READ;
581	mutex_exit(&db->db_mtx);
582
583	if (DBUF_IS_L2CACHEABLE(db))
584		aflags |= ARC_L2CACHE;
585	if (DBUF_IS_L2COMPRESSIBLE(db))
586		aflags |= ARC_L2COMPRESS;
587
588	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
589	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
590	    db->db.db_object, db->db_level, db->db_blkid);
591
592	dbuf_add_ref(db, NULL);
593
594	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
595	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
596	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
597	    &aflags, &zb);
598	if (aflags & ARC_CACHED)
599		*flags |= DB_RF_CACHED;
600}
601
602int
603dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
604{
605	int err = 0;
606	boolean_t havepzio = (zio != NULL);
607	boolean_t prefetch;
608	dnode_t *dn;
609
610	/*
611	 * We don't have to hold the mutex to check db_state because it
612	 * can't be freed while we have a hold on the buffer.
613	 */
614	ASSERT(!refcount_is_zero(&db->db_holds));
615
616	if (db->db_state == DB_NOFILL)
617		return (SET_ERROR(EIO));
618
619	DB_DNODE_ENTER(db);
620	dn = DB_DNODE(db);
621	if ((flags & DB_RF_HAVESTRUCT) == 0)
622		rw_enter(&dn->dn_struct_rwlock, RW_READER);
623
624	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
625	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
626	    DBUF_IS_CACHEABLE(db);
627
628	mutex_enter(&db->db_mtx);
629	if (db->db_state == DB_CACHED) {
630		mutex_exit(&db->db_mtx);
631		if (prefetch)
632			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
633			    db->db.db_size, TRUE);
634		if ((flags & DB_RF_HAVESTRUCT) == 0)
635			rw_exit(&dn->dn_struct_rwlock);
636		DB_DNODE_EXIT(db);
637	} else if (db->db_state == DB_UNCACHED) {
638		spa_t *spa = dn->dn_objset->os_spa;
639
640		if (zio == NULL)
641			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
642		dbuf_read_impl(db, zio, &flags);
643
644		/* dbuf_read_impl has dropped db_mtx for us */
645
646		if (prefetch)
647			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
648			    db->db.db_size, flags & DB_RF_CACHED);
649
650		if ((flags & DB_RF_HAVESTRUCT) == 0)
651			rw_exit(&dn->dn_struct_rwlock);
652		DB_DNODE_EXIT(db);
653
654		if (!havepzio)
655			err = zio_wait(zio);
656	} else {
657		/*
658		 * Another reader came in while the dbuf was in flight
659		 * between UNCACHED and CACHED.  Either a writer will finish
660		 * writing the buffer (sending the dbuf to CACHED) or the
661		 * first reader's request will reach the read_done callback
662		 * and send the dbuf to CACHED.  Otherwise, a failure
663		 * occurred and the dbuf went to UNCACHED.
664		 */
665		mutex_exit(&db->db_mtx);
666		if (prefetch)
667			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
668			    db->db.db_size, TRUE);
669		if ((flags & DB_RF_HAVESTRUCT) == 0)
670			rw_exit(&dn->dn_struct_rwlock);
671		DB_DNODE_EXIT(db);
672
673		/* Skip the wait per the caller's request. */
674		mutex_enter(&db->db_mtx);
675		if ((flags & DB_RF_NEVERWAIT) == 0) {
676			while (db->db_state == DB_READ ||
677			    db->db_state == DB_FILL) {
678				ASSERT(db->db_state == DB_READ ||
679				    (flags & DB_RF_HAVESTRUCT) == 0);
680				cv_wait(&db->db_changed, &db->db_mtx);
681			}
682			if (db->db_state == DB_UNCACHED)
683				err = SET_ERROR(EIO);
684		}
685		mutex_exit(&db->db_mtx);
686	}
687
688	ASSERT(err || havepzio || db->db_state == DB_CACHED);
689	return (err);
690}
691
692static void
693dbuf_noread(dmu_buf_impl_t *db)
694{
695	ASSERT(!refcount_is_zero(&db->db_holds));
696	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
697	mutex_enter(&db->db_mtx);
698	while (db->db_state == DB_READ || db->db_state == DB_FILL)
699		cv_wait(&db->db_changed, &db->db_mtx);
700	if (db->db_state == DB_UNCACHED) {
701		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
702		spa_t *spa = db->db_objset->os_spa;
703
704		ASSERT(db->db_buf == NULL);
705		ASSERT(db->db.db_data == NULL);
706		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
707		db->db_state = DB_FILL;
708	} else if (db->db_state == DB_NOFILL) {
709		dbuf_set_data(db, NULL);
710	} else {
711		ASSERT3U(db->db_state, ==, DB_CACHED);
712	}
713	mutex_exit(&db->db_mtx);
714}
715
716/*
717 * This is our just-in-time copy function.  It makes a copy of
718 * buffers, that have been modified in a previous transaction
719 * group, before we modify them in the current active group.
720 *
721 * This function is used in two places: when we are dirtying a
722 * buffer for the first time in a txg, and when we are freeing
723 * a range in a dnode that includes this buffer.
724 *
725 * Note that when we are called from dbuf_free_range() we do
726 * not put a hold on the buffer, we just traverse the active
727 * dbuf list for the dnode.
728 */
729static void
730dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
731{
732	dbuf_dirty_record_t *dr = db->db_last_dirty;
733
734	ASSERT(MUTEX_HELD(&db->db_mtx));
735	ASSERT(db->db.db_data != NULL);
736	ASSERT(db->db_level == 0);
737	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
738
739	if (dr == NULL ||
740	    (dr->dt.dl.dr_data !=
741	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
742		return;
743
744	/*
745	 * If the last dirty record for this dbuf has not yet synced
746	 * and its referencing the dbuf data, either:
747	 *	reset the reference to point to a new copy,
748	 * or (if there a no active holders)
749	 *	just null out the current db_data pointer.
750	 */
751	ASSERT(dr->dr_txg >= txg - 2);
752	if (db->db_blkid == DMU_BONUS_BLKID) {
753		/* Note that the data bufs here are zio_bufs */
754		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
755		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
756		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
757	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
758		int size = db->db.db_size;
759		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
760		spa_t *spa = db->db_objset->os_spa;
761
762		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
763		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
764	} else {
765		dbuf_set_data(db, NULL);
766	}
767}
768
769void
770dbuf_unoverride(dbuf_dirty_record_t *dr)
771{
772	dmu_buf_impl_t *db = dr->dr_dbuf;
773	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
774	uint64_t txg = dr->dr_txg;
775
776	ASSERT(MUTEX_HELD(&db->db_mtx));
777	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
778	ASSERT(db->db_level == 0);
779
780	if (db->db_blkid == DMU_BONUS_BLKID ||
781	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
782		return;
783
784	ASSERT(db->db_data_pending != dr);
785
786	/* free this block */
787	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
788		zio_free(db->db_objset->os_spa, txg, bp);
789
790	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
791	dr->dt.dl.dr_nopwrite = B_FALSE;
792
793	/*
794	 * Release the already-written buffer, so we leave it in
795	 * a consistent dirty state.  Note that all callers are
796	 * modifying the buffer, so they will immediately do
797	 * another (redundant) arc_release().  Therefore, leave
798	 * the buf thawed to save the effort of freezing &
799	 * immediately re-thawing it.
800	 */
801	arc_release(dr->dt.dl.dr_data, db);
802}
803
804/*
805 * Evict (if its unreferenced) or clear (if its referenced) any level-0
806 * data blocks in the free range, so that any future readers will find
807 * empty blocks.
808 *
809 * This is a no-op if the dataset is in the middle of an incremental
810 * receive; see comment below for details.
811 */
812void
813dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
814    dmu_tx_t *tx)
815{
816	dmu_buf_impl_t *db, *db_next, db_search;
817	uint64_t txg = tx->tx_txg;
818	avl_index_t where;
819
820	if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
821		end_blkid = dn->dn_maxblkid;
822	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
823
824	db_search.db_level = 0;
825	db_search.db_blkid = start_blkid;
826	db_search.db_creation = 0;
827
828	mutex_enter(&dn->dn_dbufs_mtx);
829	if (start_blkid >= dn->dn_unlisted_l0_blkid) {
830		/* There can't be any dbufs in this range; no need to search. */
831#ifdef DEBUG
832		db = avl_find(&dn->dn_dbufs, &db_search, &where);
833		ASSERT3P(db, ==, NULL);
834		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
835		ASSERT(db == NULL || db->db_level > 0);
836#endif
837		mutex_exit(&dn->dn_dbufs_mtx);
838		return;
839	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
840		/*
841		 * If we are receiving, we expect there to be no dbufs in
842		 * the range to be freed, because receive modifies each
843		 * block at most once, and in offset order.  If this is
844		 * not the case, it can lead to performance problems,
845		 * so note that we unexpectedly took the slow path.
846		 */
847		atomic_inc_64(&zfs_free_range_recv_miss);
848	}
849
850	db = avl_find(&dn->dn_dbufs, &db_search, &where);
851	ASSERT3P(db, ==, NULL);
852	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
853
854	for (; db != NULL; db = db_next) {
855		db_next = AVL_NEXT(&dn->dn_dbufs, db);
856		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
857
858		if (db->db_level != 0 || db->db_blkid > end_blkid) {
859			break;
860		}
861		ASSERT3U(db->db_blkid, >=, start_blkid);
862
863		/* found a level 0 buffer in the range */
864		mutex_enter(&db->db_mtx);
865		if (dbuf_undirty(db, tx)) {
866			/* mutex has been dropped and dbuf destroyed */
867			continue;
868		}
869
870		if (db->db_state == DB_UNCACHED ||
871		    db->db_state == DB_NOFILL ||
872		    db->db_state == DB_EVICTING) {
873			ASSERT(db->db.db_data == NULL);
874			mutex_exit(&db->db_mtx);
875			continue;
876		}
877		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
878			/* will be handled in dbuf_read_done or dbuf_rele */
879			db->db_freed_in_flight = TRUE;
880			mutex_exit(&db->db_mtx);
881			continue;
882		}
883		if (refcount_count(&db->db_holds) == 0) {
884			ASSERT(db->db_buf);
885			dbuf_clear(db);
886			continue;
887		}
888		/* The dbuf is referenced */
889
890		if (db->db_last_dirty != NULL) {
891			dbuf_dirty_record_t *dr = db->db_last_dirty;
892
893			if (dr->dr_txg == txg) {
894				/*
895				 * This buffer is "in-use", re-adjust the file
896				 * size to reflect that this buffer may
897				 * contain new data when we sync.
898				 */
899				if (db->db_blkid != DMU_SPILL_BLKID &&
900				    db->db_blkid > dn->dn_maxblkid)
901					dn->dn_maxblkid = db->db_blkid;
902				dbuf_unoverride(dr);
903			} else {
904				/*
905				 * This dbuf is not dirty in the open context.
906				 * Either uncache it (if its not referenced in
907				 * the open context) or reset its contents to
908				 * empty.
909				 */
910				dbuf_fix_old_data(db, txg);
911			}
912		}
913		/* clear the contents if its cached */
914		if (db->db_state == DB_CACHED) {
915			ASSERT(db->db.db_data != NULL);
916			arc_release(db->db_buf, db);
917			bzero(db->db.db_data, db->db.db_size);
918			arc_buf_freeze(db->db_buf);
919		}
920
921		mutex_exit(&db->db_mtx);
922	}
923	mutex_exit(&dn->dn_dbufs_mtx);
924}
925
926static int
927dbuf_block_freeable(dmu_buf_impl_t *db)
928{
929	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
930	uint64_t birth_txg = 0;
931
932	/*
933	 * We don't need any locking to protect db_blkptr:
934	 * If it's syncing, then db_last_dirty will be set
935	 * so we'll ignore db_blkptr.
936	 *
937	 * This logic ensures that only block births for
938	 * filled blocks are considered.
939	 */
940	ASSERT(MUTEX_HELD(&db->db_mtx));
941	if (db->db_last_dirty && (db->db_blkptr == NULL ||
942	    !BP_IS_HOLE(db->db_blkptr))) {
943		birth_txg = db->db_last_dirty->dr_txg;
944	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
945		birth_txg = db->db_blkptr->blk_birth;
946	}
947
948	/*
949	 * If this block don't exist or is in a snapshot, it can't be freed.
950	 * Don't pass the bp to dsl_dataset_block_freeable() since we
951	 * are holding the db_mtx lock and might deadlock if we are
952	 * prefetching a dedup-ed block.
953	 */
954	if (birth_txg != 0)
955		return (ds == NULL ||
956		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
957	else
958		return (B_FALSE);
959}
960
961void
962dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
963{
964	arc_buf_t *buf, *obuf;
965	int osize = db->db.db_size;
966	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
967	dnode_t *dn;
968
969	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
970
971	DB_DNODE_ENTER(db);
972	dn = DB_DNODE(db);
973
974	/* XXX does *this* func really need the lock? */
975	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
976
977	/*
978	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
979	 * is OK, because there can be no other references to the db
980	 * when we are changing its size, so no concurrent DB_FILL can
981	 * be happening.
982	 */
983	/*
984	 * XXX we should be doing a dbuf_read, checking the return
985	 * value and returning that up to our callers
986	 */
987	dmu_buf_will_dirty(&db->db, tx);
988
989	/* create the data buffer for the new block */
990	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
991
992	/* copy old block data to the new block */
993	obuf = db->db_buf;
994	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
995	/* zero the remainder */
996	if (size > osize)
997		bzero((uint8_t *)buf->b_data + osize, size - osize);
998
999	mutex_enter(&db->db_mtx);
1000	dbuf_set_data(db, buf);
1001	VERIFY(arc_buf_remove_ref(obuf, db));
1002	db->db.db_size = size;
1003
1004	if (db->db_level == 0) {
1005		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1006		db->db_last_dirty->dt.dl.dr_data = buf;
1007	}
1008	mutex_exit(&db->db_mtx);
1009
1010	dnode_willuse_space(dn, size-osize, tx);
1011	DB_DNODE_EXIT(db);
1012}
1013
1014void
1015dbuf_release_bp(dmu_buf_impl_t *db)
1016{
1017	objset_t *os = db->db_objset;
1018
1019	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1020	ASSERT(arc_released(os->os_phys_buf) ||
1021	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
1022	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1023
1024	(void) arc_release(db->db_buf, db);
1025}
1026
1027dbuf_dirty_record_t *
1028dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1029{
1030	dnode_t *dn;
1031	objset_t *os;
1032	dbuf_dirty_record_t **drp, *dr;
1033	int drop_struct_lock = FALSE;
1034	boolean_t do_free_accounting = B_FALSE;
1035	int txgoff = tx->tx_txg & TXG_MASK;
1036
1037	ASSERT(tx->tx_txg != 0);
1038	ASSERT(!refcount_is_zero(&db->db_holds));
1039	DMU_TX_DIRTY_BUF(tx, db);
1040
1041	DB_DNODE_ENTER(db);
1042	dn = DB_DNODE(db);
1043	/*
1044	 * Shouldn't dirty a regular buffer in syncing context.  Private
1045	 * objects may be dirtied in syncing context, but only if they
1046	 * were already pre-dirtied in open context.
1047	 */
1048	ASSERT(!dmu_tx_is_syncing(tx) ||
1049	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1050	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1051	    dn->dn_objset->os_dsl_dataset == NULL);
1052	/*
1053	 * We make this assert for private objects as well, but after we
1054	 * check if we're already dirty.  They are allowed to re-dirty
1055	 * in syncing context.
1056	 */
1057	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1058	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1059	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1060
1061	mutex_enter(&db->db_mtx);
1062	/*
1063	 * XXX make this true for indirects too?  The problem is that
1064	 * transactions created with dmu_tx_create_assigned() from
1065	 * syncing context don't bother holding ahead.
1066	 */
1067	ASSERT(db->db_level != 0 ||
1068	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1069	    db->db_state == DB_NOFILL);
1070
1071	mutex_enter(&dn->dn_mtx);
1072	/*
1073	 * Don't set dirtyctx to SYNC if we're just modifying this as we
1074	 * initialize the objset.
1075	 */
1076	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1077	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1078		dn->dn_dirtyctx =
1079		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1080		ASSERT(dn->dn_dirtyctx_firstset == NULL);
1081		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1082	}
1083	mutex_exit(&dn->dn_mtx);
1084
1085	if (db->db_blkid == DMU_SPILL_BLKID)
1086		dn->dn_have_spill = B_TRUE;
1087
1088	/*
1089	 * If this buffer is already dirty, we're done.
1090	 */
1091	drp = &db->db_last_dirty;
1092	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1093	    db->db.db_object == DMU_META_DNODE_OBJECT);
1094	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1095		drp = &dr->dr_next;
1096	if (dr && dr->dr_txg == tx->tx_txg) {
1097		DB_DNODE_EXIT(db);
1098
1099		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1100			/*
1101			 * If this buffer has already been written out,
1102			 * we now need to reset its state.
1103			 */
1104			dbuf_unoverride(dr);
1105			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1106			    db->db_state != DB_NOFILL)
1107				arc_buf_thaw(db->db_buf);
1108		}
1109		mutex_exit(&db->db_mtx);
1110		return (dr);
1111	}
1112
1113	/*
1114	 * Only valid if not already dirty.
1115	 */
1116	ASSERT(dn->dn_object == 0 ||
1117	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1118	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1119
1120	ASSERT3U(dn->dn_nlevels, >, db->db_level);
1121	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1122	    dn->dn_phys->dn_nlevels > db->db_level ||
1123	    dn->dn_next_nlevels[txgoff] > db->db_level ||
1124	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1125	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1126
1127	/*
1128	 * We should only be dirtying in syncing context if it's the
1129	 * mos or we're initializing the os or it's a special object.
1130	 * However, we are allowed to dirty in syncing context provided
1131	 * we already dirtied it in open context.  Hence we must make
1132	 * this assertion only if we're not already dirty.
1133	 */
1134	os = dn->dn_objset;
1135	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1136	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1137	ASSERT(db->db.db_size != 0);
1138
1139	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1140
1141	if (db->db_blkid != DMU_BONUS_BLKID) {
1142		/*
1143		 * Update the accounting.
1144		 * Note: we delay "free accounting" until after we drop
1145		 * the db_mtx.  This keeps us from grabbing other locks
1146		 * (and possibly deadlocking) in bp_get_dsize() while
1147		 * also holding the db_mtx.
1148		 */
1149		dnode_willuse_space(dn, db->db.db_size, tx);
1150		do_free_accounting = dbuf_block_freeable(db);
1151	}
1152
1153	/*
1154	 * If this buffer is dirty in an old transaction group we need
1155	 * to make a copy of it so that the changes we make in this
1156	 * transaction group won't leak out when we sync the older txg.
1157	 */
1158	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1159	if (db->db_level == 0) {
1160		void *data_old = db->db_buf;
1161
1162		if (db->db_state != DB_NOFILL) {
1163			if (db->db_blkid == DMU_BONUS_BLKID) {
1164				dbuf_fix_old_data(db, tx->tx_txg);
1165				data_old = db->db.db_data;
1166			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1167				/*
1168				 * Release the data buffer from the cache so
1169				 * that we can modify it without impacting
1170				 * possible other users of this cached data
1171				 * block.  Note that indirect blocks and
1172				 * private objects are not released until the
1173				 * syncing state (since they are only modified
1174				 * then).
1175				 */
1176				arc_release(db->db_buf, db);
1177				dbuf_fix_old_data(db, tx->tx_txg);
1178				data_old = db->db_buf;
1179			}
1180			ASSERT(data_old != NULL);
1181		}
1182		dr->dt.dl.dr_data = data_old;
1183	} else {
1184		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1185		list_create(&dr->dt.di.dr_children,
1186		    sizeof (dbuf_dirty_record_t),
1187		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
1188	}
1189	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1190		dr->dr_accounted = db->db.db_size;
1191	dr->dr_dbuf = db;
1192	dr->dr_txg = tx->tx_txg;
1193	dr->dr_next = *drp;
1194	*drp = dr;
1195
1196	/*
1197	 * We could have been freed_in_flight between the dbuf_noread
1198	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
1199	 * happened after the free.
1200	 */
1201	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1202	    db->db_blkid != DMU_SPILL_BLKID) {
1203		mutex_enter(&dn->dn_mtx);
1204		if (dn->dn_free_ranges[txgoff] != NULL) {
1205			range_tree_clear(dn->dn_free_ranges[txgoff],
1206			    db->db_blkid, 1);
1207		}
1208		mutex_exit(&dn->dn_mtx);
1209		db->db_freed_in_flight = FALSE;
1210	}
1211
1212	/*
1213	 * This buffer is now part of this txg
1214	 */
1215	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1216	db->db_dirtycnt += 1;
1217	ASSERT3U(db->db_dirtycnt, <=, 3);
1218
1219	mutex_exit(&db->db_mtx);
1220
1221	if (db->db_blkid == DMU_BONUS_BLKID ||
1222	    db->db_blkid == DMU_SPILL_BLKID) {
1223		mutex_enter(&dn->dn_mtx);
1224		ASSERT(!list_link_active(&dr->dr_dirty_node));
1225		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1226		mutex_exit(&dn->dn_mtx);
1227		dnode_setdirty(dn, tx);
1228		DB_DNODE_EXIT(db);
1229		return (dr);
1230	} else if (do_free_accounting) {
1231		blkptr_t *bp = db->db_blkptr;
1232		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1233		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1234		/*
1235		 * This is only a guess -- if the dbuf is dirty
1236		 * in a previous txg, we don't know how much
1237		 * space it will use on disk yet.  We should
1238		 * really have the struct_rwlock to access
1239		 * db_blkptr, but since this is just a guess,
1240		 * it's OK if we get an odd answer.
1241		 */
1242		ddt_prefetch(os->os_spa, bp);
1243		dnode_willuse_space(dn, -willfree, tx);
1244	}
1245
1246	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1247		rw_enter(&dn->dn_struct_rwlock, RW_READER);
1248		drop_struct_lock = TRUE;
1249	}
1250
1251	if (db->db_level == 0) {
1252		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1253		ASSERT(dn->dn_maxblkid >= db->db_blkid);
1254	}
1255
1256	if (db->db_level+1 < dn->dn_nlevels) {
1257		dmu_buf_impl_t *parent = db->db_parent;
1258		dbuf_dirty_record_t *di;
1259		int parent_held = FALSE;
1260
1261		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1262			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1263
1264			parent = dbuf_hold_level(dn, db->db_level+1,
1265			    db->db_blkid >> epbs, FTAG);
1266			ASSERT(parent != NULL);
1267			parent_held = TRUE;
1268		}
1269		if (drop_struct_lock)
1270			rw_exit(&dn->dn_struct_rwlock);
1271		ASSERT3U(db->db_level+1, ==, parent->db_level);
1272		di = dbuf_dirty(parent, tx);
1273		if (parent_held)
1274			dbuf_rele(parent, FTAG);
1275
1276		mutex_enter(&db->db_mtx);
1277		/*
1278		 * Since we've dropped the mutex, it's possible that
1279		 * dbuf_undirty() might have changed this out from under us.
1280		 */
1281		if (db->db_last_dirty == dr ||
1282		    dn->dn_object == DMU_META_DNODE_OBJECT) {
1283			mutex_enter(&di->dt.di.dr_mtx);
1284			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1285			ASSERT(!list_link_active(&dr->dr_dirty_node));
1286			list_insert_tail(&di->dt.di.dr_children, dr);
1287			mutex_exit(&di->dt.di.dr_mtx);
1288			dr->dr_parent = di;
1289		}
1290		mutex_exit(&db->db_mtx);
1291	} else {
1292		ASSERT(db->db_level+1 == dn->dn_nlevels);
1293		ASSERT(db->db_blkid < dn->dn_nblkptr);
1294		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1295		mutex_enter(&dn->dn_mtx);
1296		ASSERT(!list_link_active(&dr->dr_dirty_node));
1297		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1298		mutex_exit(&dn->dn_mtx);
1299		if (drop_struct_lock)
1300			rw_exit(&dn->dn_struct_rwlock);
1301	}
1302
1303	dnode_setdirty(dn, tx);
1304	DB_DNODE_EXIT(db);
1305	return (dr);
1306}
1307
1308/*
1309 * Undirty a buffer in the transaction group referenced by the given
1310 * transaction.  Return whether this evicted the dbuf.
1311 */
1312static boolean_t
1313dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1314{
1315	dnode_t *dn;
1316	uint64_t txg = tx->tx_txg;
1317	dbuf_dirty_record_t *dr, **drp;
1318
1319	ASSERT(txg != 0);
1320	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1321	ASSERT0(db->db_level);
1322	ASSERT(MUTEX_HELD(&db->db_mtx));
1323
1324	/*
1325	 * If this buffer is not dirty, we're done.
1326	 */
1327	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1328		if (dr->dr_txg <= txg)
1329			break;
1330	if (dr == NULL || dr->dr_txg < txg)
1331		return (B_FALSE);
1332	ASSERT(dr->dr_txg == txg);
1333	ASSERT(dr->dr_dbuf == db);
1334
1335	DB_DNODE_ENTER(db);
1336	dn = DB_DNODE(db);
1337
1338	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1339
1340	ASSERT(db->db.db_size != 0);
1341
1342	/*
1343	 * Any space we accounted for in dp_dirty_* will be cleaned up by
1344	 * dsl_pool_sync().  This is relatively rare so the discrepancy
1345	 * is not a big deal.
1346	 */
1347
1348	*drp = dr->dr_next;
1349
1350	/*
1351	 * Note that there are three places in dbuf_dirty()
1352	 * where this dirty record may be put on a list.
1353	 * Make sure to do a list_remove corresponding to
1354	 * every one of those list_insert calls.
1355	 */
1356	if (dr->dr_parent) {
1357		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1358		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1359		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1360	} else if (db->db_blkid == DMU_SPILL_BLKID ||
1361	    db->db_level+1 == dn->dn_nlevels) {
1362		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1363		mutex_enter(&dn->dn_mtx);
1364		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1365		mutex_exit(&dn->dn_mtx);
1366	}
1367	DB_DNODE_EXIT(db);
1368
1369	if (db->db_state != DB_NOFILL) {
1370		dbuf_unoverride(dr);
1371
1372		ASSERT(db->db_buf != NULL);
1373		ASSERT(dr->dt.dl.dr_data != NULL);
1374		if (dr->dt.dl.dr_data != db->db_buf)
1375			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1376	}
1377
1378	if (db->db_level != 0) {
1379		mutex_destroy(&dr->dt.di.dr_mtx);
1380		list_destroy(&dr->dt.di.dr_children);
1381	}
1382
1383	kmem_free(dr, sizeof (dbuf_dirty_record_t));
1384
1385	ASSERT(db->db_dirtycnt > 0);
1386	db->db_dirtycnt -= 1;
1387
1388	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1389		arc_buf_t *buf = db->db_buf;
1390
1391		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1392		dbuf_set_data(db, NULL);
1393		VERIFY(arc_buf_remove_ref(buf, db));
1394		dbuf_evict(db);
1395		return (B_TRUE);
1396	}
1397
1398	return (B_FALSE);
1399}
1400
1401void
1402dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1403{
1404	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1405	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1406
1407	ASSERT(tx->tx_txg != 0);
1408	ASSERT(!refcount_is_zero(&db->db_holds));
1409
1410	DB_DNODE_ENTER(db);
1411	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1412		rf |= DB_RF_HAVESTRUCT;
1413	DB_DNODE_EXIT(db);
1414	(void) dbuf_read(db, NULL, rf);
1415	(void) dbuf_dirty(db, tx);
1416}
1417
1418void
1419dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1420{
1421	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1422
1423	db->db_state = DB_NOFILL;
1424
1425	dmu_buf_will_fill(db_fake, tx);
1426}
1427
1428void
1429dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1430{
1431	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1432
1433	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1434	ASSERT(tx->tx_txg != 0);
1435	ASSERT(db->db_level == 0);
1436	ASSERT(!refcount_is_zero(&db->db_holds));
1437
1438	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1439	    dmu_tx_private_ok(tx));
1440
1441	dbuf_noread(db);
1442	(void) dbuf_dirty(db, tx);
1443}
1444
1445#pragma weak dmu_buf_fill_done = dbuf_fill_done
1446/* ARGSUSED */
1447void
1448dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1449{
1450	mutex_enter(&db->db_mtx);
1451	DBUF_VERIFY(db);
1452
1453	if (db->db_state == DB_FILL) {
1454		if (db->db_level == 0 && db->db_freed_in_flight) {
1455			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1456			/* we were freed while filling */
1457			/* XXX dbuf_undirty? */
1458			bzero(db->db.db_data, db->db.db_size);
1459			db->db_freed_in_flight = FALSE;
1460		}
1461		db->db_state = DB_CACHED;
1462		cv_broadcast(&db->db_changed);
1463	}
1464	mutex_exit(&db->db_mtx);
1465}
1466
1467void
1468dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
1469    bp_embedded_type_t etype, enum zio_compress comp,
1470    int uncompressed_size, int compressed_size, int byteorder,
1471    dmu_tx_t *tx)
1472{
1473	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
1474	struct dirty_leaf *dl;
1475	dmu_object_type_t type;
1476
1477	DB_DNODE_ENTER(db);
1478	type = DB_DNODE(db)->dn_type;
1479	DB_DNODE_EXIT(db);
1480
1481	ASSERT0(db->db_level);
1482	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1483
1484	dmu_buf_will_not_fill(dbuf, tx);
1485
1486	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1487	dl = &db->db_last_dirty->dt.dl;
1488	encode_embedded_bp_compressed(&dl->dr_overridden_by,
1489	    data, comp, uncompressed_size, compressed_size);
1490	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
1491	BP_SET_TYPE(&dl->dr_overridden_by, type);
1492	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
1493	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
1494
1495	dl->dr_override_state = DR_OVERRIDDEN;
1496	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
1497}
1498
1499/*
1500 * Directly assign a provided arc buf to a given dbuf if it's not referenced
1501 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1502 */
1503void
1504dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1505{
1506	ASSERT(!refcount_is_zero(&db->db_holds));
1507	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1508	ASSERT(db->db_level == 0);
1509	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1510	ASSERT(buf != NULL);
1511	ASSERT(arc_buf_size(buf) == db->db.db_size);
1512	ASSERT(tx->tx_txg != 0);
1513
1514	arc_return_buf(buf, db);
1515	ASSERT(arc_released(buf));
1516
1517	mutex_enter(&db->db_mtx);
1518
1519	while (db->db_state == DB_READ || db->db_state == DB_FILL)
1520		cv_wait(&db->db_changed, &db->db_mtx);
1521
1522	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1523
1524	if (db->db_state == DB_CACHED &&
1525	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1526		mutex_exit(&db->db_mtx);
1527		(void) dbuf_dirty(db, tx);
1528		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1529		VERIFY(arc_buf_remove_ref(buf, db));
1530		xuio_stat_wbuf_copied();
1531		return;
1532	}
1533
1534	xuio_stat_wbuf_nocopy();
1535	if (db->db_state == DB_CACHED) {
1536		dbuf_dirty_record_t *dr = db->db_last_dirty;
1537
1538		ASSERT(db->db_buf != NULL);
1539		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1540			ASSERT(dr->dt.dl.dr_data == db->db_buf);
1541			if (!arc_released(db->db_buf)) {
1542				ASSERT(dr->dt.dl.dr_override_state ==
1543				    DR_OVERRIDDEN);
1544				arc_release(db->db_buf, db);
1545			}
1546			dr->dt.dl.dr_data = buf;
1547			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1548		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1549			arc_release(db->db_buf, db);
1550			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1551		}
1552		db->db_buf = NULL;
1553	}
1554	ASSERT(db->db_buf == NULL);
1555	dbuf_set_data(db, buf);
1556	db->db_state = DB_FILL;
1557	mutex_exit(&db->db_mtx);
1558	(void) dbuf_dirty(db, tx);
1559	dmu_buf_fill_done(&db->db, tx);
1560}
1561
1562/*
1563 * "Clear" the contents of this dbuf.  This will mark the dbuf
1564 * EVICTING and clear *most* of its references.  Unfortunately,
1565 * when we are not holding the dn_dbufs_mtx, we can't clear the
1566 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1567 * in this case.  For callers from the DMU we will usually see:
1568 *	dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
1569 * For the arc callback, we will usually see:
1570 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1571 * Sometimes, though, we will get a mix of these two:
1572 *	DMU: dbuf_clear()->arc_clear_callback()
1573 *	ARC: dbuf_do_evict()->dbuf_destroy()
1574 *
1575 * This routine will dissociate the dbuf from the arc, by calling
1576 * arc_clear_callback(), but will not evict the data from the ARC.
1577 */
1578void
1579dbuf_clear(dmu_buf_impl_t *db)
1580{
1581	dnode_t *dn;
1582	dmu_buf_impl_t *parent = db->db_parent;
1583	dmu_buf_impl_t *dndb;
1584	boolean_t dbuf_gone = B_FALSE;
1585
1586	ASSERT(MUTEX_HELD(&db->db_mtx));
1587	ASSERT(refcount_is_zero(&db->db_holds));
1588
1589	dbuf_evict_user(db);
1590
1591	if (db->db_state == DB_CACHED) {
1592		ASSERT(db->db.db_data != NULL);
1593		if (db->db_blkid == DMU_BONUS_BLKID) {
1594			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1595			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1596		}
1597		db->db.db_data = NULL;
1598		db->db_state = DB_UNCACHED;
1599	}
1600
1601	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1602	ASSERT(db->db_data_pending == NULL);
1603
1604	db->db_state = DB_EVICTING;
1605	db->db_blkptr = NULL;
1606
1607	DB_DNODE_ENTER(db);
1608	dn = DB_DNODE(db);
1609	dndb = dn->dn_dbuf;
1610	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1611		avl_remove(&dn->dn_dbufs, db);
1612		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1613		membar_producer();
1614		DB_DNODE_EXIT(db);
1615		/*
1616		 * Decrementing the dbuf count means that the hold corresponding
1617		 * to the removed dbuf is no longer discounted in dnode_move(),
1618		 * so the dnode cannot be moved until after we release the hold.
1619		 * The membar_producer() ensures visibility of the decremented
1620		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1621		 * release any lock.
1622		 */
1623		dnode_rele(dn, db);
1624		db->db_dnode_handle = NULL;
1625	} else {
1626		DB_DNODE_EXIT(db);
1627	}
1628
1629	if (db->db_buf)
1630		dbuf_gone = arc_clear_callback(db->db_buf);
1631
1632	if (!dbuf_gone)
1633		mutex_exit(&db->db_mtx);
1634
1635	/*
1636	 * If this dbuf is referenced from an indirect dbuf,
1637	 * decrement the ref count on the indirect dbuf.
1638	 */
1639	if (parent && parent != dndb)
1640		dbuf_rele(parent, db);
1641}
1642
1643static int
1644dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1645    dmu_buf_impl_t **parentp, blkptr_t **bpp)
1646{
1647	int nlevels, epbs;
1648
1649	*parentp = NULL;
1650	*bpp = NULL;
1651
1652	ASSERT(blkid != DMU_BONUS_BLKID);
1653
1654	if (blkid == DMU_SPILL_BLKID) {
1655		mutex_enter(&dn->dn_mtx);
1656		if (dn->dn_have_spill &&
1657		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1658			*bpp = &dn->dn_phys->dn_spill;
1659		else
1660			*bpp = NULL;
1661		dbuf_add_ref(dn->dn_dbuf, NULL);
1662		*parentp = dn->dn_dbuf;
1663		mutex_exit(&dn->dn_mtx);
1664		return (0);
1665	}
1666
1667	if (dn->dn_phys->dn_nlevels == 0)
1668		nlevels = 1;
1669	else
1670		nlevels = dn->dn_phys->dn_nlevels;
1671
1672	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1673
1674	ASSERT3U(level * epbs, <, 64);
1675	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1676	if (level >= nlevels ||
1677	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1678		/* the buffer has no parent yet */
1679		return (SET_ERROR(ENOENT));
1680	} else if (level < nlevels-1) {
1681		/* this block is referenced from an indirect block */
1682		int err = dbuf_hold_impl(dn, level+1,
1683		    blkid >> epbs, fail_sparse, NULL, parentp);
1684		if (err)
1685			return (err);
1686		err = dbuf_read(*parentp, NULL,
1687		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1688		if (err) {
1689			dbuf_rele(*parentp, NULL);
1690			*parentp = NULL;
1691			return (err);
1692		}
1693		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1694		    (blkid & ((1ULL << epbs) - 1));
1695		return (0);
1696	} else {
1697		/* the block is referenced from the dnode */
1698		ASSERT3U(level, ==, nlevels-1);
1699		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1700		    blkid < dn->dn_phys->dn_nblkptr);
1701		if (dn->dn_dbuf) {
1702			dbuf_add_ref(dn->dn_dbuf, NULL);
1703			*parentp = dn->dn_dbuf;
1704		}
1705		*bpp = &dn->dn_phys->dn_blkptr[blkid];
1706		return (0);
1707	}
1708}
1709
1710static dmu_buf_impl_t *
1711dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1712    dmu_buf_impl_t *parent, blkptr_t *blkptr)
1713{
1714	objset_t *os = dn->dn_objset;
1715	dmu_buf_impl_t *db, *odb;
1716
1717	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1718	ASSERT(dn->dn_type != DMU_OT_NONE);
1719
1720	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1721
1722	db->db_objset = os;
1723	db->db.db_object = dn->dn_object;
1724	db->db_level = level;
1725	db->db_blkid = blkid;
1726	db->db_last_dirty = NULL;
1727	db->db_dirtycnt = 0;
1728	db->db_dnode_handle = dn->dn_handle;
1729	db->db_parent = parent;
1730	db->db_blkptr = blkptr;
1731
1732	db->db_user_ptr = NULL;
1733	db->db_user_data_ptr_ptr = NULL;
1734	db->db_evict_func = NULL;
1735	db->db_immediate_evict = 0;
1736	db->db_freed_in_flight = 0;
1737
1738	if (blkid == DMU_BONUS_BLKID) {
1739		ASSERT3P(parent, ==, dn->dn_dbuf);
1740		db->db.db_size = DN_MAX_BONUSLEN -
1741		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1742		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1743		db->db.db_offset = DMU_BONUS_BLKID;
1744		db->db_state = DB_UNCACHED;
1745		/* the bonus dbuf is not placed in the hash table */
1746		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1747		return (db);
1748	} else if (blkid == DMU_SPILL_BLKID) {
1749		db->db.db_size = (blkptr != NULL) ?
1750		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1751		db->db.db_offset = 0;
1752	} else {
1753		int blocksize =
1754		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1755		db->db.db_size = blocksize;
1756		db->db.db_offset = db->db_blkid * blocksize;
1757	}
1758
1759	/*
1760	 * Hold the dn_dbufs_mtx while we get the new dbuf
1761	 * in the hash table *and* added to the dbufs list.
1762	 * This prevents a possible deadlock with someone
1763	 * trying to look up this dbuf before its added to the
1764	 * dn_dbufs list.
1765	 */
1766	mutex_enter(&dn->dn_dbufs_mtx);
1767	db->db_state = DB_EVICTING;
1768	if ((odb = dbuf_hash_insert(db)) != NULL) {
1769		/* someone else inserted it first */
1770		kmem_cache_free(dbuf_cache, db);
1771		mutex_exit(&dn->dn_dbufs_mtx);
1772		return (odb);
1773	}
1774	avl_add(&dn->dn_dbufs, db);
1775	if (db->db_level == 0 && db->db_blkid >=
1776	    dn->dn_unlisted_l0_blkid)
1777		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
1778	db->db_state = DB_UNCACHED;
1779	mutex_exit(&dn->dn_dbufs_mtx);
1780	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1781
1782	if (parent && parent != dn->dn_dbuf)
1783		dbuf_add_ref(parent, db);
1784
1785	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1786	    refcount_count(&dn->dn_holds) > 0);
1787	(void) refcount_add(&dn->dn_holds, db);
1788	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
1789
1790	dprintf_dbuf(db, "db=%p\n", db);
1791
1792	return (db);
1793}
1794
1795static int
1796dbuf_do_evict(void *private)
1797{
1798	dmu_buf_impl_t *db = private;
1799
1800	if (!MUTEX_HELD(&db->db_mtx))
1801		mutex_enter(&db->db_mtx);
1802
1803	ASSERT(refcount_is_zero(&db->db_holds));
1804
1805	if (db->db_state != DB_EVICTING) {
1806		ASSERT(db->db_state == DB_CACHED);
1807		DBUF_VERIFY(db);
1808		db->db_buf = NULL;
1809		dbuf_evict(db);
1810	} else {
1811		mutex_exit(&db->db_mtx);
1812		dbuf_destroy(db);
1813	}
1814	return (0);
1815}
1816
1817static void
1818dbuf_destroy(dmu_buf_impl_t *db)
1819{
1820	ASSERT(refcount_is_zero(&db->db_holds));
1821
1822	if (db->db_blkid != DMU_BONUS_BLKID) {
1823		/*
1824		 * If this dbuf is still on the dn_dbufs list,
1825		 * remove it from that list.
1826		 */
1827		if (db->db_dnode_handle != NULL) {
1828			dnode_t *dn;
1829
1830			DB_DNODE_ENTER(db);
1831			dn = DB_DNODE(db);
1832			mutex_enter(&dn->dn_dbufs_mtx);
1833			avl_remove(&dn->dn_dbufs, db);
1834			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1835			mutex_exit(&dn->dn_dbufs_mtx);
1836			DB_DNODE_EXIT(db);
1837			/*
1838			 * Decrementing the dbuf count means that the hold
1839			 * corresponding to the removed dbuf is no longer
1840			 * discounted in dnode_move(), so the dnode cannot be
1841			 * moved until after we release the hold.
1842			 */
1843			dnode_rele(dn, db);
1844			db->db_dnode_handle = NULL;
1845		}
1846		dbuf_hash_remove(db);
1847	}
1848	db->db_parent = NULL;
1849	db->db_buf = NULL;
1850
1851	ASSERT(db->db.db_data == NULL);
1852	ASSERT(db->db_hash_next == NULL);
1853	ASSERT(db->db_blkptr == NULL);
1854	ASSERT(db->db_data_pending == NULL);
1855
1856	kmem_cache_free(dbuf_cache, db);
1857	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1858}
1859
1860void
1861dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
1862{
1863	dmu_buf_impl_t *db = NULL;
1864	blkptr_t *bp = NULL;
1865
1866	ASSERT(blkid != DMU_BONUS_BLKID);
1867	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1868
1869	if (dnode_block_freed(dn, blkid))
1870		return;
1871
1872	/* dbuf_find() returns with db_mtx held */
1873	if (db = dbuf_find(dn, 0, blkid)) {
1874		/*
1875		 * This dbuf is already in the cache.  We assume that
1876		 * it is already CACHED, or else about to be either
1877		 * read or filled.
1878		 */
1879		mutex_exit(&db->db_mtx);
1880		return;
1881	}
1882
1883	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1884		if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
1885			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1886			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1887			zbookmark_phys_t zb;
1888
1889			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1890			    dn->dn_object, 0, blkid);
1891
1892			(void) arc_read(NULL, dn->dn_objset->os_spa,
1893			    bp, NULL, NULL, prio,
1894			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1895			    &aflags, &zb);
1896		}
1897		if (db)
1898			dbuf_rele(db, NULL);
1899	}
1900}
1901
1902/*
1903 * Returns with db_holds incremented, and db_mtx not held.
1904 * Note: dn_struct_rwlock must be held.
1905 */
1906int
1907dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1908    void *tag, dmu_buf_impl_t **dbp)
1909{
1910	dmu_buf_impl_t *db, *parent = NULL;
1911
1912	ASSERT(blkid != DMU_BONUS_BLKID);
1913	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1914	ASSERT3U(dn->dn_nlevels, >, level);
1915
1916	*dbp = NULL;
1917top:
1918	/* dbuf_find() returns with db_mtx held */
1919	db = dbuf_find(dn, level, blkid);
1920
1921	if (db == NULL) {
1922		blkptr_t *bp = NULL;
1923		int err;
1924
1925		ASSERT3P(parent, ==, NULL);
1926		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1927		if (fail_sparse) {
1928			if (err == 0 && bp && BP_IS_HOLE(bp))
1929				err = SET_ERROR(ENOENT);
1930			if (err) {
1931				if (parent)
1932					dbuf_rele(parent, NULL);
1933				return (err);
1934			}
1935		}
1936		if (err && err != ENOENT)
1937			return (err);
1938		db = dbuf_create(dn, level, blkid, parent, bp);
1939	}
1940
1941	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1942		arc_buf_add_ref(db->db_buf, db);
1943		if (db->db_buf->b_data == NULL) {
1944			dbuf_clear(db);
1945			if (parent) {
1946				dbuf_rele(parent, NULL);
1947				parent = NULL;
1948			}
1949			goto top;
1950		}
1951		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1952	}
1953
1954	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1955
1956	/*
1957	 * If this buffer is currently syncing out, and we are are
1958	 * still referencing it from db_data, we need to make a copy
1959	 * of it in case we decide we want to dirty it again in this txg.
1960	 */
1961	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1962	    dn->dn_object != DMU_META_DNODE_OBJECT &&
1963	    db->db_state == DB_CACHED && db->db_data_pending) {
1964		dbuf_dirty_record_t *dr = db->db_data_pending;
1965
1966		if (dr->dt.dl.dr_data == db->db_buf) {
1967			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1968
1969			dbuf_set_data(db,
1970			    arc_buf_alloc(dn->dn_objset->os_spa,
1971			    db->db.db_size, db, type));
1972			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
1973			    db->db.db_size);
1974		}
1975	}
1976
1977	(void) refcount_add(&db->db_holds, tag);
1978	dbuf_update_data(db);
1979	DBUF_VERIFY(db);
1980	mutex_exit(&db->db_mtx);
1981
1982	/* NOTE: we can't rele the parent until after we drop the db_mtx */
1983	if (parent)
1984		dbuf_rele(parent, NULL);
1985
1986	ASSERT3P(DB_DNODE(db), ==, dn);
1987	ASSERT3U(db->db_blkid, ==, blkid);
1988	ASSERT3U(db->db_level, ==, level);
1989	*dbp = db;
1990
1991	return (0);
1992}
1993
1994dmu_buf_impl_t *
1995dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1996{
1997	dmu_buf_impl_t *db;
1998	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1999	return (err ? NULL : db);
2000}
2001
2002dmu_buf_impl_t *
2003dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
2004{
2005	dmu_buf_impl_t *db;
2006	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
2007	return (err ? NULL : db);
2008}
2009
2010void
2011dbuf_create_bonus(dnode_t *dn)
2012{
2013	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
2014
2015	ASSERT(dn->dn_bonus == NULL);
2016	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
2017}
2018
2019int
2020dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
2021{
2022	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2023	dnode_t *dn;
2024
2025	if (db->db_blkid != DMU_SPILL_BLKID)
2026		return (SET_ERROR(ENOTSUP));
2027	if (blksz == 0)
2028		blksz = SPA_MINBLOCKSIZE;
2029	if (blksz > SPA_MAXBLOCKSIZE)
2030		blksz = SPA_MAXBLOCKSIZE;
2031	else
2032		blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
2033
2034	DB_DNODE_ENTER(db);
2035	dn = DB_DNODE(db);
2036	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2037	dbuf_new_size(db, blksz, tx);
2038	rw_exit(&dn->dn_struct_rwlock);
2039	DB_DNODE_EXIT(db);
2040
2041	return (0);
2042}
2043
2044void
2045dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
2046{
2047	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2048}
2049
2050#pragma weak dmu_buf_add_ref = dbuf_add_ref
2051void
2052dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2053{
2054	int64_t holds = refcount_add(&db->db_holds, tag);
2055	ASSERT(holds > 1);
2056}
2057
2058/*
2059 * If you call dbuf_rele() you had better not be referencing the dnode handle
2060 * unless you have some other direct or indirect hold on the dnode. (An indirect
2061 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2062 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2063 * dnode's parent dbuf evicting its dnode handles.
2064 */
2065void
2066dbuf_rele(dmu_buf_impl_t *db, void *tag)
2067{
2068	mutex_enter(&db->db_mtx);
2069	dbuf_rele_and_unlock(db, tag);
2070}
2071
2072void
2073dmu_buf_rele(dmu_buf_t *db, void *tag)
2074{
2075	dbuf_rele((dmu_buf_impl_t *)db, tag);
2076}
2077
2078/*
2079 * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
2080 * db_dirtycnt and db_holds to be updated atomically.
2081 */
2082void
2083dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2084{
2085	int64_t holds;
2086
2087	ASSERT(MUTEX_HELD(&db->db_mtx));
2088	DBUF_VERIFY(db);
2089
2090	/*
2091	 * Remove the reference to the dbuf before removing its hold on the
2092	 * dnode so we can guarantee in dnode_move() that a referenced bonus
2093	 * buffer has a corresponding dnode hold.
2094	 */
2095	holds = refcount_remove(&db->db_holds, tag);
2096	ASSERT(holds >= 0);
2097
2098	/*
2099	 * We can't freeze indirects if there is a possibility that they
2100	 * may be modified in the current syncing context.
2101	 */
2102	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2103		arc_buf_freeze(db->db_buf);
2104
2105	if (holds == db->db_dirtycnt &&
2106	    db->db_level == 0 && db->db_immediate_evict)
2107		dbuf_evict_user(db);
2108
2109	if (holds == 0) {
2110		if (db->db_blkid == DMU_BONUS_BLKID) {
2111			mutex_exit(&db->db_mtx);
2112
2113			/*
2114			 * If the dnode moves here, we cannot cross this barrier
2115			 * until the move completes.
2116			 */
2117			DB_DNODE_ENTER(db);
2118			(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
2119			DB_DNODE_EXIT(db);
2120			/*
2121			 * The bonus buffer's dnode hold is no longer discounted
2122			 * in dnode_move(). The dnode cannot move until after
2123			 * the dnode_rele().
2124			 */
2125			dnode_rele(DB_DNODE(db), db);
2126		} else if (db->db_buf == NULL) {
2127			/*
2128			 * This is a special case: we never associated this
2129			 * dbuf with any data allocated from the ARC.
2130			 */
2131			ASSERT(db->db_state == DB_UNCACHED ||
2132			    db->db_state == DB_NOFILL);
2133			dbuf_evict(db);
2134		} else if (arc_released(db->db_buf)) {
2135			arc_buf_t *buf = db->db_buf;
2136			/*
2137			 * This dbuf has anonymous data associated with it.
2138			 */
2139			dbuf_set_data(db, NULL);
2140			VERIFY(arc_buf_remove_ref(buf, db));
2141			dbuf_evict(db);
2142		} else {
2143			VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2144
2145			/*
2146			 * A dbuf will be eligible for eviction if either the
2147			 * 'primarycache' property is set or a duplicate
2148			 * copy of this buffer is already cached in the arc.
2149			 *
2150			 * In the case of the 'primarycache' a buffer
2151			 * is considered for eviction if it matches the
2152			 * criteria set in the property.
2153			 *
2154			 * To decide if our buffer is considered a
2155			 * duplicate, we must call into the arc to determine
2156			 * if multiple buffers are referencing the same
2157			 * block on-disk. If so, then we simply evict
2158			 * ourselves.
2159			 */
2160			if (!DBUF_IS_CACHEABLE(db)) {
2161				if (db->db_blkptr != NULL &&
2162				    !BP_IS_HOLE(db->db_blkptr) &&
2163				    !BP_IS_EMBEDDED(db->db_blkptr)) {
2164					spa_t *spa =
2165					    dmu_objset_spa(db->db_objset);
2166					blkptr_t bp = *db->db_blkptr;
2167					dbuf_clear(db);
2168					arc_freed(spa, &bp);
2169				} else {
2170					dbuf_clear(db);
2171				}
2172			} else if (arc_buf_eviction_needed(db->db_buf)) {
2173				dbuf_clear(db);
2174			} else {
2175				mutex_exit(&db->db_mtx);
2176			}
2177		}
2178	} else {
2179		mutex_exit(&db->db_mtx);
2180	}
2181}
2182
2183#pragma weak dmu_buf_refcount = dbuf_refcount
2184uint64_t
2185dbuf_refcount(dmu_buf_impl_t *db)
2186{
2187	return (refcount_count(&db->db_holds));
2188}
2189
2190void *
2191dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2192    dmu_buf_evict_func_t *evict_func)
2193{
2194	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2195	    user_data_ptr_ptr, evict_func));
2196}
2197
2198void *
2199dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2200    dmu_buf_evict_func_t *evict_func)
2201{
2202	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2203
2204	db->db_immediate_evict = TRUE;
2205	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2206	    user_data_ptr_ptr, evict_func));
2207}
2208
2209void *
2210dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2211    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
2212{
2213	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2214	ASSERT(db->db_level == 0);
2215
2216	ASSERT((user_ptr == NULL) == (evict_func == NULL));
2217
2218	mutex_enter(&db->db_mtx);
2219
2220	if (db->db_user_ptr == old_user_ptr) {
2221		db->db_user_ptr = user_ptr;
2222		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
2223		db->db_evict_func = evict_func;
2224
2225		dbuf_update_data(db);
2226	} else {
2227		old_user_ptr = db->db_user_ptr;
2228	}
2229
2230	mutex_exit(&db->db_mtx);
2231	return (old_user_ptr);
2232}
2233
2234void *
2235dmu_buf_get_user(dmu_buf_t *db_fake)
2236{
2237	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2238	ASSERT(!refcount_is_zero(&db->db_holds));
2239
2240	return (db->db_user_ptr);
2241}
2242
2243boolean_t
2244dmu_buf_freeable(dmu_buf_t *dbuf)
2245{
2246	boolean_t res = B_FALSE;
2247	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2248
2249	if (db->db_blkptr)
2250		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2251		    db->db_blkptr, db->db_blkptr->blk_birth);
2252
2253	return (res);
2254}
2255
2256blkptr_t *
2257dmu_buf_get_blkptr(dmu_buf_t *db)
2258{
2259	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2260	return (dbi->db_blkptr);
2261}
2262
2263static void
2264dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2265{
2266	/* ASSERT(dmu_tx_is_syncing(tx) */
2267	ASSERT(MUTEX_HELD(&db->db_mtx));
2268
2269	if (db->db_blkptr != NULL)
2270		return;
2271
2272	if (db->db_blkid == DMU_SPILL_BLKID) {
2273		db->db_blkptr = &dn->dn_phys->dn_spill;
2274		BP_ZERO(db->db_blkptr);
2275		return;
2276	}
2277	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2278		/*
2279		 * This buffer was allocated at a time when there was
2280		 * no available blkptrs from the dnode, or it was
2281		 * inappropriate to hook it in (i.e., nlevels mis-match).
2282		 */
2283		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2284		ASSERT(db->db_parent == NULL);
2285		db->db_parent = dn->dn_dbuf;
2286		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2287		DBUF_VERIFY(db);
2288	} else {
2289		dmu_buf_impl_t *parent = db->db_parent;
2290		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2291
2292		ASSERT(dn->dn_phys->dn_nlevels > 1);
2293		if (parent == NULL) {
2294			mutex_exit(&db->db_mtx);
2295			rw_enter(&dn->dn_struct_rwlock, RW_READER);
2296			(void) dbuf_hold_impl(dn, db->db_level+1,
2297			    db->db_blkid >> epbs, FALSE, db, &parent);
2298			rw_exit(&dn->dn_struct_rwlock);
2299			mutex_enter(&db->db_mtx);
2300			db->db_parent = parent;
2301		}
2302		db->db_blkptr = (blkptr_t *)parent->db.db_data +
2303		    (db->db_blkid & ((1ULL << epbs) - 1));
2304		DBUF_VERIFY(db);
2305	}
2306}
2307
2308static void
2309dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2310{
2311	dmu_buf_impl_t *db = dr->dr_dbuf;
2312	dnode_t *dn;
2313	zio_t *zio;
2314
2315	ASSERT(dmu_tx_is_syncing(tx));
2316
2317	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2318
2319	mutex_enter(&db->db_mtx);
2320
2321	ASSERT(db->db_level > 0);
2322	DBUF_VERIFY(db);
2323
2324	/* Read the block if it hasn't been read yet. */
2325	if (db->db_buf == NULL) {
2326		mutex_exit(&db->db_mtx);
2327		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2328		mutex_enter(&db->db_mtx);
2329	}
2330	ASSERT3U(db->db_state, ==, DB_CACHED);
2331	ASSERT(db->db_buf != NULL);
2332
2333	DB_DNODE_ENTER(db);
2334	dn = DB_DNODE(db);
2335	/* Indirect block size must match what the dnode thinks it is. */
2336	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2337	dbuf_check_blkptr(dn, db);
2338	DB_DNODE_EXIT(db);
2339
2340	/* Provide the pending dirty record to child dbufs */
2341	db->db_data_pending = dr;
2342
2343	mutex_exit(&db->db_mtx);
2344	dbuf_write(dr, db->db_buf, tx);
2345
2346	zio = dr->dr_zio;
2347	mutex_enter(&dr->dt.di.dr_mtx);
2348	dbuf_sync_list(&dr->dt.di.dr_children, tx);
2349	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2350	mutex_exit(&dr->dt.di.dr_mtx);
2351	zio_nowait(zio);
2352}
2353
2354static void
2355dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2356{
2357	arc_buf_t **datap = &dr->dt.dl.dr_data;
2358	dmu_buf_impl_t *db = dr->dr_dbuf;
2359	dnode_t *dn;
2360	objset_t *os;
2361	uint64_t txg = tx->tx_txg;
2362
2363	ASSERT(dmu_tx_is_syncing(tx));
2364
2365	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2366
2367	mutex_enter(&db->db_mtx);
2368	/*
2369	 * To be synced, we must be dirtied.  But we
2370	 * might have been freed after the dirty.
2371	 */
2372	if (db->db_state == DB_UNCACHED) {
2373		/* This buffer has been freed since it was dirtied */
2374		ASSERT(db->db.db_data == NULL);
2375	} else if (db->db_state == DB_FILL) {
2376		/* This buffer was freed and is now being re-filled */
2377		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2378	} else {
2379		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2380	}
2381	DBUF_VERIFY(db);
2382
2383	DB_DNODE_ENTER(db);
2384	dn = DB_DNODE(db);
2385
2386	if (db->db_blkid == DMU_SPILL_BLKID) {
2387		mutex_enter(&dn->dn_mtx);
2388		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2389		mutex_exit(&dn->dn_mtx);
2390	}
2391
2392	/*
2393	 * If this is a bonus buffer, simply copy the bonus data into the
2394	 * dnode.  It will be written out when the dnode is synced (and it
2395	 * will be synced, since it must have been dirty for dbuf_sync to
2396	 * be called).
2397	 */
2398	if (db->db_blkid == DMU_BONUS_BLKID) {
2399		dbuf_dirty_record_t **drp;
2400
2401		ASSERT(*datap != NULL);
2402		ASSERT0(db->db_level);
2403		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2404		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2405		DB_DNODE_EXIT(db);
2406
2407		if (*datap != db->db.db_data) {
2408			zio_buf_free(*datap, DN_MAX_BONUSLEN);
2409			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2410		}
2411		db->db_data_pending = NULL;
2412		drp = &db->db_last_dirty;
2413		while (*drp != dr)
2414			drp = &(*drp)->dr_next;
2415		ASSERT(dr->dr_next == NULL);
2416		ASSERT(dr->dr_dbuf == db);
2417		*drp = dr->dr_next;
2418		if (dr->dr_dbuf->db_level != 0) {
2419			list_destroy(&dr->dt.di.dr_children);
2420			mutex_destroy(&dr->dt.di.dr_mtx);
2421		}
2422		kmem_free(dr, sizeof (dbuf_dirty_record_t));
2423		ASSERT(db->db_dirtycnt > 0);
2424		db->db_dirtycnt -= 1;
2425		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2426		return;
2427	}
2428
2429	os = dn->dn_objset;
2430
2431	/*
2432	 * This function may have dropped the db_mtx lock allowing a dmu_sync
2433	 * operation to sneak in. As a result, we need to ensure that we
2434	 * don't check the dr_override_state until we have returned from
2435	 * dbuf_check_blkptr.
2436	 */
2437	dbuf_check_blkptr(dn, db);
2438
2439	/*
2440	 * If this buffer is in the middle of an immediate write,
2441	 * wait for the synchronous IO to complete.
2442	 */
2443	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2444		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2445		cv_wait(&db->db_changed, &db->db_mtx);
2446		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2447	}
2448
2449	if (db->db_state != DB_NOFILL &&
2450	    dn->dn_object != DMU_META_DNODE_OBJECT &&
2451	    refcount_count(&db->db_holds) > 1 &&
2452	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2453	    *datap == db->db_buf) {
2454		/*
2455		 * If this buffer is currently "in use" (i.e., there
2456		 * are active holds and db_data still references it),
2457		 * then make a copy before we start the write so that
2458		 * any modifications from the open txg will not leak
2459		 * into this write.
2460		 *
2461		 * NOTE: this copy does not need to be made for
2462		 * objects only modified in the syncing context (e.g.
2463		 * DNONE_DNODE blocks).
2464		 */
2465		int blksz = arc_buf_size(*datap);
2466		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2467		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2468		bcopy(db->db.db_data, (*datap)->b_data, blksz);
2469	}
2470	db->db_data_pending = dr;
2471
2472	mutex_exit(&db->db_mtx);
2473
2474	dbuf_write(dr, *datap, tx);
2475
2476	ASSERT(!list_link_active(&dr->dr_dirty_node));
2477	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2478		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2479		DB_DNODE_EXIT(db);
2480	} else {
2481		/*
2482		 * Although zio_nowait() does not "wait for an IO", it does
2483		 * initiate the IO. If this is an empty write it seems plausible
2484		 * that the IO could actually be completed before the nowait
2485		 * returns. We need to DB_DNODE_EXIT() first in case
2486		 * zio_nowait() invalidates the dbuf.
2487		 */
2488		DB_DNODE_EXIT(db);
2489		zio_nowait(dr->dr_zio);
2490	}
2491}
2492
2493void
2494dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2495{
2496	dbuf_dirty_record_t *dr;
2497
2498	while (dr = list_head(list)) {
2499		if (dr->dr_zio != NULL) {
2500			/*
2501			 * If we find an already initialized zio then we
2502			 * are processing the meta-dnode, and we have finished.
2503			 * The dbufs for all dnodes are put back on the list
2504			 * during processing, so that we can zio_wait()
2505			 * these IOs after initiating all child IOs.
2506			 */
2507			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2508			    DMU_META_DNODE_OBJECT);
2509			break;
2510		}
2511		list_remove(list, dr);
2512		if (dr->dr_dbuf->db_level > 0)
2513			dbuf_sync_indirect(dr, tx);
2514		else
2515			dbuf_sync_leaf(dr, tx);
2516	}
2517}
2518
2519/* ARGSUSED */
2520static void
2521dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2522{
2523	dmu_buf_impl_t *db = vdb;
2524	dnode_t *dn;
2525	blkptr_t *bp = zio->io_bp;
2526	blkptr_t *bp_orig = &zio->io_bp_orig;
2527	spa_t *spa = zio->io_spa;
2528	int64_t delta;
2529	uint64_t fill = 0;
2530	int i;
2531
2532	ASSERT3P(db->db_blkptr, ==, bp);
2533
2534	DB_DNODE_ENTER(db);
2535	dn = DB_DNODE(db);
2536	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2537	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2538	zio->io_prev_space_delta = delta;
2539
2540	if (bp->blk_birth != 0) {
2541		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2542		    BP_GET_TYPE(bp) == dn->dn_type) ||
2543		    (db->db_blkid == DMU_SPILL_BLKID &&
2544		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
2545		    BP_IS_EMBEDDED(bp));
2546		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2547	}
2548
2549	mutex_enter(&db->db_mtx);
2550
2551#ifdef ZFS_DEBUG
2552	if (db->db_blkid == DMU_SPILL_BLKID) {
2553		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2554		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2555		    db->db_blkptr == &dn->dn_phys->dn_spill);
2556	}
2557#endif
2558
2559	if (db->db_level == 0) {
2560		mutex_enter(&dn->dn_mtx);
2561		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2562		    db->db_blkid != DMU_SPILL_BLKID)
2563			dn->dn_phys->dn_maxblkid = db->db_blkid;
2564		mutex_exit(&dn->dn_mtx);
2565
2566		if (dn->dn_type == DMU_OT_DNODE) {
2567			dnode_phys_t *dnp = db->db.db_data;
2568			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2569			    i--, dnp++) {
2570				if (dnp->dn_type != DMU_OT_NONE)
2571					fill++;
2572			}
2573		} else {
2574			if (BP_IS_HOLE(bp)) {
2575				fill = 0;
2576			} else {
2577				fill = 1;
2578			}
2579		}
2580	} else {
2581		blkptr_t *ibp = db->db.db_data;
2582		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2583		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2584			if (BP_IS_HOLE(ibp))
2585				continue;
2586			fill += BP_GET_FILL(ibp);
2587		}
2588	}
2589	DB_DNODE_EXIT(db);
2590
2591	if (!BP_IS_EMBEDDED(bp))
2592		bp->blk_fill = fill;
2593
2594	mutex_exit(&db->db_mtx);
2595}
2596
2597/*
2598 * The SPA will call this callback several times for each zio - once
2599 * for every physical child i/o (zio->io_phys_children times).  This
2600 * allows the DMU to monitor the progress of each logical i/o.  For example,
2601 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
2602 * block.  There may be a long delay before all copies/fragments are completed,
2603 * so this callback allows us to retire dirty space gradually, as the physical
2604 * i/os complete.
2605 */
2606/* ARGSUSED */
2607static void
2608dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
2609{
2610	dmu_buf_impl_t *db = arg;
2611	objset_t *os = db->db_objset;
2612	dsl_pool_t *dp = dmu_objset_pool(os);
2613	dbuf_dirty_record_t *dr;
2614	int delta = 0;
2615
2616	dr = db->db_data_pending;
2617	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
2618
2619	/*
2620	 * The callback will be called io_phys_children times.  Retire one
2621	 * portion of our dirty space each time we are called.  Any rounding
2622	 * error will be cleaned up by dsl_pool_sync()'s call to
2623	 * dsl_pool_undirty_space().
2624	 */
2625	delta = dr->dr_accounted / zio->io_phys_children;
2626	dsl_pool_undirty_space(dp, delta, zio->io_txg);
2627}
2628
2629/* ARGSUSED */
2630static void
2631dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2632{
2633	dmu_buf_impl_t *db = vdb;
2634	blkptr_t *bp_orig = &zio->io_bp_orig;
2635	blkptr_t *bp = db->db_blkptr;
2636	objset_t *os = db->db_objset;
2637	dmu_tx_t *tx = os->os_synctx;
2638	dbuf_dirty_record_t **drp, *dr;
2639
2640	ASSERT0(zio->io_error);
2641	ASSERT(db->db_blkptr == bp);
2642
2643	/*
2644	 * For nopwrites and rewrites we ensure that the bp matches our
2645	 * original and bypass all the accounting.
2646	 */
2647	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2648		ASSERT(BP_EQUAL(bp, bp_orig));
2649	} else {
2650		dsl_dataset_t *ds = os->os_dsl_dataset;
2651		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
2652		dsl_dataset_block_born(ds, bp, tx);
2653	}
2654
2655	mutex_enter(&db->db_mtx);
2656
2657	DBUF_VERIFY(db);
2658
2659	drp = &db->db_last_dirty;
2660	while ((dr = *drp) != db->db_data_pending)
2661		drp = &dr->dr_next;
2662	ASSERT(!list_link_active(&dr->dr_dirty_node));
2663	ASSERT(dr->dr_dbuf == db);
2664	ASSERT(dr->dr_next == NULL);
2665	*drp = dr->dr_next;
2666
2667#ifdef ZFS_DEBUG
2668	if (db->db_blkid == DMU_SPILL_BLKID) {
2669		dnode_t *dn;
2670
2671		DB_DNODE_ENTER(db);
2672		dn = DB_DNODE(db);
2673		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2674		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2675		    db->db_blkptr == &dn->dn_phys->dn_spill);
2676		DB_DNODE_EXIT(db);
2677	}
2678#endif
2679
2680	if (db->db_level == 0) {
2681		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2682		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2683		if (db->db_state != DB_NOFILL) {
2684			if (dr->dt.dl.dr_data != db->db_buf)
2685				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
2686				    db));
2687			else if (!arc_released(db->db_buf))
2688				arc_set_callback(db->db_buf, dbuf_do_evict, db);
2689		}
2690	} else {
2691		dnode_t *dn;
2692
2693		DB_DNODE_ENTER(db);
2694		dn = DB_DNODE(db);
2695		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2696		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
2697		if (!BP_IS_HOLE(db->db_blkptr)) {
2698			int epbs =
2699			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2700			ASSERT3U(db->db_blkid, <=,
2701			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
2702			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2703			    db->db.db_size);
2704			if (!arc_released(db->db_buf))
2705				arc_set_callback(db->db_buf, dbuf_do_evict, db);
2706		}
2707		DB_DNODE_EXIT(db);
2708		mutex_destroy(&dr->dt.di.dr_mtx);
2709		list_destroy(&dr->dt.di.dr_children);
2710	}
2711	kmem_free(dr, sizeof (dbuf_dirty_record_t));
2712
2713	cv_broadcast(&db->db_changed);
2714	ASSERT(db->db_dirtycnt > 0);
2715	db->db_dirtycnt -= 1;
2716	db->db_data_pending = NULL;
2717	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
2718}
2719
2720static void
2721dbuf_write_nofill_ready(zio_t *zio)
2722{
2723	dbuf_write_ready(zio, NULL, zio->io_private);
2724}
2725
2726static void
2727dbuf_write_nofill_done(zio_t *zio)
2728{
2729	dbuf_write_done(zio, NULL, zio->io_private);
2730}
2731
2732static void
2733dbuf_write_override_ready(zio_t *zio)
2734{
2735	dbuf_dirty_record_t *dr = zio->io_private;
2736	dmu_buf_impl_t *db = dr->dr_dbuf;
2737
2738	dbuf_write_ready(zio, NULL, db);
2739}
2740
2741static void
2742dbuf_write_override_done(zio_t *zio)
2743{
2744	dbuf_dirty_record_t *dr = zio->io_private;
2745	dmu_buf_impl_t *db = dr->dr_dbuf;
2746	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2747
2748	mutex_enter(&db->db_mtx);
2749	if (!BP_EQUAL(zio->io_bp, obp)) {
2750		if (!BP_IS_HOLE(obp))
2751			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2752		arc_release(dr->dt.dl.dr_data, db);
2753	}
2754	mutex_exit(&db->db_mtx);
2755
2756	dbuf_write_done(zio, NULL, db);
2757}
2758
2759/* Issue I/O to commit a dirty buffer to disk. */
2760static void
2761dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2762{
2763	dmu_buf_impl_t *db = dr->dr_dbuf;
2764	dnode_t *dn;
2765	objset_t *os;
2766	dmu_buf_impl_t *parent = db->db_parent;
2767	uint64_t txg = tx->tx_txg;
2768	zbookmark_phys_t zb;
2769	zio_prop_t zp;
2770	zio_t *zio;
2771	int wp_flag = 0;
2772
2773	DB_DNODE_ENTER(db);
2774	dn = DB_DNODE(db);
2775	os = dn->dn_objset;
2776
2777	if (db->db_state != DB_NOFILL) {
2778		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2779			/*
2780			 * Private object buffers are released here rather
2781			 * than in dbuf_dirty() since they are only modified
2782			 * in the syncing context and we don't want the
2783			 * overhead of making multiple copies of the data.
2784			 */
2785			if (BP_IS_HOLE(db->db_blkptr)) {
2786				arc_buf_thaw(data);
2787			} else {
2788				dbuf_release_bp(db);
2789			}
2790		}
2791	}
2792
2793	if (parent != dn->dn_dbuf) {
2794		/* Our parent is an indirect block. */
2795		/* We have a dirty parent that has been scheduled for write. */
2796		ASSERT(parent && parent->db_data_pending);
2797		/* Our parent's buffer is one level closer to the dnode. */
2798		ASSERT(db->db_level == parent->db_level-1);
2799		/*
2800		 * We're about to modify our parent's db_data by modifying
2801		 * our block pointer, so the parent must be released.
2802		 */
2803		ASSERT(arc_released(parent->db_buf));
2804		zio = parent->db_data_pending->dr_zio;
2805	} else {
2806		/* Our parent is the dnode itself. */
2807		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2808		    db->db_blkid != DMU_SPILL_BLKID) ||
2809		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2810		if (db->db_blkid != DMU_SPILL_BLKID)
2811			ASSERT3P(db->db_blkptr, ==,
2812			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
2813		zio = dn->dn_zio;
2814	}
2815
2816	ASSERT(db->db_level == 0 || data == db->db_buf);
2817	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2818	ASSERT(zio);
2819
2820	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2821	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2822	    db->db.db_object, db->db_level, db->db_blkid);
2823
2824	if (db->db_blkid == DMU_SPILL_BLKID)
2825		wp_flag = WP_SPILL;
2826	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2827
2828	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2829	DB_DNODE_EXIT(db);
2830
2831	if (db->db_level == 0 &&
2832	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2833		/*
2834		 * The BP for this block has been provided by open context
2835		 * (by dmu_sync() or dmu_buf_write_embedded()).
2836		 */
2837		void *contents = (data != NULL) ? data->b_data : NULL;
2838
2839		dr->dr_zio = zio_write(zio, os->os_spa, txg,
2840		    db->db_blkptr, contents, db->db.db_size, &zp,
2841		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
2842		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2843		mutex_enter(&db->db_mtx);
2844		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2845		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2846		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2847		mutex_exit(&db->db_mtx);
2848	} else if (db->db_state == DB_NOFILL) {
2849		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
2850		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
2851		dr->dr_zio = zio_write(zio, os->os_spa, txg,
2852		    db->db_blkptr, NULL, db->db.db_size, &zp,
2853		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
2854		    ZIO_PRIORITY_ASYNC_WRITE,
2855		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2856	} else {
2857		ASSERT(arc_released(data));
2858		dr->dr_zio = arc_write(zio, os->os_spa, txg,
2859		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
2860		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
2861		    dbuf_write_physdone, dbuf_write_done, db,
2862		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2863	}
2864}
2865