dbuf.c revision 269218
1117845Ssam/*
2117845Ssam * CDDL HEADER START
3117845Ssam *
4117845Ssam * The contents of this file are subject to the terms of the
5117845Ssam * Common Development and Distribution License (the "License").
6117845Ssam * You may not use this file except in compliance with the License.
7117845Ssam *
8117845Ssam * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9117845Ssam * or http://www.opensolaris.org/os/licensing.
10117845Ssam * See the License for the specific language governing permissions
11117845Ssam * and limitations under the License.
12117845Ssam *
13117845Ssam * When distributing Covered Code, include this CDDL HEADER in each
14117845Ssam * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15117845Ssam * If applicable, add the following below this CDDL HEADER, with the
16117845Ssam * fields enclosed by brackets "[]" replaced with your own identifying
17117845Ssam * information: Portions Copyright [yyyy] [name of copyright owner]
18117845Ssam *
19117845Ssam * CDDL HEADER END
20117845Ssam */
21117845Ssam/*
22117845Ssam * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23117845Ssam * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24117845Ssam * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25117845Ssam * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26117845Ssam * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27117845Ssam */
28117845Ssam
29117845Ssam#include <sys/zfs_context.h>
30117845Ssam#include <sys/dmu.h>
31117845Ssam#include <sys/dmu_send.h>
32117845Ssam#include <sys/dmu_impl.h>
33117845Ssam#include <sys/dbuf.h>
34117845Ssam#include <sys/dmu_objset.h>
35117845Ssam#include <sys/dsl_dataset.h>
36117845Ssam#include <sys/dsl_dir.h>
37117845Ssam#include <sys/dmu_tx.h>
38117845Ssam#include <sys/spa.h>
39117845Ssam#include <sys/zio.h>
40117845Ssam#include <sys/dmu_zfetch.h>
41117845Ssam#include <sys/sa.h>
42117845Ssam#include <sys/sa_impl.h>
43117845Ssam#include <sys/zfeature.h>
44117845Ssam#include <sys/blkptr.h>
45117845Ssam#include <sys/range_tree.h>
46117845Ssam
47117845Ssam/*
48117845Ssam * Number of times that zfs_free_range() took the slow path while doing
49117845Ssam * a zfs receive.  A nonzero value indicates a potential performance problem.
50117845Ssam */
51117845Ssamuint64_t zfs_free_range_recv_miss;
52117845Ssam
53117845Ssamstatic void dbuf_destroy(dmu_buf_impl_t *db);
54117845Ssamstatic boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
55117845Ssamstatic void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
56117845Ssam
57117845Ssam/*
58117845Ssam * Global data structures and functions for the dbuf cache.
59117845Ssam */
60117845Ssamstatic kmem_cache_t *dbuf_cache;
61117845Ssam
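/*
 * kmem cache constructor for dmu_buf_impl_t: zero the structure and
 * initialize its mutex, condition variable, and hold refcount.
 */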
62117845Ssam/* ARGSUSED */
63117845Ssamstatic int
64117845Ssamdbuf_cons(void *vdb, void *unused, int kmflag)
65117845Ssam{
66117845Ssam	dmu_buf_impl_t *db = vdb;
67117845Ssam	bzero(db, sizeof (dmu_buf_impl_t));
68117845Ssam
69117845Ssam	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
70117845Ssam	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
71117845Ssam	refcount_create(&db->db_holds);
72117845Ssam	return (0);
73117845Ssam}
74117845Ssam
75117845Ssam/* ARGSUSED */
76117845Ssamstatic void
77117845Ssamdbuf_dest(void *vdb, void *unused)
78117845Ssam{
79117845Ssam	dmu_buf_impl_t *db = vdb;
80117845Ssam	mutex_destroy(&db->db_mtx);
81117845Ssam	cv_destroy(&db->db_changed);
82117845Ssam	refcount_destroy(&db->db_holds);
83117845Ssam}
84117845Ssam
85117845Ssam/*
86117845Ssam * dbuf hash table routines
87117845Ssam */
88117845Ssamstatic dbuf_hash_table_t dbuf_hash_table;
89117845Ssam
90117845Ssamstatic uint64_t dbuf_hash_count;
91117845Ssam
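/*
 * Hash an (objset, object, level, blkid) tuple for the dbuf hash table
 * using the ZFS CRC-64 table, then fold in the high-order bits of each
 * input so they also influence the bucket.
 */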
92117845Ssamstatic uint64_t
93117845Ssamdbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
94117845Ssam{
95117845Ssam	uintptr_t osv = (uintptr_t)os;
96117845Ssam	uint64_t crc = -1ULL;
97117845Ssam
98117845Ssam	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
99117845Ssam	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
100117845Ssam	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
101117845Ssam	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
102117845Ssam	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
103117845Ssam	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
104117845Ssam	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
105117845Ssam
106117845Ssam	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
107117845Ssam
108117845Ssam	return (crc);
109117845Ssam}
110117845Ssam
111117845Ssam#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
112117845Ssam
113117845Ssam#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
114117845Ssam	((dbuf)->db.db_object == (obj) &&		\
115117845Ssam	(dbuf)->db_objset == (os) &&			\
116117845Ssam	(dbuf)->db_level == (level) &&			\
117117845Ssam	(dbuf)->db_blkid == (blkid))
118117845Ssam
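/*
 * Look up a dbuf in the hash table.  If a matching dbuf is found and is
 * not being evicted, return it with db_mtx held; otherwise return NULL.
 */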
119117845Ssamdmu_buf_impl_t *
120117845Ssamdbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
121117845Ssam{
122117845Ssam	dbuf_hash_table_t *h = &dbuf_hash_table;
123117845Ssam	objset_t *os = dn->dn_objset;
124117845Ssam	uint64_t obj = dn->dn_object;
125117845Ssam	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
126117845Ssam	uint64_t idx = hv & h->hash_table_mask;
127117845Ssam	dmu_buf_impl_t *db;
128117845Ssam
129117845Ssam	mutex_enter(DBUF_HASH_MUTEX(h, idx));
130117845Ssam	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
131117845Ssam		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
132117845Ssam			mutex_enter(&db->db_mtx);
133117845Ssam			if (db->db_state != DB_EVICTING) {
134117845Ssam				mutex_exit(DBUF_HASH_MUTEX(h, idx));
135117845Ssam				return (db);
136117845Ssam			}
137117845Ssam			mutex_exit(&db->db_mtx);
138117845Ssam		}
139117845Ssam	}
140117845Ssam	mutex_exit(DBUF_HASH_MUTEX(h, idx));
141117845Ssam	return (NULL);
142117845Ssam}
143117845Ssam
144117845Ssam/*
145117845Ssam * Insert an entry into the hash table.  If there is already an element
146117845Ssam * equal to elem in the hash table, then the already existing element
147117845Ssam * will be returned and the new element will not be inserted.
148117845Ssam * Otherwise returns NULL.
149117845Ssam */
150117845Ssamstatic dmu_buf_impl_t *
151117845Ssamdbuf_hash_insert(dmu_buf_impl_t *db)
152117845Ssam{
153117845Ssam	dbuf_hash_table_t *h = &dbuf_hash_table;
154117845Ssam	objset_t *os = db->db_objset;
155117845Ssam	uint64_t obj = db->db.db_object;
156117845Ssam	int level = db->db_level;
157117845Ssam	uint64_t blkid = db->db_blkid;
158117845Ssam	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
159117845Ssam	uint64_t idx = hv & h->hash_table_mask;
160117845Ssam	dmu_buf_impl_t *dbf;
161117845Ssam
162117845Ssam	mutex_enter(DBUF_HASH_MUTEX(h, idx));
163117845Ssam	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
164117845Ssam		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
165117845Ssam			mutex_enter(&dbf->db_mtx);
166117845Ssam			if (dbf->db_state != DB_EVICTING) {
167117845Ssam				mutex_exit(DBUF_HASH_MUTEX(h, idx));
168117845Ssam				return (dbf);
169117845Ssam			}
170117845Ssam			mutex_exit(&dbf->db_mtx);
171117845Ssam		}
172117845Ssam	}
173117845Ssam
174117845Ssam	mutex_enter(&db->db_mtx);
175117845Ssam	db->db_hash_next = h->hash_table[idx];
176117845Ssam	h->hash_table[idx] = db;
177117845Ssam	mutex_exit(DBUF_HASH_MUTEX(h, idx));
178117845Ssam	atomic_add_64(&dbuf_hash_count, 1);
179117845Ssam
180117845Ssam	return (NULL);
181117845Ssam}
182117845Ssam
183117845Ssam/*
184117845Ssam * Remove an entry from the hash table.  This operation will
185117845Ssam * fail if there are any existing holds on the db.
186117845Ssam */
187117845Ssamstatic void
188117845Ssamdbuf_hash_remove(dmu_buf_impl_t *db)
189117845Ssam{
190117845Ssam	dbuf_hash_table_t *h = &dbuf_hash_table;
191117845Ssam	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
192117845Ssam	    db->db_level, db->db_blkid);
193117845Ssam	uint64_t idx = hv & h->hash_table_mask;
194117845Ssam	dmu_buf_impl_t *dbf, **dbp;
195117845Ssam
196117845Ssam	/*
197117845Ssam	 * We mustn't hold db_mtx to maintain lock ordering:
198117845Ssam	 * DBUF_HASH_MUTEX > db_mtx.
199117845Ssam	 */
200117845Ssam	ASSERT(refcount_is_zero(&db->db_holds));
201117845Ssam	ASSERT(db->db_state == DB_EVICTING);
202117845Ssam	ASSERT(!MUTEX_HELD(&db->db_mtx));
203117845Ssam
204117845Ssam	mutex_enter(DBUF_HASH_MUTEX(h, idx));
205117845Ssam	dbp = &h->hash_table[idx];
206117845Ssam	while ((dbf = *dbp) != db) {
207117845Ssam		dbp = &dbf->db_hash_next;
208117845Ssam		ASSERT(dbf != NULL);
209117845Ssam	}
210117845Ssam	*dbp = db->db_hash_next;
211117845Ssam	db->db_hash_next = NULL;
212117845Ssam	mutex_exit(DBUF_HASH_MUTEX(h, idx));
213117845Ssam	atomic_add_64(&dbuf_hash_count, -1);
214117845Ssam}
215117845Ssam
216117845Ssamstatic arc_evict_func_t dbuf_do_evict;
217117845Ssam
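/*
 * Invoke the user eviction callback (if any) for a level-0 dbuf, after
 * updating the user's stashed data pointer, then clear the user state.
 */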
218117845Ssamstatic void
219117845Ssamdbuf_evict_user(dmu_buf_impl_t *db)
220117845Ssam{
221117845Ssam	ASSERT(MUTEX_HELD(&db->db_mtx));
222117845Ssam
223117845Ssam	if (db->db_level != 0 || db->db_evict_func == NULL)
224117845Ssam		return;
225117845Ssam
226117845Ssam	if (db->db_user_data_ptr_ptr)
227117845Ssam		*db->db_user_data_ptr_ptr = db->db.db_data;
228117845Ssam	db->db_evict_func(&db->db, db->db_user_ptr);
229117845Ssam	db->db_user_ptr = NULL;
230117845Ssam	db->db_user_data_ptr_ptr = NULL;
231117845Ssam	db->db_evict_func = NULL;
232117845Ssam}
233117845Ssam
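/*
 * A dbuf is metadata if it is an indirect block, or if its dnode holds a
 * metadata object type.
 */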
234117845Ssamboolean_t
235117845Ssamdbuf_is_metadata(dmu_buf_impl_t *db)
236117845Ssam{
237117845Ssam	if (db->db_level > 0) {
238117845Ssam		return (B_TRUE);
239117845Ssam	} else {
240117845Ssam		boolean_t is_metadata;
241117845Ssam
242117845Ssam		DB_DNODE_ENTER(db);
243117845Ssam		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
244117845Ssam		DB_DNODE_EXIT(db);
245117845Ssam
246117845Ssam		return (is_metadata);
247117845Ssam	}
248117845Ssam}
249117845Ssam
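/*
 * Evict a dbuf that no longer has an associated ARC buffer or pending
 * write: clear it and destroy it.
 */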
250117845Ssamvoid
251117845Ssamdbuf_evict(dmu_buf_impl_t *db)
252117845Ssam{
253117845Ssam	ASSERT(MUTEX_HELD(&db->db_mtx));
254117845Ssam	ASSERT(db->db_buf == NULL);
255117845Ssam	ASSERT(db->db_data_pending == NULL);
256117845Ssam
257117845Ssam	dbuf_clear(db);
258117845Ssam	dbuf_destroy(db);
259117845Ssam}
260117845Ssam
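/*
 * Allocate the global dbuf hash table and its mutexes and create the
 * dmu_buf_impl_t kmem cache.
 */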
261117845Ssamvoid
262117845Ssamdbuf_init(void)
263117845Ssam{
264117845Ssam	uint64_t hsize = 1ULL << 16;
265117845Ssam	dbuf_hash_table_t *h = &dbuf_hash_table;
266117845Ssam	int i;
267117845Ssam
268117845Ssam	/*
269117845Ssam	 * The hash table is big enough to fill all of physical memory
270117845Ssam	 * with an average 4K block size.  The table will take up
271117845Ssam	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
272117845Ssam	 */
273117845Ssam	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
274117845Ssam		hsize <<= 1;
275117845Ssam
276117845Ssamretry:
277117845Ssam	h->hash_table_mask = hsize - 1;
278117845Ssam	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
279117845Ssam	if (h->hash_table == NULL) {
280117845Ssam		/* XXX - we should really return an error instead of assert */
281117845Ssam		ASSERT(hsize > (1ULL << 10));
282117845Ssam		hsize >>= 1;
283117845Ssam		goto retry;
284117845Ssam	}
285117845Ssam
286117845Ssam	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
287117845Ssam	    sizeof (dmu_buf_impl_t),
288117845Ssam	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
289117845Ssam
290117845Ssam	for (i = 0; i < DBUF_MUTEXES; i++)
291117845Ssam		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
292117845Ssam}
293117845Ssam
294117845Ssamvoid
295117845Ssamdbuf_fini(void)
296117845Ssam{
297117845Ssam	dbuf_hash_table_t *h = &dbuf_hash_table;
298117845Ssam	int i;
299117845Ssam
300117845Ssam	for (i = 0; i < DBUF_MUTEXES; i++)
301117845Ssam		mutex_destroy(&h->hash_mutexes[i]);
302117845Ssam	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
303117845Ssam	kmem_cache_destroy(dbuf_cache);
304117845Ssam}
305117845Ssam
306117845Ssam/*
307117845Ssam * Other stuff.
308117845Ssam */
309117845Ssam
310117845Ssam#ifdef ZFS_DEBUG
311117845Ssamstatic void
312117845Ssamdbuf_verify(dmu_buf_impl_t *db)
313117845Ssam{
314117845Ssam	dnode_t *dn;
315117845Ssam	dbuf_dirty_record_t *dr;
316117845Ssam
317117845Ssam	ASSERT(MUTEX_HELD(&db->db_mtx));
318117845Ssam
319117845Ssam	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
320117845Ssam		return;
321117845Ssam
322117845Ssam	ASSERT(db->db_objset != NULL);
323117845Ssam	DB_DNODE_ENTER(db);
324117845Ssam	dn = DB_DNODE(db);
325117845Ssam	if (dn == NULL) {
326117845Ssam		ASSERT(db->db_parent == NULL);
327117845Ssam		ASSERT(db->db_blkptr == NULL);
328117845Ssam	} else {
329117845Ssam		ASSERT3U(db->db.db_object, ==, dn->dn_object);
330117845Ssam		ASSERT3P(db->db_objset, ==, dn->dn_objset);
331117845Ssam		ASSERT3U(db->db_level, <, dn->dn_nlevels);
332117845Ssam		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
333117845Ssam		    db->db_blkid == DMU_SPILL_BLKID ||
334117845Ssam		    !list_is_empty(&dn->dn_dbufs));
335117845Ssam	}
336117845Ssam	if (db->db_blkid == DMU_BONUS_BLKID) {
337117845Ssam		ASSERT(dn != NULL);
338117845Ssam		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
339117845Ssam		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
340117845Ssam	} else if (db->db_blkid == DMU_SPILL_BLKID) {
341117845Ssam		ASSERT(dn != NULL);
342117845Ssam		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
343117845Ssam		ASSERT0(db->db.db_offset);
344117845Ssam	} else {
345117845Ssam		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
346117845Ssam	}
347117845Ssam
348117845Ssam	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
349117845Ssam		ASSERT(dr->dr_dbuf == db);
350117845Ssam
351117845Ssam	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
352117845Ssam		ASSERT(dr->dr_dbuf == db);
353117845Ssam
354117845Ssam	/*
355117845Ssam	 * We can't assert that db_size matches dn_datablksz because it
356117845Ssam	 * can be momentarily different when another thread is doing
357117845Ssam	 * dnode_set_blksz().
358117845Ssam	 */
359117845Ssam	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
360117845Ssam		dr = db->db_data_pending;
361117845Ssam		/*
362117845Ssam		 * It should only be modified in syncing context, so
363117845Ssam		 * make sure we only have one copy of the data.
364117845Ssam		 */
365117845Ssam		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
366117845Ssam	}
367117845Ssam
368117845Ssam	/* verify db->db_blkptr */
369117845Ssam	if (db->db_blkptr) {
370117845Ssam		if (db->db_parent == dn->dn_dbuf) {
371117845Ssam			/* db is pointed to by the dnode */
372117845Ssam			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
373117845Ssam			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
374117845Ssam				ASSERT(db->db_parent == NULL);
375117845Ssam			else
376117845Ssam				ASSERT(db->db_parent != NULL);
377117845Ssam			if (db->db_blkid != DMU_SPILL_BLKID)
378117845Ssam				ASSERT3P(db->db_blkptr, ==,
379117845Ssam				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
380117845Ssam		} else {
381117845Ssam			/* db is pointed to by an indirect block */
382117845Ssam			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
383117845Ssam			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
384117845Ssam			ASSERT3U(db->db_parent->db.db_object, ==,
385117845Ssam			    db->db.db_object);
386117845Ssam			/*
387117845Ssam			 * dnode_grow_indblksz() can make this fail if we don't
388117845Ssam			 * have the struct_rwlock.  XXX indblksz no longer
389117845Ssam			 * grows.  safe to do this now?
390117845Ssam			 */
391117845Ssam			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
392117845Ssam				ASSERT3P(db->db_blkptr, ==,
393117845Ssam				    ((blkptr_t *)db->db_parent->db.db_data +
394117845Ssam				    db->db_blkid % epb));
395117845Ssam			}
396117845Ssam		}
397117845Ssam	}
398117845Ssam	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
399117845Ssam	    (db->db_buf == NULL || db->db_buf->b_data) &&
400117845Ssam	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
401117845Ssam	    db->db_state != DB_FILL && !dn->dn_free_txg) {
402117845Ssam		/*
403117845Ssam		 * If the blkptr isn't set but the dbuf has nonzero data,
404117845Ssam		 * it had better be dirty, otherwise we'll lose that
405117845Ssam		 * data when we evict this buffer.
406117845Ssam		 */
407117845Ssam		if (db->db_dirtycnt == 0) {
408117845Ssam			uint64_t *buf = db->db.db_data;
409117845Ssam			int i;
410117845Ssam
411117845Ssam			for (i = 0; i < db->db.db_size >> 3; i++) {
412117845Ssam				ASSERT(buf[i] == 0);
413			}
414		}
415	}
416	DB_DNODE_EXIT(db);
417}
418#endif
419
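/*
 * Keep the user's stashed data pointer (db_user_data_ptr_ptr), if any, in
 * sync with the current db_data of a level-0 dbuf.
 */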
420static void
421dbuf_update_data(dmu_buf_impl_t *db)
422{
423	ASSERT(MUTEX_HELD(&db->db_mtx));
424	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
425		ASSERT(!refcount_is_zero(&db->db_holds));
426		*db->db_user_data_ptr_ptr = db->db.db_data;
427	}
428}
429
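/*
 * Attach an ARC buffer to this dbuf (registering dbuf_do_evict as its
 * eviction callback), or detach its data entirely when buf is NULL.
 */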
430static void
431dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
432{
433	ASSERT(MUTEX_HELD(&db->db_mtx));
434	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
435	db->db_buf = buf;
436	if (buf != NULL) {
437		ASSERT(buf->b_data != NULL);
438		db->db.db_data = buf->b_data;
439		if (!arc_released(buf))
440			arc_set_callback(buf, dbuf_do_evict, db);
441		dbuf_update_data(db);
442	} else {
443		dbuf_evict_user(db);
444		db->db.db_data = NULL;
445		if (db->db_state != DB_NOFILL)
446			db->db_state = DB_UNCACHED;
447	}
448}
449
450/*
451 * Loan out an arc_buf for read.  Return the loaned arc_buf.
452 */
453arc_buf_t *
454dbuf_loan_arcbuf(dmu_buf_impl_t *db)
455{
456	arc_buf_t *abuf;
457
458	mutex_enter(&db->db_mtx);
459	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
460		int blksz = db->db.db_size;
461		spa_t *spa = db->db_objset->os_spa;
462
463		mutex_exit(&db->db_mtx);
464		abuf = arc_loan_buf(spa, blksz);
465		bcopy(db->db.db_data, abuf->b_data, blksz);
466	} else {
467		abuf = db->db_buf;
468		arc_loan_inuse_buf(abuf, db);
469		dbuf_set_data(db, NULL);
470		mutex_exit(&db->db_mtx);
471	}
472	return (abuf);
473}
474
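/* Return the block number that contains the given byte offset. */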
475uint64_t
476dbuf_whichblock(dnode_t *dn, uint64_t offset)
477{
478	if (dn->dn_datablkshift) {
479		return (offset >> dn->dn_datablkshift);
480	} else {
481		ASSERT3U(offset, <, dn->dn_datablksz);
482		return (0);
483	}
484}
485
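/*
 * ARC read callback: install the buffer and mark the dbuf CACHED, or drop
 * back to UNCACHED on error.  A dbuf freed while the read was in flight
 * is zero-filled instead.
 */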
486static void
487dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
488{
489	dmu_buf_impl_t *db = vdb;
490
491	mutex_enter(&db->db_mtx);
492	ASSERT3U(db->db_state, ==, DB_READ);
493	/*
494	 * All reads are synchronous, so we must have a hold on the dbuf
495	 */
496	ASSERT(refcount_count(&db->db_holds) > 0);
497	ASSERT(db->db_buf == NULL);
498	ASSERT(db->db.db_data == NULL);
499	if (db->db_level == 0 && db->db_freed_in_flight) {
500		/* we were freed in flight; disregard any error */
501		arc_release(buf, db);
502		bzero(buf->b_data, db->db.db_size);
503		arc_buf_freeze(buf);
504		db->db_freed_in_flight = FALSE;
505		dbuf_set_data(db, buf);
506		db->db_state = DB_CACHED;
507	} else if (zio == NULL || zio->io_error == 0) {
508		dbuf_set_data(db, buf);
509		db->db_state = DB_CACHED;
510	} else {
511		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
512		ASSERT3P(db->db_buf, ==, NULL);
513		VERIFY(arc_buf_remove_ref(buf, db));
514		db->db_state = DB_UNCACHED;
515	}
516	cv_broadcast(&db->db_changed);
517	dbuf_rele_and_unlock(db, NULL);
518}
519
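/*
 * Start a read of this dbuf.  Bonus buffers and holes are satisfied
 * immediately from memory; everything else is handed to arc_read().
 * Called with db_mtx held; the mutex is dropped before returning.
 */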
520static void
521dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
522{
523	dnode_t *dn;
524	zbookmark_phys_t zb;
525	uint32_t aflags = ARC_NOWAIT;
526
527	DB_DNODE_ENTER(db);
528	dn = DB_DNODE(db);
529	ASSERT(!refcount_is_zero(&db->db_holds));
530	/* We need the struct_rwlock to prevent db_blkptr from changing. */
531	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
532	ASSERT(MUTEX_HELD(&db->db_mtx));
533	ASSERT(db->db_state == DB_UNCACHED);
534	ASSERT(db->db_buf == NULL);
535
536	if (db->db_blkid == DMU_BONUS_BLKID) {
537		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
538
539		ASSERT3U(bonuslen, <=, db->db.db_size);
540		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
541		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
542		if (bonuslen < DN_MAX_BONUSLEN)
543			bzero(db->db.db_data, DN_MAX_BONUSLEN);
544		if (bonuslen)
545			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
546		DB_DNODE_EXIT(db);
547		dbuf_update_data(db);
548		db->db_state = DB_CACHED;
549		mutex_exit(&db->db_mtx);
550		return;
551	}
552
553	/*
554	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
555	 * processes the delete record and clears the bp while we are waiting
556	 * for the dn_mtx (resulting in a "no" from block_freed).
557	 */
558	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
559	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
560	    BP_IS_HOLE(db->db_blkptr)))) {
561		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
562
563		DB_DNODE_EXIT(db);
564		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
565		    db->db.db_size, db, type));
566		bzero(db->db.db_data, db->db.db_size);
567		db->db_state = DB_CACHED;
568		*flags |= DB_RF_CACHED;
569		mutex_exit(&db->db_mtx);
570		return;
571	}
572
573	DB_DNODE_EXIT(db);
574
575	db->db_state = DB_READ;
576	mutex_exit(&db->db_mtx);
577
578	if (DBUF_IS_L2CACHEABLE(db))
579		aflags |= ARC_L2CACHE;
580	if (DBUF_IS_L2COMPRESSIBLE(db))
581		aflags |= ARC_L2COMPRESS;
582
583	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
584	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
585	    db->db.db_object, db->db_level, db->db_blkid);
586
587	dbuf_add_ref(db, NULL);
588
589	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
590	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
591	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
592	    &aflags, &zb);
593	if (aflags & ARC_CACHED)
594		*flags |= DB_RF_CACHED;
595}
596
597int
598dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
599{
600	int err = 0;
601	boolean_t havepzio = (zio != NULL);
602	boolean_t prefetch;
603	dnode_t *dn;
604
605	/*
606	 * We don't have to hold the mutex to check db_state because the
607	 * dbuf can't be freed while we have a hold on it.
608	 */
609	ASSERT(!refcount_is_zero(&db->db_holds));
610
611	if (db->db_state == DB_NOFILL)
612		return (SET_ERROR(EIO));
613
614	DB_DNODE_ENTER(db);
615	dn = DB_DNODE(db);
616	if ((flags & DB_RF_HAVESTRUCT) == 0)
617		rw_enter(&dn->dn_struct_rwlock, RW_READER);
618
619	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
620	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
621	    DBUF_IS_CACHEABLE(db);
622
623	mutex_enter(&db->db_mtx);
624	if (db->db_state == DB_CACHED) {
625		mutex_exit(&db->db_mtx);
626		if (prefetch)
627			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
628			    db->db.db_size, TRUE);
629		if ((flags & DB_RF_HAVESTRUCT) == 0)
630			rw_exit(&dn->dn_struct_rwlock);
631		DB_DNODE_EXIT(db);
632	} else if (db->db_state == DB_UNCACHED) {
633		spa_t *spa = dn->dn_objset->os_spa;
634
635		if (zio == NULL)
636			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
637		dbuf_read_impl(db, zio, &flags);
638
639		/* dbuf_read_impl has dropped db_mtx for us */
640
641		if (prefetch)
642			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
643			    db->db.db_size, flags & DB_RF_CACHED);
644
645		if ((flags & DB_RF_HAVESTRUCT) == 0)
646			rw_exit(&dn->dn_struct_rwlock);
647		DB_DNODE_EXIT(db);
648
649		if (!havepzio)
650			err = zio_wait(zio);
651	} else {
652		/*
653		 * Another reader came in while the dbuf was in flight
654		 * between UNCACHED and CACHED.  Either a writer will finish
655		 * writing the buffer (sending the dbuf to CACHED) or the
656		 * first reader's request will reach the read_done callback
657		 * and send the dbuf to CACHED.  Otherwise, a failure
658		 * occurred and the dbuf went to UNCACHED.
659		 */
660		mutex_exit(&db->db_mtx);
661		if (prefetch)
662			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
663			    db->db.db_size, TRUE);
664		if ((flags & DB_RF_HAVESTRUCT) == 0)
665			rw_exit(&dn->dn_struct_rwlock);
666		DB_DNODE_EXIT(db);
667
668		/* Wait for the dbuf to settle, unless the caller asked us not to. */
669		mutex_enter(&db->db_mtx);
670		if ((flags & DB_RF_NEVERWAIT) == 0) {
671			while (db->db_state == DB_READ ||
672			    db->db_state == DB_FILL) {
673				ASSERT(db->db_state == DB_READ ||
674				    (flags & DB_RF_HAVESTRUCT) == 0);
675				cv_wait(&db->db_changed, &db->db_mtx);
676			}
677			if (db->db_state == DB_UNCACHED)
678				err = SET_ERROR(EIO);
679		}
680		mutex_exit(&db->db_mtx);
681	}
682
683	ASSERT(err || havepzio || db->db_state == DB_CACHED);
684	return (err);
685}
686
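/*
 * Prepare a dbuf to be overwritten without reading its old contents: if
 * it is uncached, allocate a fresh buffer and move it to DB_FILL.
 */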
687static void
688dbuf_noread(dmu_buf_impl_t *db)
689{
690	ASSERT(!refcount_is_zero(&db->db_holds));
691	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
692	mutex_enter(&db->db_mtx);
693	while (db->db_state == DB_READ || db->db_state == DB_FILL)
694		cv_wait(&db->db_changed, &db->db_mtx);
695	if (db->db_state == DB_UNCACHED) {
696		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
697		spa_t *spa = db->db_objset->os_spa;
698
699		ASSERT(db->db_buf == NULL);
700		ASSERT(db->db.db_data == NULL);
701		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
702		db->db_state = DB_FILL;
703	} else if (db->db_state == DB_NOFILL) {
704		dbuf_set_data(db, NULL);
705	} else {
706		ASSERT3U(db->db_state, ==, DB_CACHED);
707	}
708	mutex_exit(&db->db_mtx);
709}
710
711/*
712 * This is our just-in-time copy function.  It makes a copy of
713 * buffers that have been modified in a previous transaction
714 * group before we modify them in the currently active group.
715 *
716 * This function is used in two places: when we are dirtying a
717 * buffer for the first time in a txg, and when we are freeing
718 * a range in a dnode that includes this buffer.
719 *
720 * Note that when we are called from dbuf_free_range() we do
721 * not put a hold on the buffer, we just traverse the active
722 * dbuf list for the dnode.
723 */
724static void
725dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
726{
727	dbuf_dirty_record_t *dr = db->db_last_dirty;
728
729	ASSERT(MUTEX_HELD(&db->db_mtx));
730	ASSERT(db->db.db_data != NULL);
731	ASSERT(db->db_level == 0);
732	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
733
734	if (dr == NULL ||
735	    (dr->dt.dl.dr_data !=
736	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
737		return;
738
739	/*
740	 * If the last dirty record for this dbuf has not yet synced
741	 * and it's referencing the dbuf data, either:
742	 *	reset the reference to point to a new copy,
743	 * or (if there are no active holders)
744	 *	just null out the current db_data pointer.
745	 */
746	ASSERT(dr->dr_txg >= txg - 2);
747	if (db->db_blkid == DMU_BONUS_BLKID) {
748		/* Note that the data bufs here are zio_bufs */
749		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
750		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
751		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
752	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
753		int size = db->db.db_size;
754		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
755		spa_t *spa = db->db_objset->os_spa;
756
757		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
758		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
759	} else {
760		dbuf_set_data(db, NULL);
761	}
762}
763
764void
765dbuf_unoverride(dbuf_dirty_record_t *dr)
766{
767	dmu_buf_impl_t *db = dr->dr_dbuf;
768	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
769	uint64_t txg = dr->dr_txg;
770
771	ASSERT(MUTEX_HELD(&db->db_mtx));
772	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
773	ASSERT(db->db_level == 0);
774
775	if (db->db_blkid == DMU_BONUS_BLKID ||
776	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
777		return;
778
779	ASSERT(db->db_data_pending != dr);
780
781	/* free this block */
782	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
783		zio_free(db->db_objset->os_spa, txg, bp);
784
785	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
786	dr->dt.dl.dr_nopwrite = B_FALSE;
787
788	/*
789	 * Release the already-written buffer, so we leave it in
790	 * a consistent dirty state.  Note that all callers are
791	 * modifying the buffer, so they will immediately do
792	 * another (redundant) arc_release().  Therefore, leave
793	 * the buf thawed to save the effort of freezing &
794	 * immediately re-thawing it.
795	 */
796	arc_release(dr->dt.dl.dr_data, db);
797}
798
799/*
800 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
801 * data blocks in the free range, so that any future readers will find
802 * empty blocks.
803 *
804 * This is a no-op if the dataset is in the middle of an incremental
805 * receive; see comment below for details.
806 */
807void
808dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
809{
810	dmu_buf_impl_t *db, *db_next;
811	uint64_t txg = tx->tx_txg;
812
813	if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID))
814		end = dn->dn_maxblkid;
815	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
816
817	mutex_enter(&dn->dn_dbufs_mtx);
818	if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
819		/* There can't be any dbufs in this range; no need to search. */
820		mutex_exit(&dn->dn_dbufs_mtx);
821		return;
822	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
823		/*
824		 * If we are receiving, we expect there to be no dbufs in
825		 * the range to be freed, because receive modifies each
826		 * block at most once, and in offset order.  If this is
827		 * not the case, it can lead to performance problems,
828		 * so note that we unexpectedly took the slow path.
829		 */
830		atomic_inc_64(&zfs_free_range_recv_miss);
831	}
832
833	for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
834		db_next = list_next(&dn->dn_dbufs, db);
835		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
836
837		if (db->db_level != 0)
838			continue;
839		if (db->db_blkid < start || db->db_blkid > end)
840			continue;
841
842		/* found a level 0 buffer in the range */
843		mutex_enter(&db->db_mtx);
844		if (dbuf_undirty(db, tx)) {
845			/* mutex has been dropped and dbuf destroyed */
846			continue;
847		}
848
849		if (db->db_state == DB_UNCACHED ||
850		    db->db_state == DB_NOFILL ||
851		    db->db_state == DB_EVICTING) {
852			ASSERT(db->db.db_data == NULL);
853			mutex_exit(&db->db_mtx);
854			continue;
855		}
856		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
857			/* will be handled in dbuf_read_done or dbuf_rele */
858			db->db_freed_in_flight = TRUE;
859			mutex_exit(&db->db_mtx);
860			continue;
861		}
862		if (refcount_count(&db->db_holds) == 0) {
863			ASSERT(db->db_buf);
864			dbuf_clear(db);
865			continue;
866		}
867		/* The dbuf is referenced */
868
869		if (db->db_last_dirty != NULL) {
870			dbuf_dirty_record_t *dr = db->db_last_dirty;
871
872			if (dr->dr_txg == txg) {
873				/*
874				 * This buffer is "in-use", re-adjust the file
875				 * size to reflect that this buffer may
876				 * contain new data when we sync.
877				 */
878				if (db->db_blkid != DMU_SPILL_BLKID &&
879				    db->db_blkid > dn->dn_maxblkid)
880					dn->dn_maxblkid = db->db_blkid;
881				dbuf_unoverride(dr);
882			} else {
883				/*
884				 * This dbuf is not dirty in the open context.
885				 * Either uncache it (if it's not referenced in
886				 * the open context) or reset its contents to
887				 * empty.
888				 */
889				dbuf_fix_old_data(db, txg);
890			}
891		}
892		/* clear the contents if it's cached */
893		if (db->db_state == DB_CACHED) {
894			ASSERT(db->db.db_data != NULL);
895			arc_release(db->db_buf, db);
896			bzero(db->db.db_data, db->db.db_size);
897			arc_buf_freeze(db->db_buf);
898		}
899
900		mutex_exit(&db->db_mtx);
901	}
902	mutex_exit(&dn->dn_dbufs_mtx);
903}
904
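/*
 * Return whether the block backing this dbuf would actually free space if
 * it were freed, i.e. whether it exists and (as far as we can tell here)
 * is not held by a snapshot.
 */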
905static int
906dbuf_block_freeable(dmu_buf_impl_t *db)
907{
908	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
909	uint64_t birth_txg = 0;
910
911	/*
912	 * We don't need any locking to protect db_blkptr:
913	 * If it's syncing, then db_last_dirty will be set
914	 * so we'll ignore db_blkptr.
915	 *
916	 * This logic ensures that only block births for
917	 * filled blocks are considered.
918	 */
919	ASSERT(MUTEX_HELD(&db->db_mtx));
920	if (db->db_last_dirty && (db->db_blkptr == NULL ||
921	    !BP_IS_HOLE(db->db_blkptr))) {
922		birth_txg = db->db_last_dirty->dr_txg;
923	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
924		birth_txg = db->db_blkptr->blk_birth;
925	}
926
927	/*
928	 * If this block doesn't exist or is in a snapshot, it can't be freed.
929	 * Don't pass the bp to dsl_dataset_block_freeable() since we
930	 * are holding the db_mtx lock and might deadlock if we are
931	 * prefetching a dedup-ed block.
932	 */
933	if (birth_txg != 0)
934		return (ds == NULL ||
935		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
936	else
937		return (B_FALSE);
938}
939
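/*
 * Change the size of a dbuf: dirty it, allocate a new buffer of the
 * requested size, and copy (and zero-extend) the old contents into it.
 */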
940void
941dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
942{
943	arc_buf_t *buf, *obuf;
944	int osize = db->db.db_size;
945	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
946	dnode_t *dn;
947
948	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
949
950	DB_DNODE_ENTER(db);
951	dn = DB_DNODE(db);
952
953	/* XXX does *this* func really need the lock? */
954	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
955
956	/*
957	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
958	 * is OK, because there can be no other references to the db
959	 * when we are changing its size, so no concurrent DB_FILL can
960	 * be happening.
961	 */
962	/*
963	 * XXX we should be doing a dbuf_read, checking the return
964	 * value and returning that up to our callers
965	 */
966	dmu_buf_will_dirty(&db->db, tx);
967
968	/* create the data buffer for the new block */
969	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
970
971	/* copy old block data to the new block */
972	obuf = db->db_buf;
973	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
974	/* zero the remainder */
975	if (size > osize)
976		bzero((uint8_t *)buf->b_data + osize, size - osize);
977
978	mutex_enter(&db->db_mtx);
979	dbuf_set_data(db, buf);
980	VERIFY(arc_buf_remove_ref(obuf, db));
981	db->db.db_size = size;
982
983	if (db->db_level == 0) {
984		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
985		db->db_last_dirty->dt.dl.dr_data = buf;
986	}
987	mutex_exit(&db->db_mtx);
988
989	dnode_willuse_space(dn, size-osize, tx);
990	DB_DNODE_EXIT(db);
991}
992
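/*
 * arc_release() this dbuf's buffer so that its contents can be modified;
 * asserts that we are in syncing context.
 */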
993void
994dbuf_release_bp(dmu_buf_impl_t *db)
995{
996	objset_t *os = db->db_objset;
997
998	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
999	ASSERT(arc_released(os->os_phys_buf) ||
1000	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
1001	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1002
1003	(void) arc_release(db->db_buf, db);
1004}
1005
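/*
 * Mark this dbuf dirty in the given transaction: create (or reuse) a
 * dirty record, preserve older copies of the data if needed, account for
 * the space, and recursively dirty the parent indirect block or dnode.
 */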
1006dbuf_dirty_record_t *
1007dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1008{
1009	dnode_t *dn;
1010	objset_t *os;
1011	dbuf_dirty_record_t **drp, *dr;
1012	int drop_struct_lock = FALSE;
1013	boolean_t do_free_accounting = B_FALSE;
1014	int txgoff = tx->tx_txg & TXG_MASK;
1015
1016	ASSERT(tx->tx_txg != 0);
1017	ASSERT(!refcount_is_zero(&db->db_holds));
1018	DMU_TX_DIRTY_BUF(tx, db);
1019
1020	DB_DNODE_ENTER(db);
1021	dn = DB_DNODE(db);
1022	/*
1023	 * Shouldn't dirty a regular buffer in syncing context.  Private
1024	 * objects may be dirtied in syncing context, but only if they
1025	 * were already pre-dirtied in open context.
1026	 */
1027	ASSERT(!dmu_tx_is_syncing(tx) ||
1028	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1029	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1030	    dn->dn_objset->os_dsl_dataset == NULL);
1031	/*
1032	 * We make this assert for private objects as well, but after we
1033	 * check if we're already dirty.  They are allowed to re-dirty
1034	 * in syncing context.
1035	 */
1036	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1037	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1038	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1039
1040	mutex_enter(&db->db_mtx);
1041	/*
1042	 * XXX make this true for indirects too?  The problem is that
1043	 * transactions created with dmu_tx_create_assigned() from
1044	 * syncing context don't bother holding ahead.
1045	 */
1046	ASSERT(db->db_level != 0 ||
1047	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1048	    db->db_state == DB_NOFILL);
1049
1050	mutex_enter(&dn->dn_mtx);
1051	/*
1052	 * Don't set dirtyctx to SYNC if we're just modifying this as we
1053	 * initialize the objset.
1054	 */
1055	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1056	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1057		dn->dn_dirtyctx =
1058		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1059		ASSERT(dn->dn_dirtyctx_firstset == NULL);
1060		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1061	}
1062	mutex_exit(&dn->dn_mtx);
1063
1064	if (db->db_blkid == DMU_SPILL_BLKID)
1065		dn->dn_have_spill = B_TRUE;
1066
1067	/*
1068	 * If this buffer is already dirty, we're done.
1069	 */
1070	drp = &db->db_last_dirty;
1071	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1072	    db->db.db_object == DMU_META_DNODE_OBJECT);
1073	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1074		drp = &dr->dr_next;
1075	if (dr && dr->dr_txg == tx->tx_txg) {
1076		DB_DNODE_EXIT(db);
1077
1078		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1079			/*
1080			 * If this buffer has already been written out,
1081			 * we now need to reset its state.
1082			 */
1083			dbuf_unoverride(dr);
1084			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1085			    db->db_state != DB_NOFILL)
1086				arc_buf_thaw(db->db_buf);
1087		}
1088		mutex_exit(&db->db_mtx);
1089		return (dr);
1090	}
1091
1092	/*
1093	 * Only valid if not already dirty.
1094	 */
1095	ASSERT(dn->dn_object == 0 ||
1096	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1097	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1098
1099	ASSERT3U(dn->dn_nlevels, >, db->db_level);
1100	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1101	    dn->dn_phys->dn_nlevels > db->db_level ||
1102	    dn->dn_next_nlevels[txgoff] > db->db_level ||
1103	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1104	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1105
1106	/*
1107	 * We should only be dirtying in syncing context if it's the
1108	 * mos or we're initializing the os or it's a special object.
1109	 * However, we are allowed to dirty in syncing context provided
1110	 * we already dirtied it in open context.  Hence we must make
1111	 * this assertion only if we're not already dirty.
1112	 */
1113	os = dn->dn_objset;
1114	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1115	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1116	ASSERT(db->db.db_size != 0);
1117
1118	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1119
1120	if (db->db_blkid != DMU_BONUS_BLKID) {
1121		/*
1122		 * Update the accounting.
1123		 * Note: we delay "free accounting" until after we drop
1124		 * the db_mtx.  This keeps us from grabbing other locks
1125		 * (and possibly deadlocking) in bp_get_dsize() while
1126		 * also holding the db_mtx.
1127		 */
1128		dnode_willuse_space(dn, db->db.db_size, tx);
1129		do_free_accounting = dbuf_block_freeable(db);
1130	}
1131
1132	/*
1133	 * If this buffer is dirty in an old transaction group we need
1134	 * to make a copy of it so that the changes we make in this
1135	 * transaction group won't leak out when we sync the older txg.
1136	 */
1137	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1138	if (db->db_level == 0) {
1139		void *data_old = db->db_buf;
1140
1141		if (db->db_state != DB_NOFILL) {
1142			if (db->db_blkid == DMU_BONUS_BLKID) {
1143				dbuf_fix_old_data(db, tx->tx_txg);
1144				data_old = db->db.db_data;
1145			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1146				/*
1147				 * Release the data buffer from the cache so
1148				 * that we can modify it without impacting
1149				 * possible other users of this cached data
1150				 * block.  Note that indirect blocks and
1151				 * private objects are not released until the
1152				 * syncing state (since they are only modified
1153				 * then).
1154				 */
1155				arc_release(db->db_buf, db);
1156				dbuf_fix_old_data(db, tx->tx_txg);
1157				data_old = db->db_buf;
1158			}
1159			ASSERT(data_old != NULL);
1160		}
1161		dr->dt.dl.dr_data = data_old;
1162	} else {
1163		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1164		list_create(&dr->dt.di.dr_children,
1165		    sizeof (dbuf_dirty_record_t),
1166		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
1167	}
1168	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1169		dr->dr_accounted = db->db.db_size;
1170	dr->dr_dbuf = db;
1171	dr->dr_txg = tx->tx_txg;
1172	dr->dr_next = *drp;
1173	*drp = dr;
1174
1175	/*
1176	 * We could have been freed_in_flight between the dbuf_noread
1177	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
1178	 * happened after the free.
1179	 */
1180	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1181	    db->db_blkid != DMU_SPILL_BLKID) {
1182		mutex_enter(&dn->dn_mtx);
1183		if (dn->dn_free_ranges[txgoff] != NULL) {
1184			range_tree_clear(dn->dn_free_ranges[txgoff],
1185			    db->db_blkid, 1);
1186		}
1187		mutex_exit(&dn->dn_mtx);
1188		db->db_freed_in_flight = FALSE;
1189	}
1190
1191	/*
1192	 * This buffer is now part of this txg
1193	 */
1194	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1195	db->db_dirtycnt += 1;
1196	ASSERT3U(db->db_dirtycnt, <=, 3);
1197
1198	mutex_exit(&db->db_mtx);
1199
1200	if (db->db_blkid == DMU_BONUS_BLKID ||
1201	    db->db_blkid == DMU_SPILL_BLKID) {
1202		mutex_enter(&dn->dn_mtx);
1203		ASSERT(!list_link_active(&dr->dr_dirty_node));
1204		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1205		mutex_exit(&dn->dn_mtx);
1206		dnode_setdirty(dn, tx);
1207		DB_DNODE_EXIT(db);
1208		return (dr);
1209	} else if (do_free_accounting) {
1210		blkptr_t *bp = db->db_blkptr;
1211		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1212		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1213		/*
1214		 * This is only a guess -- if the dbuf is dirty
1215		 * in a previous txg, we don't know how much
1216		 * space it will use on disk yet.  We should
1217		 * really have the struct_rwlock to access
1218		 * db_blkptr, but since this is just a guess,
1219		 * it's OK if we get an odd answer.
1220		 */
1221		ddt_prefetch(os->os_spa, bp);
1222		dnode_willuse_space(dn, -willfree, tx);
1223	}
1224
1225	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1226		rw_enter(&dn->dn_struct_rwlock, RW_READER);
1227		drop_struct_lock = TRUE;
1228	}
1229
1230	if (db->db_level == 0) {
1231		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1232		ASSERT(dn->dn_maxblkid >= db->db_blkid);
1233	}
1234
1235	if (db->db_level+1 < dn->dn_nlevels) {
1236		dmu_buf_impl_t *parent = db->db_parent;
1237		dbuf_dirty_record_t *di;
1238		int parent_held = FALSE;
1239
1240		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1241			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1242
1243			parent = dbuf_hold_level(dn, db->db_level+1,
1244			    db->db_blkid >> epbs, FTAG);
1245			ASSERT(parent != NULL);
1246			parent_held = TRUE;
1247		}
1248		if (drop_struct_lock)
1249			rw_exit(&dn->dn_struct_rwlock);
1250		ASSERT3U(db->db_level+1, ==, parent->db_level);
1251		di = dbuf_dirty(parent, tx);
1252		if (parent_held)
1253			dbuf_rele(parent, FTAG);
1254
1255		mutex_enter(&db->db_mtx);
1256		/*
1257		 * Since we've dropped the mutex, it's possible that
1258		 * dbuf_undirty() might have changed this out from under us.
1259		 */
1260		if (db->db_last_dirty == dr ||
1261		    dn->dn_object == DMU_META_DNODE_OBJECT) {
1262			mutex_enter(&di->dt.di.dr_mtx);
1263			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1264			ASSERT(!list_link_active(&dr->dr_dirty_node));
1265			list_insert_tail(&di->dt.di.dr_children, dr);
1266			mutex_exit(&di->dt.di.dr_mtx);
1267			dr->dr_parent = di;
1268		}
1269		mutex_exit(&db->db_mtx);
1270	} else {
1271		ASSERT(db->db_level+1 == dn->dn_nlevels);
1272		ASSERT(db->db_blkid < dn->dn_nblkptr);
1273		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1274		mutex_enter(&dn->dn_mtx);
1275		ASSERT(!list_link_active(&dr->dr_dirty_node));
1276		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1277		mutex_exit(&dn->dn_mtx);
1278		if (drop_struct_lock)
1279			rw_exit(&dn->dn_struct_rwlock);
1280	}
1281
1282	dnode_setdirty(dn, tx);
1283	DB_DNODE_EXIT(db);
1284	return (dr);
1285}
1286
1287/*
1288 * Undirty a buffer in the transaction group referenced by the given
1289 * transaction.  Return whether this evicted the dbuf.
1290 */
1291static boolean_t
1292dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1293{
1294	dnode_t *dn;
1295	uint64_t txg = tx->tx_txg;
1296	dbuf_dirty_record_t *dr, **drp;
1297
1298	ASSERT(txg != 0);
1299	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1300	ASSERT0(db->db_level);
1301	ASSERT(MUTEX_HELD(&db->db_mtx));
1302
1303	/*
1304	 * If this buffer is not dirty, we're done.
1305	 */
1306	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1307		if (dr->dr_txg <= txg)
1308			break;
1309	if (dr == NULL || dr->dr_txg < txg)
1310		return (B_FALSE);
1311	ASSERT(dr->dr_txg == txg);
1312	ASSERT(dr->dr_dbuf == db);
1313
1314	DB_DNODE_ENTER(db);
1315	dn = DB_DNODE(db);
1316
1317	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1318
1319	ASSERT(db->db.db_size != 0);
1320
1321	/*
1322	 * Any space we accounted for in dp_dirty_* will be cleaned up by
1323	 * dsl_pool_sync().  This is relatively rare so the discrepancy
1324	 * is not a big deal.
1325	 */
1326
1327	*drp = dr->dr_next;
1328
1329	/*
1330	 * Note that there are three places in dbuf_dirty()
1331	 * where this dirty record may be put on a list.
1332	 * Make sure to do a list_remove corresponding to
1333	 * every one of those list_insert calls.
1334	 */
1335	if (dr->dr_parent) {
1336		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1337		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1338		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1339	} else if (db->db_blkid == DMU_SPILL_BLKID ||
1340	    db->db_level+1 == dn->dn_nlevels) {
1341		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1342		mutex_enter(&dn->dn_mtx);
1343		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1344		mutex_exit(&dn->dn_mtx);
1345	}
1346	DB_DNODE_EXIT(db);
1347
1348	if (db->db_state != DB_NOFILL) {
1349		dbuf_unoverride(dr);
1350
1351		ASSERT(db->db_buf != NULL);
1352		ASSERT(dr->dt.dl.dr_data != NULL);
1353		if (dr->dt.dl.dr_data != db->db_buf)
1354			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1355	}
1356
1357	if (db->db_level != 0) {
1358		mutex_destroy(&dr->dt.di.dr_mtx);
1359		list_destroy(&dr->dt.di.dr_children);
1360	}
1361
1362	kmem_free(dr, sizeof (dbuf_dirty_record_t));
1363
1364	ASSERT(db->db_dirtycnt > 0);
1365	db->db_dirtycnt -= 1;
1366
1367	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1368		arc_buf_t *buf = db->db_buf;
1369
1370		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1371		dbuf_set_data(db, NULL);
1372		VERIFY(arc_buf_remove_ref(buf, db));
1373		dbuf_evict(db);
1374		return (B_TRUE);
1375	}
1376
1377	return (B_FALSE);
1378}
1379
1380void
1381dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1382{
1383	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1384	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1385
1386	ASSERT(tx->tx_txg != 0);
1387	ASSERT(!refcount_is_zero(&db->db_holds));
1388
1389	DB_DNODE_ENTER(db);
1390	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1391		rf |= DB_RF_HAVESTRUCT;
1392	DB_DNODE_EXIT(db);
1393	(void) dbuf_read(db, NULL, rf);
1394	(void) dbuf_dirty(db, tx);
1395}
1396
1397void
1398dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1399{
1400	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1401
1402	db->db_state = DB_NOFILL;
1403
1404	dmu_buf_will_fill(db_fake, tx);
1405}
1406
1407void
1408dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1409{
1410	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1411
1412	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1413	ASSERT(tx->tx_txg != 0);
1414	ASSERT(db->db_level == 0);
1415	ASSERT(!refcount_is_zero(&db->db_holds));
1416
1417	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1418	    dmu_tx_private_ok(tx));
1419
1420	dbuf_noread(db);
1421	(void) dbuf_dirty(db, tx);
1422}
1423
1424#pragma weak dmu_buf_fill_done = dbuf_fill_done
1425/* ARGSUSED */
1426void
1427dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1428{
1429	mutex_enter(&db->db_mtx);
1430	DBUF_VERIFY(db);
1431
1432	if (db->db_state == DB_FILL) {
1433		if (db->db_level == 0 && db->db_freed_in_flight) {
1434			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1435			/* we were freed while filling */
1436			/* XXX dbuf_undirty? */
1437			bzero(db->db.db_data, db->db.db_size);
1438			db->db_freed_in_flight = FALSE;
1439		}
1440		db->db_state = DB_CACHED;
1441		cv_broadcast(&db->db_changed);
1442	}
1443	mutex_exit(&db->db_mtx);
1444}
1445
1446void
1447dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
1448    bp_embedded_type_t etype, enum zio_compress comp,
1449    int uncompressed_size, int compressed_size, int byteorder,
1450    dmu_tx_t *tx)
1451{
1452	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
1453	struct dirty_leaf *dl;
1454	dmu_object_type_t type;
1455
1456	DB_DNODE_ENTER(db);
1457	type = DB_DNODE(db)->dn_type;
1458	DB_DNODE_EXIT(db);
1459
1460	ASSERT0(db->db_level);
1461	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1462
1463	dmu_buf_will_not_fill(dbuf, tx);
1464
1465	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1466	dl = &db->db_last_dirty->dt.dl;
1467	encode_embedded_bp_compressed(&dl->dr_overridden_by,
1468	    data, comp, uncompressed_size, compressed_size);
1469	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
1470	BP_SET_TYPE(&dl->dr_overridden_by, type);
1471	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
1472	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
1473
1474	dl->dr_override_state = DR_OVERRIDDEN;
1475	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
1476}
1477
1478/*
1479 * Directly assign a provided arc buf to a given dbuf if it's not referenced
1480 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1481 */
1482void
1483dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1484{
1485	ASSERT(!refcount_is_zero(&db->db_holds));
1486	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1487	ASSERT(db->db_level == 0);
1488	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1489	ASSERT(buf != NULL);
1490	ASSERT(arc_buf_size(buf) == db->db.db_size);
1491	ASSERT(tx->tx_txg != 0);
1492
1493	arc_return_buf(buf, db);
1494	ASSERT(arc_released(buf));
1495
1496	mutex_enter(&db->db_mtx);
1497
1498	while (db->db_state == DB_READ || db->db_state == DB_FILL)
1499		cv_wait(&db->db_changed, &db->db_mtx);
1500
1501	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1502
1503	if (db->db_state == DB_CACHED &&
1504	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1505		mutex_exit(&db->db_mtx);
1506		(void) dbuf_dirty(db, tx);
1507		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1508		VERIFY(arc_buf_remove_ref(buf, db));
1509		xuio_stat_wbuf_copied();
1510		return;
1511	}
1512
1513	xuio_stat_wbuf_nocopy();
1514	if (db->db_state == DB_CACHED) {
1515		dbuf_dirty_record_t *dr = db->db_last_dirty;
1516
1517		ASSERT(db->db_buf != NULL);
1518		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1519			ASSERT(dr->dt.dl.dr_data == db->db_buf);
1520			if (!arc_released(db->db_buf)) {
1521				ASSERT(dr->dt.dl.dr_override_state ==
1522				    DR_OVERRIDDEN);
1523				arc_release(db->db_buf, db);
1524			}
1525			dr->dt.dl.dr_data = buf;
1526			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1527		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1528			arc_release(db->db_buf, db);
1529			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1530		}
1531		db->db_buf = NULL;
1532	}
1533	ASSERT(db->db_buf == NULL);
1534	dbuf_set_data(db, buf);
1535	db->db_state = DB_FILL;
1536	mutex_exit(&db->db_mtx);
1537	(void) dbuf_dirty(db, tx);
1538	dmu_buf_fill_done(&db->db, tx);
1539}
1540
1541/*
1542 * "Clear" the contents of this dbuf.  This will mark the dbuf
1543 * EVICTING and clear *most* of its references.  Unfortunately,
1544 * when we are not holding the dn_dbufs_mtx, we can't clear the
1545 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1546 * in this case.  For callers from the DMU we will usually see:
1547 *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1548 * For the arc callback, we will usually see:
1549 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1550 * Sometimes, though, we will get a mix of these two:
1551 *	DMU: dbuf_clear()->arc_buf_evict()
1552 *	ARC: dbuf_do_evict()->dbuf_destroy()
1553 */
1554void
1555dbuf_clear(dmu_buf_impl_t *db)
1556{
1557	dnode_t *dn;
1558	dmu_buf_impl_t *parent = db->db_parent;
1559	dmu_buf_impl_t *dndb;
1560	int dbuf_gone = FALSE;
1561
1562	ASSERT(MUTEX_HELD(&db->db_mtx));
1563	ASSERT(refcount_is_zero(&db->db_holds));
1564
1565	dbuf_evict_user(db);
1566
1567	if (db->db_state == DB_CACHED) {
1568		ASSERT(db->db.db_data != NULL);
1569		if (db->db_blkid == DMU_BONUS_BLKID) {
1570			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1571			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1572		}
1573		db->db.db_data = NULL;
1574		db->db_state = DB_UNCACHED;
1575	}
1576
1577	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1578	ASSERT(db->db_data_pending == NULL);
1579
1580	db->db_state = DB_EVICTING;
1581	db->db_blkptr = NULL;
1582
1583	DB_DNODE_ENTER(db);
1584	dn = DB_DNODE(db);
1585	dndb = dn->dn_dbuf;
1586	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1587		list_remove(&dn->dn_dbufs, db);
1588		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1589		membar_producer();
1590		DB_DNODE_EXIT(db);
1591		/*
1592		 * Decrementing the dbuf count means that the hold corresponding
1593		 * to the removed dbuf is no longer discounted in dnode_move(),
1594		 * so the dnode cannot be moved until after we release the hold.
1595		 * The membar_producer() ensures visibility of the decremented
1596		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1597		 * release any lock.
1598		 */
1599		dnode_rele(dn, db);
1600		db->db_dnode_handle = NULL;
1601	} else {
1602		DB_DNODE_EXIT(db);
1603	}
1604
1605	if (db->db_buf)
1606		dbuf_gone = arc_buf_evict(db->db_buf);
1607
1608	if (!dbuf_gone)
1609		mutex_exit(&db->db_mtx);
1610
1611	/*
1612	 * If this dbuf is referenced from an indirect dbuf,
1613	 * decrement the ref count on the indirect dbuf.
1614	 */
1615	if (parent && parent != dndb)
1616		dbuf_rele(parent, db);
1617}
1618
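/*
 * Find the parent dbuf and block pointer for (level, blkid), reading in
 * the parent indirect block if necessary.  Returns ENOENT if the block
 * has no parent yet.
 */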
1619static int
1620dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1621    dmu_buf_impl_t **parentp, blkptr_t **bpp)
1622{
1623	int nlevels, epbs;
1624
1625	*parentp = NULL;
1626	*bpp = NULL;
1627
1628	ASSERT(blkid != DMU_BONUS_BLKID);
1629
1630	if (blkid == DMU_SPILL_BLKID) {
1631		mutex_enter(&dn->dn_mtx);
1632		if (dn->dn_have_spill &&
1633		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1634			*bpp = &dn->dn_phys->dn_spill;
1635		else
1636			*bpp = NULL;
1637		dbuf_add_ref(dn->dn_dbuf, NULL);
1638		*parentp = dn->dn_dbuf;
1639		mutex_exit(&dn->dn_mtx);
1640		return (0);
1641	}
1642
1643	if (dn->dn_phys->dn_nlevels == 0)
1644		nlevels = 1;
1645	else
1646		nlevels = dn->dn_phys->dn_nlevels;
1647
1648	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1649
1650	ASSERT3U(level * epbs, <, 64);
1651	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1652	if (level >= nlevels ||
1653	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1654		/* the buffer has no parent yet */
1655		return (SET_ERROR(ENOENT));
1656	} else if (level < nlevels-1) {
1657		/* this block is referenced from an indirect block */
1658		int err = dbuf_hold_impl(dn, level+1,
1659		    blkid >> epbs, fail_sparse, NULL, parentp);
1660		if (err)
1661			return (err);
1662		err = dbuf_read(*parentp, NULL,
1663		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1664		if (err) {
1665			dbuf_rele(*parentp, NULL);
1666			*parentp = NULL;
1667			return (err);
1668		}
1669		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1670		    (blkid & ((1ULL << epbs) - 1));
1671		return (0);
1672	} else {
1673		/* the block is referenced from the dnode */
1674		ASSERT3U(level, ==, nlevels-1);
1675		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1676		    blkid < dn->dn_phys->dn_nblkptr);
1677		if (dn->dn_dbuf) {
1678			dbuf_add_ref(dn->dn_dbuf, NULL);
1679			*parentp = dn->dn_dbuf;
1680		}
1681		*bpp = &dn->dn_phys->dn_blkptr[blkid];
1682		return (0);
1683	}
1684}
1685
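/*
 * Allocate and initialize a new dbuf for block "blkid" at "level" of dnode
 * "dn", insert it into the dbuf hash table, and add it to the dnode's
 * dn_dbufs list.  If another thread inserts the same dbuf first, the new
 * dbuf is freed and the existing one is returned.  The bonus dbuf is a
 * special case: it is neither hashed nor placed on dn_dbufs.
 */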
1686static dmu_buf_impl_t *
1687dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1688    dmu_buf_impl_t *parent, blkptr_t *blkptr)
1689{
1690	objset_t *os = dn->dn_objset;
1691	dmu_buf_impl_t *db, *odb;
1692
1693	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1694	ASSERT(dn->dn_type != DMU_OT_NONE);
1695
1696	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1697
1698	db->db_objset = os;
1699	db->db.db_object = dn->dn_object;
1700	db->db_level = level;
1701	db->db_blkid = blkid;
1702	db->db_last_dirty = NULL;
1703	db->db_dirtycnt = 0;
1704	db->db_dnode_handle = dn->dn_handle;
1705	db->db_parent = parent;
1706	db->db_blkptr = blkptr;
1707
1708	db->db_user_ptr = NULL;
1709	db->db_user_data_ptr_ptr = NULL;
1710	db->db_evict_func = NULL;
1711	db->db_immediate_evict = 0;
1712	db->db_freed_in_flight = 0;
1713
1714	if (blkid == DMU_BONUS_BLKID) {
1715		ASSERT3P(parent, ==, dn->dn_dbuf);
1716		db->db.db_size = DN_MAX_BONUSLEN -
1717		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1718		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1719		db->db.db_offset = DMU_BONUS_BLKID;
1720		db->db_state = DB_UNCACHED;
1721		/* the bonus dbuf is not placed in the hash table */
1722		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1723		return (db);
1724	} else if (blkid == DMU_SPILL_BLKID) {
1725		db->db.db_size = (blkptr != NULL) ?
1726		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1727		db->db.db_offset = 0;
1728	} else {
1729		int blocksize =
1730		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1731		db->db.db_size = blocksize;
1732		db->db.db_offset = db->db_blkid * blocksize;
1733	}
1734
1735	/*
1736	 * Hold the dn_dbufs_mtx while we insert the new dbuf
1737	 * into the hash table *and* add it to the dn_dbufs list.
1738	 * This prevents a possible deadlock with someone
1739	 * trying to look up this dbuf before it's added to the
1740	 * dn_dbufs list.
1741	 */
1742	mutex_enter(&dn->dn_dbufs_mtx);
1743	db->db_state = DB_EVICTING;
1744	if ((odb = dbuf_hash_insert(db)) != NULL) {
1745		/* someone else inserted it first */
1746		kmem_cache_free(dbuf_cache, db);
1747		mutex_exit(&dn->dn_dbufs_mtx);
1748		return (odb);
1749	}
1750	list_insert_head(&dn->dn_dbufs, db);
1751	if (db->db_level == 0 && db->db_blkid >=
1752	    dn->dn_unlisted_l0_blkid)
1753		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
1754	db->db_state = DB_UNCACHED;
1755	mutex_exit(&dn->dn_dbufs_mtx);
1756	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1757
1758	if (parent && parent != dn->dn_dbuf)
1759		dbuf_add_ref(parent, db);
1760
1761	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1762	    refcount_count(&dn->dn_holds) > 0);
1763	(void) refcount_add(&dn->dn_holds, db);
1764	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
1765
1766	dprintf_dbuf(db, "db=%p\n", db);
1767
1768	return (db);
1769}
1770
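/*
 * Eviction callback registered with the ARC via arc_set_callback() (see
 * dbuf_write_done()).  Invoked when the ARC evicts the buffer backing this
 * dbuf: a CACHED dbuf is evicted, while a dbuf already in the EVICTING
 * state is destroyed outright.
 */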
1771static int
1772dbuf_do_evict(void *private)
1773{
1774	arc_buf_t *buf = private;
1775	dmu_buf_impl_t *db = buf->b_private;
1776
1777	if (!MUTEX_HELD(&db->db_mtx))
1778		mutex_enter(&db->db_mtx);
1779
1780	ASSERT(refcount_is_zero(&db->db_holds));
1781
1782	if (db->db_state != DB_EVICTING) {
1783		ASSERT(db->db_state == DB_CACHED);
1784		DBUF_VERIFY(db);
1785		db->db_buf = NULL;
1786		dbuf_evict(db);
1787	} else {
1788		mutex_exit(&db->db_mtx);
1789		dbuf_destroy(db);
1790	}
1791	return (0);
1792}
1793
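/*
 * Free a dbuf that has no remaining holds: remove it from the dnode's
 * dn_dbufs list and from the dbuf hash table (neither applies to the bonus
 * dbuf), drop the corresponding dnode hold, and return the structure to
 * the dbuf kmem cache.
 */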
1794static void
1795dbuf_destroy(dmu_buf_impl_t *db)
1796{
1797	ASSERT(refcount_is_zero(&db->db_holds));
1798
1799	if (db->db_blkid != DMU_BONUS_BLKID) {
1800		/*
1801		 * If this dbuf is still on the dn_dbufs list,
1802		 * remove it from that list.
1803		 */
1804		if (db->db_dnode_handle != NULL) {
1805			dnode_t *dn;
1806
1807			DB_DNODE_ENTER(db);
1808			dn = DB_DNODE(db);
1809			mutex_enter(&dn->dn_dbufs_mtx);
1810			list_remove(&dn->dn_dbufs, db);
1811			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1812			mutex_exit(&dn->dn_dbufs_mtx);
1813			DB_DNODE_EXIT(db);
1814			/*
1815			 * Decrementing the dbuf count means that the hold
1816			 * corresponding to the removed dbuf is no longer
1817			 * discounted in dnode_move(), so the dnode cannot be
1818			 * moved until after we release the hold.
1819			 */
1820			dnode_rele(dn, db);
1821			db->db_dnode_handle = NULL;
1822		}
1823		dbuf_hash_remove(db);
1824	}
1825	db->db_parent = NULL;
1826	db->db_buf = NULL;
1827
1828	ASSERT(!list_link_active(&db->db_link));
1829	ASSERT(db->db.db_data == NULL);
1830	ASSERT(db->db_hash_next == NULL);
1831	ASSERT(db->db_blkptr == NULL);
1832	ASSERT(db->db_data_pending == NULL);
1833
1834	kmem_cache_free(dbuf_cache, db);
1835	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1836}
1837
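/*
 * Issue a speculative, asynchronous read of level-0 block "blkid" if the
 * block is not already cached (or about to be) and has not been freed.
 * The read goes straight to the ARC; no dbuf is created for it.
 */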
1838void
1839dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
1840{
1841	dmu_buf_impl_t *db = NULL;
1842	blkptr_t *bp = NULL;
1843
1844	ASSERT(blkid != DMU_BONUS_BLKID);
1845	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1846
1847	if (dnode_block_freed(dn, blkid))
1848		return;
1849
1850	/* dbuf_find() returns with db_mtx held */
1851	if ((db = dbuf_find(dn, 0, blkid)) != NULL) {
1852		/*
1853		 * This dbuf is already in the cache.  We assume that
1854		 * it is already CACHED, or else about to be either
1855		 * read or filled.
1856		 */
1857		mutex_exit(&db->db_mtx);
1858		return;
1859	}
1860
1861	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1862		if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
1863			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1864			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1865			zbookmark_phys_t zb;
1866
1867			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1868			    dn->dn_object, 0, blkid);
1869
1870			(void) arc_read(NULL, dn->dn_objset->os_spa,
1871			    bp, NULL, NULL, prio,
1872			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1873			    &aflags, &zb);
1874		}
1875		if (db)
1876			dbuf_rele(db, NULL);
1877	}
1878}
1879
1880/*
1881 * Returns with db_holds incremented, and db_mtx not held.
1882 * Note: dn_struct_rwlock must be held.
1883 */
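/*
 * Illustrative sketch only (not part of the original code): a typical
 * caller holds dn_struct_rwlock across the lookup and drops its dbuf
 * hold when done, e.g.:
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	error = dbuf_hold_impl(dn, 0, blkid, FALSE, FTAG, &db);
 *	rw_exit(&dn->dn_struct_rwlock);
 *	if (error == 0) {
 *		... use db->db.db_data (possibly after dbuf_read()) ...
 *		dbuf_rele(db, FTAG);
 *	}
 */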
1884int
1885dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1886    void *tag, dmu_buf_impl_t **dbp)
1887{
1888	dmu_buf_impl_t *db, *parent = NULL;
1889
1890	ASSERT(blkid != DMU_BONUS_BLKID);
1891	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1892	ASSERT3U(dn->dn_nlevels, >, level);
1893
1894	*dbp = NULL;
1895top:
1896	/* dbuf_find() returns with db_mtx held */
1897	db = dbuf_find(dn, level, blkid);
1898
1899	if (db == NULL) {
1900		blkptr_t *bp = NULL;
1901		int err;
1902
1903		ASSERT3P(parent, ==, NULL);
1904		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1905		if (fail_sparse) {
1906			if (err == 0 && bp && BP_IS_HOLE(bp))
1907				err = SET_ERROR(ENOENT);
1908			if (err) {
1909				if (parent)
1910					dbuf_rele(parent, NULL);
1911				return (err);
1912			}
1913		}
1914		if (err && err != ENOENT)
1915			return (err);
1916		db = dbuf_create(dn, level, blkid, parent, bp);
1917	}
1918
1919	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1920		arc_buf_add_ref(db->db_buf, db);
1921		if (db->db_buf->b_data == NULL) {
1922			dbuf_clear(db);
1923			if (parent) {
1924				dbuf_rele(parent, NULL);
1925				parent = NULL;
1926			}
1927			goto top;
1928		}
1929		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1930	}
1931
1932	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1933
1934	/*
1935	 * If this buffer is currently syncing out, and we are
1936	 * still referencing it from db_data, we need to make a copy
1937	 * of it in case we decide we want to dirty it again in this txg.
1938	 */
1939	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1940	    dn->dn_object != DMU_META_DNODE_OBJECT &&
1941	    db->db_state == DB_CACHED && db->db_data_pending) {
1942		dbuf_dirty_record_t *dr = db->db_data_pending;
1943
1944		if (dr->dt.dl.dr_data == db->db_buf) {
1945			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1946
1947			dbuf_set_data(db,
1948			    arc_buf_alloc(dn->dn_objset->os_spa,
1949			    db->db.db_size, db, type));
1950			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
1951			    db->db.db_size);
1952		}
1953	}
1954
1955	(void) refcount_add(&db->db_holds, tag);
1956	dbuf_update_data(db);
1957	DBUF_VERIFY(db);
1958	mutex_exit(&db->db_mtx);
1959
1960	/* NOTE: we can't rele the parent until after we drop the db_mtx */
1961	if (parent)
1962		dbuf_rele(parent, NULL);
1963
1964	ASSERT3P(DB_DNODE(db), ==, dn);
1965	ASSERT3U(db->db_blkid, ==, blkid);
1966	ASSERT3U(db->db_level, ==, level);
1967	*dbp = db;
1968
1969	return (0);
1970}
1971
1972dmu_buf_impl_t *
1973dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1974{
1975	dmu_buf_impl_t *db;
1976	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1977	return (err ? NULL : db);
1978}
1979
1980dmu_buf_impl_t *
1981dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1982{
1983	dmu_buf_impl_t *db;
1984	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1985	return (err ? NULL : db);
1986}
1987
1988void
1989dbuf_create_bonus(dnode_t *dn)
1990{
1991	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1992
1993	ASSERT(dn->dn_bonus == NULL);
1994	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
1995}
1996
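/*
 * Change the block size of a dnode's spill block.  The requested size is
 * clamped to [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE] and rounded up to a
 * multiple of SPA_MINBLOCKSIZE; ENOTSUP is returned if the dbuf is not a
 * spill block.
 */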
1997int
1998dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
1999{
2000	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2001	dnode_t *dn;
2002
2003	if (db->db_blkid != DMU_SPILL_BLKID)
2004		return (SET_ERROR(ENOTSUP));
2005	if (blksz == 0)
2006		blksz = SPA_MINBLOCKSIZE;
2007	if (blksz > SPA_MAXBLOCKSIZE)
2008		blksz = SPA_MAXBLOCKSIZE;
2009	else
2010		blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
2011
2012	DB_DNODE_ENTER(db);
2013	dn = DB_DNODE(db);
2014	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2015	dbuf_new_size(db, blksz, tx);
2016	rw_exit(&dn->dn_struct_rwlock);
2017	DB_DNODE_EXIT(db);
2018
2019	return (0);
2020}
2021
2022void
2023dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
2024{
2025	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2026}
2027
2028#pragma weak dmu_buf_add_ref = dbuf_add_ref
2029void
2030dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2031{
2032	int64_t holds = refcount_add(&db->db_holds, tag);
2033	ASSERT(holds > 1);
2034}
2035
2036/*
2037 * If you call dbuf_rele() you had better not be referencing the dnode handle
2038 * unless you have some other direct or indirect hold on the dnode. (An indirect
2039 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2040 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2041 * dnode's parent dbuf evicting its dnode handles.
2042 */
2043void
2044dbuf_rele(dmu_buf_impl_t *db, void *tag)
2045{
2046	mutex_enter(&db->db_mtx);
2047	dbuf_rele_and_unlock(db, tag);
2048}
2049
2050void
2051dmu_buf_rele(dmu_buf_t *db, void *tag)
2052{
2053	dbuf_rele((dmu_buf_impl_t *)db, tag);
2054}
2055
2056/*
2057 * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
2058 * db_dirtycnt and db_holds to be updated atomically.
2059 */
2060void
2061dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2062{
2063	int64_t holds;
2064
2065	ASSERT(MUTEX_HELD(&db->db_mtx));
2066	DBUF_VERIFY(db);
2067
2068	/*
2069	 * Remove the reference to the dbuf before removing its hold on the
2070	 * dnode so we can guarantee in dnode_move() that a referenced bonus
2071	 * buffer has a corresponding dnode hold.
2072	 */
2073	holds = refcount_remove(&db->db_holds, tag);
2074	ASSERT(holds >= 0);
2075
2076	/*
2077	 * We can't freeze indirects if there is a possibility that they
2078	 * may be modified in the current syncing context.
2079	 */
2080	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2081		arc_buf_freeze(db->db_buf);
2082
2083	if (holds == db->db_dirtycnt &&
2084	    db->db_level == 0 && db->db_immediate_evict)
2085		dbuf_evict_user(db);
2086
2087	if (holds == 0) {
2088		if (db->db_blkid == DMU_BONUS_BLKID) {
2089			mutex_exit(&db->db_mtx);
2090
2091			/*
2092			 * If the dnode moves here, we cannot cross this barrier
2093			 * until the move completes.
2094			 */
2095			DB_DNODE_ENTER(db);
2096			(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
2097			DB_DNODE_EXIT(db);
2098			/*
2099			 * The bonus buffer's dnode hold is no longer discounted
2100			 * in dnode_move(). The dnode cannot move until after
2101			 * the dnode_rele().
2102			 */
2103			dnode_rele(DB_DNODE(db), db);
2104		} else if (db->db_buf == NULL) {
2105			/*
2106			 * This is a special case: we never associated this
2107			 * dbuf with any data allocated from the ARC.
2108			 */
2109			ASSERT(db->db_state == DB_UNCACHED ||
2110			    db->db_state == DB_NOFILL);
2111			dbuf_evict(db);
2112		} else if (arc_released(db->db_buf)) {
2113			arc_buf_t *buf = db->db_buf;
2114			/*
2115			 * This dbuf has anonymous data associated with it.
2116			 */
2117			dbuf_set_data(db, NULL);
2118			VERIFY(arc_buf_remove_ref(buf, db));
2119			dbuf_evict(db);
2120		} else {
2121			VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2122
2123			/*
2124			 * A dbuf will be eligible for eviction if either the
2125			 * 'primarycache' property is set or a duplicate
2126			 * copy of this buffer is already cached in the arc.
2127			 *
2128			 * In the case of the 'primarycache' property, a buffer
2129			 * is considered for eviction if it matches the
2130			 * criteria set in the property.
2131			 *
2132			 * To decide if our buffer is considered a
2133			 * duplicate, we must call into the arc to determine
2134			 * if multiple buffers are referencing the same
2135			 * block on-disk. If so, then we simply evict
2136			 * ourselves.
2137			 */
2138			if (!DBUF_IS_CACHEABLE(db) ||
2139			    arc_buf_eviction_needed(db->db_buf))
2140				dbuf_clear(db);
2141			else
2142				mutex_exit(&db->db_mtx);
2143		}
2144	} else {
2145		mutex_exit(&db->db_mtx);
2146	}
2147}
2148
2149#pragma weak dmu_buf_refcount = dbuf_refcount
2150uint64_t
2151dbuf_refcount(dmu_buf_impl_t *db)
2152{
2153	return (refcount_count(&db->db_holds));
2154}
2155
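/*
 * The dmu_buf_*_user() functions let a consumer attach an opaque pointer
 * (and an optional eviction callback) to a level-0 dbuf.  Illustrative
 * sketch only; "my_state", "ms_data", and "my_evict_func" are hypothetical
 * names, not part of this file:
 *
 *	winner = dmu_buf_set_user(dbuf, my_state, &my_state->ms_data,
 *	    my_evict_func);
 *	if (winner != NULL) {
 *		... another thread attached its state first; use "winner" ...
 *	}
 */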
2156void *
2157dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2158    dmu_buf_evict_func_t *evict_func)
2159{
2160	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2161	    user_data_ptr_ptr, evict_func));
2162}
2163
2164void *
2165dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2166    dmu_buf_evict_func_t *evict_func)
2167{
2168	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2169
2170	db->db_immediate_evict = TRUE;
2171	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2172	    user_data_ptr_ptr, evict_func));
2173}
2174
2175void *
2176dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2177    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
2178{
2179	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2180	ASSERT(db->db_level == 0);
2181
2182	ASSERT((user_ptr == NULL) == (evict_func == NULL));
2183
2184	mutex_enter(&db->db_mtx);
2185
2186	if (db->db_user_ptr == old_user_ptr) {
2187		db->db_user_ptr = user_ptr;
2188		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
2189		db->db_evict_func = evict_func;
2190
2191		dbuf_update_data(db);
2192	} else {
2193		old_user_ptr = db->db_user_ptr;
2194	}
2195
2196	mutex_exit(&db->db_mtx);
2197	return (old_user_ptr);
2198}
2199
2200void *
2201dmu_buf_get_user(dmu_buf_t *db_fake)
2202{
2203	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2204	ASSERT(!refcount_is_zero(&db->db_holds));
2205
2206	return (db->db_user_ptr);
2207}
2208
2209boolean_t
2210dmu_buf_freeable(dmu_buf_t *dbuf)
2211{
2212	boolean_t res = B_FALSE;
2213	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2214
2215	if (db->db_blkptr)
2216		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2217		    db->db_blkptr, db->db_blkptr->blk_birth);
2218
2219	return (res);
2220}
2221
2222blkptr_t *
2223dmu_buf_get_blkptr(dmu_buf_t *db)
2224{
2225	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2226	return (dbi->db_blkptr);
2227}
2228
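/*
 * Make sure db->db_blkptr points at the correct location for this dbuf:
 * either the dnode's spill/regular block pointer array or the appropriate
 * slot in the parent indirect block.  The parent dbuf is held and recorded
 * in db_parent if it was not already set.
 */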
2229static void
2230dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2231{
2232	/* ASSERT(dmu_tx_is_syncing(tx)) */
2233	ASSERT(MUTEX_HELD(&db->db_mtx));
2234
2235	if (db->db_blkptr != NULL)
2236		return;
2237
2238	if (db->db_blkid == DMU_SPILL_BLKID) {
2239		db->db_blkptr = &dn->dn_phys->dn_spill;
2240		BP_ZERO(db->db_blkptr);
2241		return;
2242	}
2243	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2244		/*
2245		 * This buffer was allocated at a time when there were
2246		 * no available blkptrs from the dnode, or it was
2247		 * inappropriate to hook it in (i.e., nlevels mismatch).
2248		 */
2249		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2250		ASSERT(db->db_parent == NULL);
2251		db->db_parent = dn->dn_dbuf;
2252		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2253		DBUF_VERIFY(db);
2254	} else {
2255		dmu_buf_impl_t *parent = db->db_parent;
2256		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2257
2258		ASSERT(dn->dn_phys->dn_nlevels > 1);
2259		if (parent == NULL) {
2260			mutex_exit(&db->db_mtx);
2261			rw_enter(&dn->dn_struct_rwlock, RW_READER);
2262			(void) dbuf_hold_impl(dn, db->db_level+1,
2263			    db->db_blkid >> epbs, FALSE, db, &parent);
2264			rw_exit(&dn->dn_struct_rwlock);
2265			mutex_enter(&db->db_mtx);
2266			db->db_parent = parent;
2267		}
2268		db->db_blkptr = (blkptr_t *)parent->db.db_data +
2269		    (db->db_blkid & ((1ULL << epbs) - 1));
2270		DBUF_VERIFY(db);
2271	}
2272}
2273
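/*
 * Sync a dirty indirect block: read it in if necessary, fix up its block
 * pointer, issue the write, and then sync all of its dirty children so
 * their writes become children of this block's zio.
 */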
2274static void
2275dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2276{
2277	dmu_buf_impl_t *db = dr->dr_dbuf;
2278	dnode_t *dn;
2279	zio_t *zio;
2280
2281	ASSERT(dmu_tx_is_syncing(tx));
2282
2283	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2284
2285	mutex_enter(&db->db_mtx);
2286
2287	ASSERT(db->db_level > 0);
2288	DBUF_VERIFY(db);
2289
2290	/* Read the block if it hasn't been read yet. */
2291	if (db->db_buf == NULL) {
2292		mutex_exit(&db->db_mtx);
2293		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2294		mutex_enter(&db->db_mtx);
2295	}
2296	ASSERT3U(db->db_state, ==, DB_CACHED);
2297	ASSERT(db->db_buf != NULL);
2298
2299	DB_DNODE_ENTER(db);
2300	dn = DB_DNODE(db);
2301	/* Indirect block size must match what the dnode thinks it is. */
2302	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2303	dbuf_check_blkptr(dn, db);
2304	DB_DNODE_EXIT(db);
2305
2306	/* Provide the pending dirty record to child dbufs */
2307	db->db_data_pending = dr;
2308
2309	mutex_exit(&db->db_mtx);
2310	dbuf_write(dr, db->db_buf, tx);
2311
2312	zio = dr->dr_zio;
2313	mutex_enter(&dr->dt.di.dr_mtx);
2314	dbuf_sync_list(&dr->dt.di.dr_children, tx);
2315	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2316	mutex_exit(&dr->dt.di.dr_mtx);
2317	zio_nowait(zio);
2318}
2319
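/*
 * Sync a dirty level-0 block.  Bonus buffers are simply copied into the
 * dnode phys; other buffers may first be copied if they are still in use
 * by the open txg, and are then handed to dbuf_write() to issue the I/O.
 */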
2320static void
2321dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2322{
2323	arc_buf_t **datap = &dr->dt.dl.dr_data;
2324	dmu_buf_impl_t *db = dr->dr_dbuf;
2325	dnode_t *dn;
2326	objset_t *os;
2327	uint64_t txg = tx->tx_txg;
2328
2329	ASSERT(dmu_tx_is_syncing(tx));
2330
2331	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2332
2333	mutex_enter(&db->db_mtx);
2334	/*
2335	 * To be synced, we must be dirtied.  But we
2336	 * might have been freed after the dirty.
2337	 */
2338	if (db->db_state == DB_UNCACHED) {
2339		/* This buffer has been freed since it was dirtied */
2340		ASSERT(db->db.db_data == NULL);
2341	} else if (db->db_state == DB_FILL) {
2342		/* This buffer was freed and is now being re-filled */
2343		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2344	} else {
2345		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2346	}
2347	DBUF_VERIFY(db);
2348
2349	DB_DNODE_ENTER(db);
2350	dn = DB_DNODE(db);
2351
2352	if (db->db_blkid == DMU_SPILL_BLKID) {
2353		mutex_enter(&dn->dn_mtx);
2354		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2355		mutex_exit(&dn->dn_mtx);
2356	}
2357
2358	/*
2359	 * If this is a bonus buffer, simply copy the bonus data into the
2360	 * dnode.  It will be written out when the dnode is synced (and it
2361	 * will be synced, since it must have been dirty for dbuf_sync to
2362	 * be called).
2363	 */
2364	if (db->db_blkid == DMU_BONUS_BLKID) {
2365		dbuf_dirty_record_t **drp;
2366
2367		ASSERT(*datap != NULL);
2368		ASSERT0(db->db_level);
2369		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2370		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2371		DB_DNODE_EXIT(db);
2372
2373		if (*datap != db->db.db_data) {
2374			zio_buf_free(*datap, DN_MAX_BONUSLEN);
2375			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2376		}
2377		db->db_data_pending = NULL;
2378		drp = &db->db_last_dirty;
2379		while (*drp != dr)
2380			drp = &(*drp)->dr_next;
2381		ASSERT(dr->dr_next == NULL);
2382		ASSERT(dr->dr_dbuf == db);
2383		*drp = dr->dr_next;
2384		if (dr->dr_dbuf->db_level != 0) {
2385			list_destroy(&dr->dt.di.dr_children);
2386			mutex_destroy(&dr->dt.di.dr_mtx);
2387		}
2388		kmem_free(dr, sizeof (dbuf_dirty_record_t));
2389		ASSERT(db->db_dirtycnt > 0);
2390		db->db_dirtycnt -= 1;
2391		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2392		return;
2393	}
2394
2395	os = dn->dn_objset;
2396
2397	/*
2398	 * This function may have dropped the db_mtx lock allowing a dmu_sync
2399	 * operation to sneak in. As a result, we need to ensure that we
2400	 * don't check the dr_override_state until we have returned from
2401	 * dbuf_check_blkptr.
2402	 */
2403	dbuf_check_blkptr(dn, db);
2404
2405	/*
2406	 * If this buffer is in the middle of an immediate write,
2407	 * wait for the synchronous IO to complete.
2408	 */
2409	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2410		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2411		cv_wait(&db->db_changed, &db->db_mtx);
2412		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2413	}
2414
2415	if (db->db_state != DB_NOFILL &&
2416	    dn->dn_object != DMU_META_DNODE_OBJECT &&
2417	    refcount_count(&db->db_holds) > 1 &&
2418	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2419	    *datap == db->db_buf) {
2420		/*
2421		 * If this buffer is currently "in use" (i.e., there
2422		 * are active holds and db_data still references it),
2423		 * then make a copy before we start the write so that
2424		 * any modifications from the open txg will not leak
2425		 * into this write.
2426		 *
2427		 * NOTE: this copy does not need to be made for
2428		 * objects only modified in the syncing context (e.g.
2429		 * DMU_OT_DNODE blocks).
2430		 */
2431		int blksz = arc_buf_size(*datap);
2432		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2433		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2434		bcopy(db->db.db_data, (*datap)->b_data, blksz);
2435	}
2436	db->db_data_pending = dr;
2437
2438	mutex_exit(&db->db_mtx);
2439
2440	dbuf_write(dr, *datap, tx);
2441
2442	ASSERT(!list_link_active(&dr->dr_dirty_node));
2443	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2444		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2445		DB_DNODE_EXIT(db);
2446	} else {
2447		/*
2448		 * Although zio_nowait() does not "wait for an IO", it does
2449		 * initiate the IO. If this is an empty write it seems plausible
2450		 * that the IO could actually be completed before the nowait
2451		 * returns. We need to DB_DNODE_EXIT() first in case
2452		 * zio_nowait() invalidates the dbuf.
2453		 */
2454		DB_DNODE_EXIT(db);
2455		zio_nowait(dr->dr_zio);
2456	}
2457}
2458
2459void
2460dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2461{
2462	dbuf_dirty_record_t *dr;
2463
2464	while ((dr = list_head(list)) != NULL) {
2465		if (dr->dr_zio != NULL) {
2466			/*
2467			 * If we find an already initialized zio then we
2468			 * are processing the meta-dnode, and we have finished.
2469			 * The dbufs for all dnodes are put back on the list
2470			 * during processing, so that we can zio_wait()
2471			 * these IOs after initiating all child IOs.
2472			 */
2473			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2474			    DMU_META_DNODE_OBJECT);
2475			break;
2476		}
2477		list_remove(list, dr);
2478		if (dr->dr_dbuf->db_level > 0)
2479			dbuf_sync_indirect(dr, tx);
2480		else
2481			dbuf_sync_leaf(dr, tx);
2482	}
2483}
2484
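/*
 * "Ready" callback for dbuf write zios: charge the space delta to the
 * dnode, update dn_maxblkid if needed, and recompute the block's fill
 * count (the number of non-hole dnodes, block pointers, or data blocks
 * beneath it).
 */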
2485/* ARGSUSED */
2486static void
2487dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2488{
2489	dmu_buf_impl_t *db = vdb;
2490	dnode_t *dn;
2491	blkptr_t *bp = zio->io_bp;
2492	blkptr_t *bp_orig = &zio->io_bp_orig;
2493	spa_t *spa = zio->io_spa;
2494	int64_t delta;
2495	uint64_t fill = 0;
2496	int i;
2497
2498	ASSERT3P(db->db_blkptr, ==, bp);
2499
2500	DB_DNODE_ENTER(db);
2501	dn = DB_DNODE(db);
2502	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2503	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2504	zio->io_prev_space_delta = delta;
2505
2506	if (bp->blk_birth != 0) {
2507		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2508		    BP_GET_TYPE(bp) == dn->dn_type) ||
2509		    (db->db_blkid == DMU_SPILL_BLKID &&
2510		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
2511		    BP_IS_EMBEDDED(bp));
2512		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2513	}
2514
2515	mutex_enter(&db->db_mtx);
2516
2517#ifdef ZFS_DEBUG
2518	if (db->db_blkid == DMU_SPILL_BLKID) {
2519		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2520		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2521		    db->db_blkptr == &dn->dn_phys->dn_spill);
2522	}
2523#endif
2524
2525	if (db->db_level == 0) {
2526		mutex_enter(&dn->dn_mtx);
2527		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2528		    db->db_blkid != DMU_SPILL_BLKID)
2529			dn->dn_phys->dn_maxblkid = db->db_blkid;
2530		mutex_exit(&dn->dn_mtx);
2531
2532		if (dn->dn_type == DMU_OT_DNODE) {
2533			dnode_phys_t *dnp = db->db.db_data;
2534			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2535			    i--, dnp++) {
2536				if (dnp->dn_type != DMU_OT_NONE)
2537					fill++;
2538			}
2539		} else {
2540			if (BP_IS_HOLE(bp)) {
2541				fill = 0;
2542			} else {
2543				fill = 1;
2544			}
2545		}
2546	} else {
2547		blkptr_t *ibp = db->db.db_data;
2548		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2549		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2550			if (BP_IS_HOLE(ibp))
2551				continue;
2552			fill += BP_GET_FILL(ibp);
2553		}
2554	}
2555	DB_DNODE_EXIT(db);
2556
2557	if (!BP_IS_EMBEDDED(bp))
2558		bp->blk_fill = fill;
2559
2560	mutex_exit(&db->db_mtx);
2561}
2562
2563/*
2564 * The SPA will call this callback several times for each zio - once
2565 * for every physical child i/o (zio->io_phys_children times).  This
2566 * allows the DMU to monitor the progress of each logical i/o.  For example,
2567 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
2568 * block.  There may be a long delay before all copies/fragments are completed,
2569 * so this callback allows us to retire dirty space gradually, as the physical
2570 * i/os complete.
2571 */
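/*
 * For example (illustrative numbers only): a 128K dirty record written
 * with two physical children retires 64K of dirty space as each child
 * i/o completes; any remainder from the integer division is cleaned up
 * later by dsl_pool_sync().
 */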
2572/* ARGSUSED */
2573static void
2574dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
2575{
2576	dmu_buf_impl_t *db = arg;
2577	objset_t *os = db->db_objset;
2578	dsl_pool_t *dp = dmu_objset_pool(os);
2579	dbuf_dirty_record_t *dr;
2580	int delta = 0;
2581
2582	dr = db->db_data_pending;
2583	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
2584
2585	/*
2586	 * The callback will be called io_phys_children times.  Retire one
2587	 * portion of our dirty space each time we are called.  Any rounding
2588	 * error will be cleaned up by dsl_pool_sync()'s call to
2589	 * dsl_pool_undirty_space().
2590	 */
2591	delta = dr->dr_accounted / zio->io_phys_children;
2592	dsl_pool_undirty_space(dp, delta, zio->io_txg);
2593}
2594
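/*
 * "Done" callback for dbuf write zios: perform dataset block accounting
 * (unless this was a nopwrite or rewrite), detach and free the dirty
 * record, and drop the hold that was taken when the dbuf was dirtied.
 */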
2595/* ARGSUSED */
2596static void
2597dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2598{
2599	dmu_buf_impl_t *db = vdb;
2600	blkptr_t *bp_orig = &zio->io_bp_orig;
2601	blkptr_t *bp = db->db_blkptr;
2602	objset_t *os = db->db_objset;
2603	dmu_tx_t *tx = os->os_synctx;
2604	dbuf_dirty_record_t **drp, *dr;
2605
2606	ASSERT0(zio->io_error);
2607	ASSERT(db->db_blkptr == bp);
2608
2609	/*
2610	 * For nopwrites and rewrites we ensure that the bp matches our
2611	 * original and bypass all the accounting.
2612	 */
2613	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2614		ASSERT(BP_EQUAL(bp, bp_orig));
2615	} else {
2616		dsl_dataset_t *ds = os->os_dsl_dataset;
2617		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
2618		dsl_dataset_block_born(ds, bp, tx);
2619	}
2620
2621	mutex_enter(&db->db_mtx);
2622
2623	DBUF_VERIFY(db);
2624
2625	drp = &db->db_last_dirty;
2626	while ((dr = *drp) != db->db_data_pending)
2627		drp = &dr->dr_next;
2628	ASSERT(!list_link_active(&dr->dr_dirty_node));
2629	ASSERT(dr->dr_dbuf == db);
2630	ASSERT(dr->dr_next == NULL);
2631	*drp = dr->dr_next;
2632
2633#ifdef ZFS_DEBUG
2634	if (db->db_blkid == DMU_SPILL_BLKID) {
2635		dnode_t *dn;
2636
2637		DB_DNODE_ENTER(db);
2638		dn = DB_DNODE(db);
2639		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2640		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2641		    db->db_blkptr == &dn->dn_phys->dn_spill);
2642		DB_DNODE_EXIT(db);
2643	}
2644#endif
2645
2646	if (db->db_level == 0) {
2647		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2648		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2649		if (db->db_state != DB_NOFILL) {
2650			if (dr->dt.dl.dr_data != db->db_buf)
2651				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
2652				    db));
2653			else if (!arc_released(db->db_buf))
2654				arc_set_callback(db->db_buf, dbuf_do_evict, db);
2655		}
2656	} else {
2657		dnode_t *dn;
2658
2659		DB_DNODE_ENTER(db);
2660		dn = DB_DNODE(db);
2661		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2662		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
2663		if (!BP_IS_HOLE(db->db_blkptr)) {
2664			int epbs =
2665			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2666			ASSERT3U(db->db_blkid, <=,
2667			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
2668			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2669			    db->db.db_size);
2670			if (!arc_released(db->db_buf))
2671				arc_set_callback(db->db_buf, dbuf_do_evict, db);
2672		}
2673		DB_DNODE_EXIT(db);
2674		mutex_destroy(&dr->dt.di.dr_mtx);
2675		list_destroy(&dr->dt.di.dr_children);
2676	}
2677	kmem_free(dr, sizeof (dbuf_dirty_record_t));
2678
2679	cv_broadcast(&db->db_changed);
2680	ASSERT(db->db_dirtycnt > 0);
2681	db->db_dirtycnt -= 1;
2682	db->db_data_pending = NULL;
2683	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
2684}
2685
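/*
 * Thin zio callbacks that adapt dbuf_write_ready()/dbuf_write_done() for
 * writes that do not go through the ARC: no-fill writes and writes whose
 * block pointer was provided by open context (dmu_sync()/embedded data).
 */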
2686static void
2687dbuf_write_nofill_ready(zio_t *zio)
2688{
2689	dbuf_write_ready(zio, NULL, zio->io_private);
2690}
2691
2692static void
2693dbuf_write_nofill_done(zio_t *zio)
2694{
2695	dbuf_write_done(zio, NULL, zio->io_private);
2696}
2697
2698static void
2699dbuf_write_override_ready(zio_t *zio)
2700{
2701	dbuf_dirty_record_t *dr = zio->io_private;
2702	dmu_buf_impl_t *db = dr->dr_dbuf;
2703
2704	dbuf_write_ready(zio, NULL, db);
2705}
2706
2707static void
2708dbuf_write_override_done(zio_t *zio)
2709{
2710	dbuf_dirty_record_t *dr = zio->io_private;
2711	dmu_buf_impl_t *db = dr->dr_dbuf;
2712	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2713
2714	mutex_enter(&db->db_mtx);
2715	if (!BP_EQUAL(zio->io_bp, obp)) {
2716		if (!BP_IS_HOLE(obp))
2717			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2718		arc_release(dr->dt.dl.dr_data, db);
2719	}
2720	mutex_exit(&db->db_mtx);
2721
2722	dbuf_write_done(zio, NULL, db);
2723}
2724
2725/* Issue I/O to commit a dirty buffer to disk. */
2726static void
2727dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2728{
2729	dmu_buf_impl_t *db = dr->dr_dbuf;
2730	dnode_t *dn;
2731	objset_t *os;
2732	dmu_buf_impl_t *parent = db->db_parent;
2733	uint64_t txg = tx->tx_txg;
2734	zbookmark_phys_t zb;
2735	zio_prop_t zp;
2736	zio_t *zio;
2737	int wp_flag = 0;
2738
2739	DB_DNODE_ENTER(db);
2740	dn = DB_DNODE(db);
2741	os = dn->dn_objset;
2742
2743	if (db->db_state != DB_NOFILL) {
2744		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2745			/*
2746			 * Private object buffers are released here rather
2747			 * than in dbuf_dirty() since they are only modified
2748			 * in the syncing context and we don't want the
2749			 * overhead of making multiple copies of the data.
2750			 */
2751			if (BP_IS_HOLE(db->db_blkptr)) {
2752				arc_buf_thaw(data);
2753			} else {
2754				dbuf_release_bp(db);
2755			}
2756		}
2757	}
2758
2759	if (parent != dn->dn_dbuf) {
2760		/* Our parent is an indirect block. */
2761		/* We have a dirty parent that has been scheduled for write. */
2762		ASSERT(parent && parent->db_data_pending);
2763		/* Our parent's buffer is one level closer to the dnode. */
2764		ASSERT(db->db_level == parent->db_level-1);
2765		/*
2766		 * We're about to modify our parent's db_data by modifying
2767		 * our block pointer, so the parent must be released.
2768		 */
2769		ASSERT(arc_released(parent->db_buf));
2770		zio = parent->db_data_pending->dr_zio;
2771	} else {
2772		/* Our parent is the dnode itself. */
2773		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2774		    db->db_blkid != DMU_SPILL_BLKID) ||
2775		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2776		if (db->db_blkid != DMU_SPILL_BLKID)
2777			ASSERT3P(db->db_blkptr, ==,
2778			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
2779		zio = dn->dn_zio;
2780	}
2781
2782	ASSERT(db->db_level == 0 || data == db->db_buf);
2783	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2784	ASSERT(zio);
2785
2786	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2787	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2788	    db->db.db_object, db->db_level, db->db_blkid);
2789
2790	if (db->db_blkid == DMU_SPILL_BLKID)
2791		wp_flag = WP_SPILL;
2792	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2793
2794	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2795	DB_DNODE_EXIT(db);
2796
2797	if (db->db_level == 0 &&
2798	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2799		/*
2800		 * The BP for this block has been provided by open context
2801		 * (by dmu_sync() or dmu_buf_write_embedded()).
2802		 */
2803		void *contents = (data != NULL) ? data->b_data : NULL;
2804
2805		dr->dr_zio = zio_write(zio, os->os_spa, txg,
2806		    db->db_blkptr, contents, db->db.db_size, &zp,
2807		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
2808		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2809		mutex_enter(&db->db_mtx);
2810		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2811		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2812		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2813		mutex_exit(&db->db_mtx);
2814	} else if (db->db_state == DB_NOFILL) {
2815		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
2816		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
2817		dr->dr_zio = zio_write(zio, os->os_spa, txg,
2818		    db->db_blkptr, NULL, db->db.db_size, &zp,
2819		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
2820		    ZIO_PRIORITY_ASYNC_WRITE,
2821		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2822	} else {
2823		ASSERT(arc_released(data));
2824		dr->dr_zio = arc_write(zio, os->os_spa, txg,
2825		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
2826		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
2827		    dbuf_write_physdone, dbuf_write_done, db,
2828		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2829	}
2830}
2831