dbuf.c revision 265740
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 */
28
29#include <sys/zfs_context.h>
30#include <sys/dmu.h>
31#include <sys/dmu_send.h>
32#include <sys/dmu_impl.h>
33#include <sys/dbuf.h>
34#include <sys/dmu_objset.h>
35#include <sys/dsl_dataset.h>
36#include <sys/dsl_dir.h>
37#include <sys/dmu_tx.h>
38#include <sys/spa.h>
39#include <sys/zio.h>
40#include <sys/dmu_zfetch.h>
41#include <sys/sa.h>
42#include <sys/sa_impl.h>
43#include <sys/range_tree.h>
44
45/*
46 * Number of times that dbuf_free_range() took the slow path while doing
47 * a zfs receive.  A nonzero value indicates a potential performance problem.
48 */
49uint64_t zfs_free_range_recv_miss;
50
51static void dbuf_destroy(dmu_buf_impl_t *db);
52static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
53static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
54
55/*
56 * Global data structures and functions for the dbuf cache.
57 */
58static kmem_cache_t *dbuf_cache;
59
60/* ARGSUSED */
61static int
62dbuf_cons(void *vdb, void *unused, int kmflag)
63{
64	dmu_buf_impl_t *db = vdb;
65	bzero(db, sizeof (dmu_buf_impl_t));
66
67	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
68	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
69	refcount_create(&db->db_holds);
70	return (0);
71}
72
73/* ARGSUSED */
74static void
75dbuf_dest(void *vdb, void *unused)
76{
77	dmu_buf_impl_t *db = vdb;
78	mutex_destroy(&db->db_mtx);
79	cv_destroy(&db->db_changed);
80	refcount_destroy(&db->db_holds);
81}
82
83/*
84 * dbuf hash table routines
85 */
86static dbuf_hash_table_t dbuf_hash_table;
87
88static uint64_t dbuf_hash_count;
89
90static uint64_t
91dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
92{
93	uintptr_t osv = (uintptr_t)os;
94	uint64_t crc = -1ULL;
95
96	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
97	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
98	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
99	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
100	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
101	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
102	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
103
104	crc ^= (osv >> 14) ^ (obj >> 16) ^ (blkid >> 16);
105
106	return (crc);
107}
108
109#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
110
111#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
112	((dbuf)->db.db_object == (obj) &&		\
113	(dbuf)->db_objset == (os) &&			\
114	(dbuf)->db_level == (level) &&			\
115	(dbuf)->db_blkid == (blkid))
116
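/*
 * Illustrative sketch (an example, not additional code) of how the
 * hash and the comparator combine; this mirrors dbuf_find() below.
 * Hash the (objset, object, level, blkid) tuple, mask down to a
 * bucket, then walk that bucket's chain under its mutex:
 *
 *	dbuf_hash_table_t *h = &dbuf_hash_table;
 *	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
 *	uint64_t idx = hv & h->hash_table_mask;
 *
 *	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 *	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
 *		if (DBUF_EQUAL(db, os, obj, level, blkid))
 *			break;
 *	}
 *	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 */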
117dmu_buf_impl_t *
118dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
119{
120	dbuf_hash_table_t *h = &dbuf_hash_table;
121	objset_t *os = dn->dn_objset;
122	uint64_t obj = dn->dn_object;
123	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
124	uint64_t idx = hv & h->hash_table_mask;
125	dmu_buf_impl_t *db;
126
127	mutex_enter(DBUF_HASH_MUTEX(h, idx));
128	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
129		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
130			mutex_enter(&db->db_mtx);
131			if (db->db_state != DB_EVICTING) {
132				mutex_exit(DBUF_HASH_MUTEX(h, idx));
133				return (db);
134			}
135			mutex_exit(&db->db_mtx);
136		}
137	}
138	mutex_exit(DBUF_HASH_MUTEX(h, idx));
139	return (NULL);
140}
141
142/*
143 * Insert an entry into the hash table.  If there is already an element
144 * equal to elem in the hash table, then the already existing element
145 * will be returned and the new element will not be inserted.
146 * Otherwise returns NULL.
147 */
148static dmu_buf_impl_t *
149dbuf_hash_insert(dmu_buf_impl_t *db)
150{
151	dbuf_hash_table_t *h = &dbuf_hash_table;
152	objset_t *os = db->db_objset;
153	uint64_t obj = db->db.db_object;
154	int level = db->db_level;
155	uint64_t blkid = db->db_blkid;
156	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
157	uint64_t idx = hv & h->hash_table_mask;
158	dmu_buf_impl_t *dbf;
159
160	mutex_enter(DBUF_HASH_MUTEX(h, idx));
161	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
162		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
163			mutex_enter(&dbf->db_mtx);
164			if (dbf->db_state != DB_EVICTING) {
165				mutex_exit(DBUF_HASH_MUTEX(h, idx));
166				return (dbf);
167			}
168			mutex_exit(&dbf->db_mtx);
169		}
170	}
171
172	mutex_enter(&db->db_mtx);
173	db->db_hash_next = h->hash_table[idx];
174	h->hash_table[idx] = db;
175	mutex_exit(DBUF_HASH_MUTEX(h, idx));
176	atomic_add_64(&dbuf_hash_count, 1);
177
178	return (NULL);
179}
180
181/*
182 * Remove an entry from the hash table.  This operation will
183 * fail if there are any existing holds on the db.
184 */
185static void
186dbuf_hash_remove(dmu_buf_impl_t *db)
187{
188	dbuf_hash_table_t *h = &dbuf_hash_table;
189	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
190	    db->db_level, db->db_blkid);
191	uint64_t idx = hv & h->hash_table_mask;
192	dmu_buf_impl_t *dbf, **dbp;
193
194	/*
195	 * We mustn't hold db_mtx to maintain lock ordering:
196	 * DBUF_HASH_MUTEX > db_mtx.
197	 */
198	ASSERT(refcount_is_zero(&db->db_holds));
199	ASSERT(db->db_state == DB_EVICTING);
200	ASSERT(!MUTEX_HELD(&db->db_mtx));
201
202	mutex_enter(DBUF_HASH_MUTEX(h, idx));
203	dbp = &h->hash_table[idx];
204	while ((dbf = *dbp) != db) {
205		dbp = &dbf->db_hash_next;
206		ASSERT(dbf != NULL);
207	}
208	*dbp = db->db_hash_next;
209	db->db_hash_next = NULL;
210	mutex_exit(DBUF_HASH_MUTEX(h, idx));
211	atomic_add_64(&dbuf_hash_count, -1);
212}
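
/*
 * Lock-ordering sketch (illustrative only): the hash chain mutex is
 * always taken before db_mtx, never after it, as dbuf_find() above
 * demonstrates:
 *
 *	mutex_enter(DBUF_HASH_MUTEX(h, idx));	chain lock first
 *	mutex_enter(&db->db_mtx);		then the dbuf's lock
 *	mutex_exit(DBUF_HASH_MUTEX(h, idx));	chain lock may drop early
 *
 * Taking DBUF_HASH_MUTEX while holding db_mtx would invert that order
 * and could deadlock against a concurrent dbuf_find(), which is why
 * dbuf_hash_remove() asserts that db_mtx is not held on entry.
 */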
213
214static arc_evict_func_t dbuf_do_evict;
215
216static void
217dbuf_evict_user(dmu_buf_impl_t *db)
218{
219	ASSERT(MUTEX_HELD(&db->db_mtx));
220
221	if (db->db_level != 0 || db->db_evict_func == NULL)
222		return;
223
224	if (db->db_user_data_ptr_ptr)
225		*db->db_user_data_ptr_ptr = db->db.db_data;
226	db->db_evict_func(&db->db, db->db_user_ptr);
227	db->db_user_ptr = NULL;
228	db->db_user_data_ptr_ptr = NULL;
229	db->db_evict_func = NULL;
230}
231
232boolean_t
233dbuf_is_metadata(dmu_buf_impl_t *db)
234{
235	if (db->db_level > 0) {
236		return (B_TRUE);
237	} else {
238		boolean_t is_metadata;
239
240		DB_DNODE_ENTER(db);
241		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
242		DB_DNODE_EXIT(db);
243
244		return (is_metadata);
245	}
246}
247
248void
249dbuf_evict(dmu_buf_impl_t *db)
250{
251	ASSERT(MUTEX_HELD(&db->db_mtx));
252	ASSERT(db->db_buf == NULL);
253	ASSERT(db->db_data_pending == NULL);
254
255	dbuf_clear(db);
256	dbuf_destroy(db);
257}
258
259void
260dbuf_init(void)
261{
262	uint64_t hsize = 1ULL << 16;
263	dbuf_hash_table_t *h = &dbuf_hash_table;
264	int i;
265
266	/*
267	 * The hash table is big enough to fill all of physical memory
268	 * with an average 4K block size.  The table will take up
269	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
270	 */
271	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
272		hsize <<= 1;
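
	/*
	 * Worked example of the sizing above (assuming 4K pages and
	 * 8-byte pointers): with 8 GiB of physical memory the loop
	 * stops at hsize = 8 GiB / 4 KiB = 2^21 buckets, and the
	 * bucket array consumes 2^21 * sizeof (void *) = 16 MiB,
	 * which is the 2MB/GB figure quoted above.
	 */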
273
274retry:
275	h->hash_table_mask = hsize - 1;
276	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
277	if (h->hash_table == NULL) {
278		/* XXX - we should really return an error instead of assert */
279		ASSERT(hsize > (1ULL << 10));
280		hsize >>= 1;
281		goto retry;
282	}
283
284	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
285	    sizeof (dmu_buf_impl_t),
286	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
287
288	for (i = 0; i < DBUF_MUTEXES; i++)
289		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
290}
291
292void
293dbuf_fini(void)
294{
295	dbuf_hash_table_t *h = &dbuf_hash_table;
296	int i;
297
298	for (i = 0; i < DBUF_MUTEXES; i++)
299		mutex_destroy(&h->hash_mutexes[i]);
300	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
301	kmem_cache_destroy(dbuf_cache);
302}
303
304/*
305 * Other stuff.
306 */
307
308#ifdef ZFS_DEBUG
309static void
310dbuf_verify(dmu_buf_impl_t *db)
311{
312	dnode_t *dn;
313	dbuf_dirty_record_t *dr;
314
315	ASSERT(MUTEX_HELD(&db->db_mtx));
316
317	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
318		return;
319
320	ASSERT(db->db_objset != NULL);
321	DB_DNODE_ENTER(db);
322	dn = DB_DNODE(db);
323	if (dn == NULL) {
324		ASSERT(db->db_parent == NULL);
325		ASSERT(db->db_blkptr == NULL);
326	} else {
327		ASSERT3U(db->db.db_object, ==, dn->dn_object);
328		ASSERT3P(db->db_objset, ==, dn->dn_objset);
329		ASSERT3U(db->db_level, <, dn->dn_nlevels);
330		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
331		    db->db_blkid == DMU_SPILL_BLKID ||
332		    !list_is_empty(&dn->dn_dbufs));
333	}
334	if (db->db_blkid == DMU_BONUS_BLKID) {
335		ASSERT(dn != NULL);
336		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
337		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
338	} else if (db->db_blkid == DMU_SPILL_BLKID) {
339		ASSERT(dn != NULL);
340		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
341		ASSERT0(db->db.db_offset);
342	} else {
343		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
344	}
345
346	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
347		ASSERT(dr->dr_dbuf == db);
348
349	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
350		ASSERT(dr->dr_dbuf == db);
351
352	/*
353	 * We can't assert that db_size matches dn_datablksz because it
354	 * can be momentarily different when another thread is doing
355	 * dnode_set_blksz().
356	 */
357	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
358		dr = db->db_data_pending;
359		/*
360		 * It should only be modified in syncing context, so
361		 * make sure we only have one copy of the data.
362		 */
363		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
364	}
365
366	/* verify db->db_blkptr */
367	if (db->db_blkptr) {
368		if (db->db_parent == dn->dn_dbuf) {
369			/* db is pointed to by the dnode */
370			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
371			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
372				ASSERT(db->db_parent == NULL);
373			else
374				ASSERT(db->db_parent != NULL);
375			if (db->db_blkid != DMU_SPILL_BLKID)
376				ASSERT3P(db->db_blkptr, ==,
377				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
378		} else {
379			/* db is pointed to by an indirect block */
380			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
381			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
382			ASSERT3U(db->db_parent->db.db_object, ==,
383			    db->db.db_object);
384			/*
385			 * dnode_grow_indblksz() can make this fail if we don't
386			 * have the struct_rwlock.  XXX indblksz no longer
387			 * grows.  safe to do this now?
388			 */
389			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
390				ASSERT3P(db->db_blkptr, ==,
391				    ((blkptr_t *)db->db_parent->db.db_data +
392				    db->db_blkid % epb));
393			}
394		}
395	}
396	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
397	    (db->db_buf == NULL || db->db_buf->b_data) &&
398	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
399	    db->db_state != DB_FILL && !dn->dn_free_txg) {
400		/*
401		 * If the blkptr isn't set but they have nonzero data,
402		 * it had better be dirty, otherwise we'll lose that
403		 * data when we evict this buffer.
404		 */
405		if (db->db_dirtycnt == 0) {
406			uint64_t *buf = db->db.db_data;
407			int i;
408
409			for (i = 0; i < db->db.db_size >> 3; i++) {
410				ASSERT(buf[i] == 0);
411			}
412		}
413	}
414	DB_DNODE_EXIT(db);
415}
416#endif
417
418static void
419dbuf_update_data(dmu_buf_impl_t *db)
420{
421	ASSERT(MUTEX_HELD(&db->db_mtx));
422	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
423		ASSERT(!refcount_is_zero(&db->db_holds));
424		*db->db_user_data_ptr_ptr = db->db.db_data;
425	}
426}
427
428static void
429dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
430{
431	ASSERT(MUTEX_HELD(&db->db_mtx));
432	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
433	db->db_buf = buf;
434	if (buf != NULL) {
435		ASSERT(buf->b_data != NULL);
436		db->db.db_data = buf->b_data;
437		if (!arc_released(buf))
438			arc_set_callback(buf, dbuf_do_evict, db);
439		dbuf_update_data(db);
440	} else {
441		dbuf_evict_user(db);
442		db->db.db_data = NULL;
443		if (db->db_state != DB_NOFILL)
444			db->db_state = DB_UNCACHED;
445	}
446}
447
448/*
449 * Loan out an arc_buf for read.  Return the loaned arc_buf.
450 */
451arc_buf_t *
452dbuf_loan_arcbuf(dmu_buf_impl_t *db)
453{
454	arc_buf_t *abuf;
455
456	mutex_enter(&db->db_mtx);
457	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
458		int blksz = db->db.db_size;
459		spa_t *spa = db->db_objset->os_spa;
460
461		mutex_exit(&db->db_mtx);
462		abuf = arc_loan_buf(spa, blksz);
463		bcopy(db->db.db_data, abuf->b_data, blksz);
464	} else {
465		abuf = db->db_buf;
466		arc_loan_inuse_buf(abuf, db);
467		dbuf_set_data(db, NULL);
468		mutex_exit(&db->db_mtx);
469	}
470	return (abuf);
471}
472
473uint64_t
474dbuf_whichblock(dnode_t *dn, uint64_t offset)
475{
476	if (dn->dn_datablkshift) {
477		return (offset >> dn->dn_datablkshift);
478	} else {
479		ASSERT3U(offset, <, dn->dn_datablksz);
480		return (0);
481	}
482}
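
/*
 * Worked example for dbuf_whichblock() above: a dnode with a 128K
 * block size has dn_datablkshift == 17, so offset 300000 falls in
 * block 300000 >> 17 == 2 (blocks cover [0, 128K), [128K, 256K),
 * [256K, 384K), ...).  An object whose single block is not a
 * power-of-two size has dn_datablkshift == 0 and always resolves to
 * block 0.
 */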
483
484static void
485dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
486{
487	dmu_buf_impl_t *db = vdb;
488
489	mutex_enter(&db->db_mtx);
490	ASSERT3U(db->db_state, ==, DB_READ);
491	/*
492	 * All reads are synchronous, so we must have a hold on the dbuf
493	 */
494	ASSERT(refcount_count(&db->db_holds) > 0);
495	ASSERT(db->db_buf == NULL);
496	ASSERT(db->db.db_data == NULL);
497	if (db->db_level == 0 && db->db_freed_in_flight) {
498		/* we were freed in flight; disregard any error */
499		arc_release(buf, db);
500		bzero(buf->b_data, db->db.db_size);
501		arc_buf_freeze(buf);
502		db->db_freed_in_flight = FALSE;
503		dbuf_set_data(db, buf);
504		db->db_state = DB_CACHED;
505	} else if (zio == NULL || zio->io_error == 0) {
506		dbuf_set_data(db, buf);
507		db->db_state = DB_CACHED;
508	} else {
509		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
510		ASSERT3P(db->db_buf, ==, NULL);
511		VERIFY(arc_buf_remove_ref(buf, db));
512		db->db_state = DB_UNCACHED;
513	}
514	cv_broadcast(&db->db_changed);
515	dbuf_rele_and_unlock(db, NULL);
516}
517
518static void
519dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
520{
521	dnode_t *dn;
522	zbookmark_t zb;
523	uint32_t aflags = ARC_NOWAIT;
524
525	DB_DNODE_ENTER(db);
526	dn = DB_DNODE(db);
527	ASSERT(!refcount_is_zero(&db->db_holds));
528	/* We need the struct_rwlock to prevent db_blkptr from changing. */
529	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
530	ASSERT(MUTEX_HELD(&db->db_mtx));
531	ASSERT(db->db_state == DB_UNCACHED);
532	ASSERT(db->db_buf == NULL);
533
534	if (db->db_blkid == DMU_BONUS_BLKID) {
535		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
536
537		ASSERT3U(bonuslen, <=, db->db.db_size);
538		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
539		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
540		if (bonuslen < DN_MAX_BONUSLEN)
541			bzero(db->db.db_data, DN_MAX_BONUSLEN);
542		if (bonuslen)
543			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
544		DB_DNODE_EXIT(db);
545		dbuf_update_data(db);
546		db->db_state = DB_CACHED;
547		mutex_exit(&db->db_mtx);
548		return;
549	}
550
551	/*
552	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
553	 * processes the delete record and clears the bp while we are waiting
554	 * for the dn_mtx (resulting in a "no" from block_freed).
555	 */
556	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
557	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
558	    BP_IS_HOLE(db->db_blkptr)))) {
559		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
560
561		DB_DNODE_EXIT(db);
562		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
563		    db->db.db_size, db, type));
564		bzero(db->db.db_data, db->db.db_size);
565		db->db_state = DB_CACHED;
566		*flags |= DB_RF_CACHED;
567		mutex_exit(&db->db_mtx);
568		return;
569	}
570
571	DB_DNODE_EXIT(db);
572
573	db->db_state = DB_READ;
574	mutex_exit(&db->db_mtx);
575
576	if (DBUF_IS_L2CACHEABLE(db))
577		aflags |= ARC_L2CACHE;
578	if (DBUF_IS_L2COMPRESSIBLE(db))
579		aflags |= ARC_L2COMPRESS;
580
581	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
582	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
583	    db->db.db_object, db->db_level, db->db_blkid);
584
585	dbuf_add_ref(db, NULL);
586
587	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
588	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
589	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
590	    &aflags, &zb);
591	if (aflags & ARC_CACHED)
592		*flags |= DB_RF_CACHED;
593}
594
595int
596dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
597{
598	int err = 0;
599	boolean_t havepzio = (zio != NULL);
600	boolean_t prefetch;
601	dnode_t *dn;
602
603	/*
604	 * We don't have to hold the mutex to check db_state because it
605	 * can't be freed while we have a hold on the buffer.
606	 */
607	ASSERT(!refcount_is_zero(&db->db_holds));
608
609	if (db->db_state == DB_NOFILL)
610		return (SET_ERROR(EIO));
611
612	DB_DNODE_ENTER(db);
613	dn = DB_DNODE(db);
614	if ((flags & DB_RF_HAVESTRUCT) == 0)
615		rw_enter(&dn->dn_struct_rwlock, RW_READER);
616
617	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
618	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
619	    DBUF_IS_CACHEABLE(db);
620
621	mutex_enter(&db->db_mtx);
622	if (db->db_state == DB_CACHED) {
623		mutex_exit(&db->db_mtx);
624		if (prefetch)
625			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
626			    db->db.db_size, TRUE);
627		if ((flags & DB_RF_HAVESTRUCT) == 0)
628			rw_exit(&dn->dn_struct_rwlock);
629		DB_DNODE_EXIT(db);
630	} else if (db->db_state == DB_UNCACHED) {
631		spa_t *spa = dn->dn_objset->os_spa;
632
633		if (zio == NULL)
634			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
635		dbuf_read_impl(db, zio, &flags);
636
637		/* dbuf_read_impl has dropped db_mtx for us */
638
639		if (prefetch)
640			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
641			    db->db.db_size, flags & DB_RF_CACHED);
642
643		if ((flags & DB_RF_HAVESTRUCT) == 0)
644			rw_exit(&dn->dn_struct_rwlock);
645		DB_DNODE_EXIT(db);
646
647		if (!havepzio)
648			err = zio_wait(zio);
649	} else {
650		/*
651		 * Another reader came in while the dbuf was in flight
652		 * between UNCACHED and CACHED.  Either a writer will finish
653		 * writing the buffer (sending the dbuf to CACHED) or the
654		 * first reader's request will reach the read_done callback
655		 * and send the dbuf to CACHED.  Otherwise, a failure
656		 * occurred and the dbuf went to UNCACHED.
657		 */
658		mutex_exit(&db->db_mtx);
659		if (prefetch)
660			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
661			    db->db.db_size, TRUE);
662		if ((flags & DB_RF_HAVESTRUCT) == 0)
663			rw_exit(&dn->dn_struct_rwlock);
664		DB_DNODE_EXIT(db);
665
666		/* Skip the wait per the caller's request. */
667		mutex_enter(&db->db_mtx);
668		if ((flags & DB_RF_NEVERWAIT) == 0) {
669			while (db->db_state == DB_READ ||
670			    db->db_state == DB_FILL) {
671				ASSERT(db->db_state == DB_READ ||
672				    (flags & DB_RF_HAVESTRUCT) == 0);
673				cv_wait(&db->db_changed, &db->db_mtx);
674			}
675			if (db->db_state == DB_UNCACHED)
676				err = SET_ERROR(EIO);
677		}
678		mutex_exit(&db->db_mtx);
679	}
680
681	ASSERT(err || havepzio || db->db_state == DB_CACHED);
682	return (err);
683}
684
685static void
686dbuf_noread(dmu_buf_impl_t *db)
687{
688	ASSERT(!refcount_is_zero(&db->db_holds));
689	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
690	mutex_enter(&db->db_mtx);
691	while (db->db_state == DB_READ || db->db_state == DB_FILL)
692		cv_wait(&db->db_changed, &db->db_mtx);
693	if (db->db_state == DB_UNCACHED) {
694		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
695		spa_t *spa = db->db_objset->os_spa;
696
697		ASSERT(db->db_buf == NULL);
698		ASSERT(db->db.db_data == NULL);
699		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
700		db->db_state = DB_FILL;
701	} else if (db->db_state == DB_NOFILL) {
702		dbuf_set_data(db, NULL);
703	} else {
704		ASSERT3U(db->db_state, ==, DB_CACHED);
705	}
706	mutex_exit(&db->db_mtx);
707}
708
709/*
710 * This is our just-in-time copy function.  It makes a copy of
711 * buffers that have been modified in a previous transaction
712 * group, before we modify them in the current active group.
713 *
714 * This function is used in two places: when we are dirtying a
715 * buffer for the first time in a txg, and when we are freeing
716 * a range in a dnode that includes this buffer.
717 *
718 * Note that when we are called from dbuf_free_range() we do
719 * not put a hold on the buffer, we just traverse the active
720 * dbuf list for the dnode.
721 */
722static void
723dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
724{
725	dbuf_dirty_record_t *dr = db->db_last_dirty;
726
727	ASSERT(MUTEX_HELD(&db->db_mtx));
728	ASSERT(db->db.db_data != NULL);
729	ASSERT(db->db_level == 0);
730	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
731
732	if (dr == NULL ||
733	    (dr->dt.dl.dr_data !=
734	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
735		return;
736
737	/*
738	 * If the last dirty record for this dbuf has not yet synced
739	 * and it's referencing the dbuf data, either:
740	 *	reset the reference to point to a new copy,
741	 * or (if there are no active holders)
742	 *	just null out the current db_data pointer.
743	 */
744	ASSERT(dr->dr_txg >= txg - 2);
745	if (db->db_blkid == DMU_BONUS_BLKID) {
746		/* Note that the data bufs here are zio_bufs */
747		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
748		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
749		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
750	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
751		int size = db->db.db_size;
752		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
753		spa_t *spa = db->db_objset->os_spa;
754
755		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
756		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
757	} else {
758		dbuf_set_data(db, NULL);
759	}
760}
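
/*
 * Condensed decision sketch for dbuf_fix_old_data() above
 * (illustrative only, not additional code):
 *
 *	if (last dirty record is synced, or already has its own data)
 *		nothing to do;
 *	else if (db->db_blkid == DMU_BONUS_BLKID)
 *		give the dirty record a private zio_buf copy;
 *	else if (refcount_count(&db->db_holds) > db->db_dirtycnt)
 *		give it a private arc_buf copy, since active holders
 *		may still be reading the current data;
 *	else
 *		dbuf_set_data(db, NULL), leaving the dirty record as
 *		the sole owner of the old buffer.
 */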
761
762void
763dbuf_unoverride(dbuf_dirty_record_t *dr)
764{
765	dmu_buf_impl_t *db = dr->dr_dbuf;
766	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
767	uint64_t txg = dr->dr_txg;
768
769	ASSERT(MUTEX_HELD(&db->db_mtx));
770	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
771	ASSERT(db->db_level == 0);
772
773	if (db->db_blkid == DMU_BONUS_BLKID ||
774	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
775		return;
776
777	ASSERT(db->db_data_pending != dr);
778
779	/* free this block */
780	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
781		zio_free(db->db_objset->os_spa, txg, bp);
782
783	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
784	dr->dt.dl.dr_nopwrite = B_FALSE;
785
786	/*
787	 * Release the already-written buffer, so we leave it in
788	 * a consistent dirty state.  Note that all callers are
789	 * modifying the buffer, so they will immediately do
790	 * another (redundant) arc_release().  Therefore, leave
791	 * the buf thawed to save the effort of freezing &
792	 * immediately re-thawing it.
793	 */
794	arc_release(dr->dt.dl.dr_data, db);
795}
796
797/*
798 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
799 * data blocks in the free range, so that any future readers will find
800 * empty blocks.
801 *
802 * This is a no-op if the dataset is in the middle of an incremental
803 * receive; see comment below for details.
804 */
805void
806dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
807{
808	dmu_buf_impl_t *db, *db_next;
809	uint64_t txg = tx->tx_txg;
810
811	if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID))
812		end = dn->dn_maxblkid;
813	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
814
815	mutex_enter(&dn->dn_dbufs_mtx);
816	if (start >= dn->dn_unlisted_l0_blkid) {
817		/* There can't be any dbufs in this range; no need to search. */
818		mutex_exit(&dn->dn_dbufs_mtx);
819		return;
820	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
821		/*
822		 * If we are receiving, we expect there to be no dbufs in
823		 * the range to be freed, because receive modifies each
824		 * block at most once, and in offset order.  If this is
825		 * not the case, it can lead to performance problems,
826		 * so note that we unexpectedly took the slow path.
827		 */
828		atomic_inc_64(&zfs_free_range_recv_miss);
829	}
830
831	for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
832		db_next = list_next(&dn->dn_dbufs, db);
833		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
834
835		if (db->db_level != 0)
836			continue;
837		if (db->db_blkid < start || db->db_blkid > end)
838			continue;
839
840		/* found a level 0 buffer in the range */
841		mutex_enter(&db->db_mtx);
842		if (dbuf_undirty(db, tx)) {
843			/* mutex has been dropped and dbuf destroyed */
844			continue;
845		}
846
847		if (db->db_state == DB_UNCACHED ||
848		    db->db_state == DB_NOFILL ||
849		    db->db_state == DB_EVICTING) {
850			ASSERT(db->db.db_data == NULL);
851			mutex_exit(&db->db_mtx);
852			continue;
853		}
854		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
855			/* will be handled in dbuf_read_done or dbuf_rele */
856			db->db_freed_in_flight = TRUE;
857			mutex_exit(&db->db_mtx);
858			continue;
859		}
860		if (refcount_count(&db->db_holds) == 0) {
861			ASSERT(db->db_buf);
862			dbuf_clear(db);
863			continue;
864		}
865		/* The dbuf is referenced */
866
867		if (db->db_last_dirty != NULL) {
868			dbuf_dirty_record_t *dr = db->db_last_dirty;
869
870			if (dr->dr_txg == txg) {
871				/*
872				 * This buffer is "in-use", re-adjust the file
873				 * size to reflect that this buffer may
874				 * contain new data when we sync.
875				 */
876				if (db->db_blkid != DMU_SPILL_BLKID &&
877				    db->db_blkid > dn->dn_maxblkid)
878					dn->dn_maxblkid = db->db_blkid;
879				dbuf_unoverride(dr);
880			} else {
881				/*
882				 * This dbuf is not dirty in the open context.
883				 * Either uncache it (if it's not referenced in
884				 * the open context) or reset its contents to
885				 * empty.
886				 */
887				dbuf_fix_old_data(db, txg);
888			}
889		}
890		/* clear the contents if it's cached */
891		if (db->db_state == DB_CACHED) {
892			ASSERT(db->db.db_data != NULL);
893			arc_release(db->db_buf, db);
894			bzero(db->db.db_data, db->db.db_size);
895			arc_buf_freeze(db->db_buf);
896		}
897
898		mutex_exit(&db->db_mtx);
899	}
900	mutex_exit(&dn->dn_dbufs_mtx);
901}
902
903static int
904dbuf_block_freeable(dmu_buf_impl_t *db)
905{
906	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
907	uint64_t birth_txg = 0;
908
909	/*
910	 * We don't need any locking to protect db_blkptr:
911	 * If it's syncing, then db_last_dirty will be set
912	 * so we'll ignore db_blkptr.
913	 *
914	 * This logic ensures that only block births for
915	 * filled blocks are considered.
916	 */
917	ASSERT(MUTEX_HELD(&db->db_mtx));
918	if (db->db_last_dirty && (db->db_blkptr == NULL ||
919	    !BP_IS_HOLE(db->db_blkptr))) {
920		birth_txg = db->db_last_dirty->dr_txg;
921	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
922		birth_txg = db->db_blkptr->blk_birth;
923	}
924
925	/*
926	 * If this block doesn't exist or is in a snapshot, it can't be freed.
927	 * Don't pass the bp to dsl_dataset_block_freeable() since we
928	 * are holding the db_mtx lock and might deadlock if we are
929	 * prefetching a dedup-ed block.
930	 */
931	if (birth_txg != 0)
932		return (ds == NULL ||
933		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
934	else
935		return (B_FALSE);
936}
937
938void
939dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
940{
941	arc_buf_t *buf, *obuf;
942	int osize = db->db.db_size;
943	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
944	dnode_t *dn;
945
946	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
947
948	DB_DNODE_ENTER(db);
949	dn = DB_DNODE(db);
950
951	/* XXX does *this* func really need the lock? */
952	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
953
954	/*
955	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
956	 * is OK, because there can be no other references to the db
957	 * when we are changing its size, so no concurrent DB_FILL can
958	 * be happening.
959	 */
960	/*
961	 * XXX we should be doing a dbuf_read, checking the return
962	 * value and returning that up to our callers
963	 */
964	dmu_buf_will_dirty(&db->db, tx);
965
966	/* create the data buffer for the new block */
967	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
968
969	/* copy old block data to the new block */
970	obuf = db->db_buf;
971	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
972	/* zero the remainder */
973	if (size > osize)
974		bzero((uint8_t *)buf->b_data + osize, size - osize);
975
976	mutex_enter(&db->db_mtx);
977	dbuf_set_data(db, buf);
978	VERIFY(arc_buf_remove_ref(obuf, db));
979	db->db.db_size = size;
980
981	if (db->db_level == 0) {
982		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
983		db->db_last_dirty->dt.dl.dr_data = buf;
984	}
985	mutex_exit(&db->db_mtx);
986
987	dnode_willuse_space(dn, size-osize, tx);
988	DB_DNODE_EXIT(db);
989}
990
991void
992dbuf_release_bp(dmu_buf_impl_t *db)
993{
994	objset_t *os = db->db_objset;
995
996	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
997	ASSERT(arc_released(os->os_phys_buf) ||
998	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
999	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1000
1001	(void) arc_release(db->db_buf, db);
1002}
1003
1004dbuf_dirty_record_t *
1005dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1006{
1007	dnode_t *dn;
1008	objset_t *os;
1009	dbuf_dirty_record_t **drp, *dr;
1010	int drop_struct_lock = FALSE;
1011	boolean_t do_free_accounting = B_FALSE;
1012	int txgoff = tx->tx_txg & TXG_MASK;
1013
1014	ASSERT(tx->tx_txg != 0);
1015	ASSERT(!refcount_is_zero(&db->db_holds));
1016	DMU_TX_DIRTY_BUF(tx, db);
1017
1018	DB_DNODE_ENTER(db);
1019	dn = DB_DNODE(db);
1020	/*
1021	 * Shouldn't dirty a regular buffer in syncing context.  Private
1022	 * objects may be dirtied in syncing context, but only if they
1023	 * were already pre-dirtied in open context.
1024	 */
1025	ASSERT(!dmu_tx_is_syncing(tx) ||
1026	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1027	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1028	    dn->dn_objset->os_dsl_dataset == NULL);
1029	/*
1030	 * We make this assert for private objects as well, but after we
1031	 * check if we're already dirty.  They are allowed to re-dirty
1032	 * in syncing context.
1033	 */
1034	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1035	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1036	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1037
1038	mutex_enter(&db->db_mtx);
1039	/*
1040	 * XXX make this true for indirects too?  The problem is that
1041	 * transactions created with dmu_tx_create_assigned() from
1042	 * syncing context don't bother holding ahead.
1043	 */
1044	ASSERT(db->db_level != 0 ||
1045	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1046	    db->db_state == DB_NOFILL);
1047
1048	mutex_enter(&dn->dn_mtx);
1049	/*
1050	 * Don't set dirtyctx to SYNC if we're just modifying this as we
1051	 * initialize the objset.
1052	 */
1053	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1054	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1055		dn->dn_dirtyctx =
1056		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1057		ASSERT(dn->dn_dirtyctx_firstset == NULL);
1058		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1059	}
1060	mutex_exit(&dn->dn_mtx);
1061
1062	if (db->db_blkid == DMU_SPILL_BLKID)
1063		dn->dn_have_spill = B_TRUE;
1064
1065	/*
1066	 * If this buffer is already dirty, we're done.
1067	 */
1068	drp = &db->db_last_dirty;
1069	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1070	    db->db.db_object == DMU_META_DNODE_OBJECT);
1071	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1072		drp = &dr->dr_next;
1073	if (dr && dr->dr_txg == tx->tx_txg) {
1074		DB_DNODE_EXIT(db);
1075
1076		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1077			/*
1078			 * If this buffer has already been written out,
1079			 * we now need to reset its state.
1080			 */
1081			dbuf_unoverride(dr);
1082			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1083			    db->db_state != DB_NOFILL)
1084				arc_buf_thaw(db->db_buf);
1085		}
1086		mutex_exit(&db->db_mtx);
1087		return (dr);
1088	}
1089
1090	/*
1091	 * Only valid if not already dirty.
1092	 */
1093	ASSERT(dn->dn_object == 0 ||
1094	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1095	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1096
1097	ASSERT3U(dn->dn_nlevels, >, db->db_level);
1098	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1099	    dn->dn_phys->dn_nlevels > db->db_level ||
1100	    dn->dn_next_nlevels[txgoff] > db->db_level ||
1101	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1102	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1103
1104	/*
1105	 * We should only be dirtying in syncing context if it's the
1106	 * mos or we're initializing the os or it's a special object.
1107	 * However, we are allowed to dirty in syncing context provided
1108	 * we already dirtied it in open context.  Hence we must make
1109	 * this assertion only if we're not already dirty.
1110	 */
1111	os = dn->dn_objset;
1112	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1113	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1114	ASSERT(db->db.db_size != 0);
1115
1116	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1117
1118	if (db->db_blkid != DMU_BONUS_BLKID) {
1119		/*
1120		 * Update the accounting.
1121		 * Note: we delay "free accounting" until after we drop
1122		 * the db_mtx.  This keeps us from grabbing other locks
1123		 * (and possibly deadlocking) in bp_get_dsize() while
1124		 * also holding the db_mtx.
1125		 */
1126		dnode_willuse_space(dn, db->db.db_size, tx);
1127		do_free_accounting = dbuf_block_freeable(db);
1128	}
1129
1130	/*
1131	 * If this buffer is dirty in an old transaction group we need
1132	 * to make a copy of it so that the changes we make in this
1133	 * transaction group won't leak out when we sync the older txg.
1134	 */
1135	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1136	if (db->db_level == 0) {
1137		void *data_old = db->db_buf;
1138
1139		if (db->db_state != DB_NOFILL) {
1140			if (db->db_blkid == DMU_BONUS_BLKID) {
1141				dbuf_fix_old_data(db, tx->tx_txg);
1142				data_old = db->db.db_data;
1143			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1144				/*
1145				 * Release the data buffer from the cache so
1146				 * that we can modify it without impacting
1147				 * possible other users of this cached data
1148				 * block.  Note that indirect blocks and
1149				 * private objects are not released until the
1150				 * syncing state (since they are only modified
1151				 * then).
1152				 */
1153				arc_release(db->db_buf, db);
1154				dbuf_fix_old_data(db, tx->tx_txg);
1155				data_old = db->db_buf;
1156			}
1157			ASSERT(data_old != NULL);
1158		}
1159		dr->dt.dl.dr_data = data_old;
1160	} else {
1161		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1162		list_create(&dr->dt.di.dr_children,
1163		    sizeof (dbuf_dirty_record_t),
1164		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
1165	}
1166	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1167		dr->dr_accounted = db->db.db_size;
1168	dr->dr_dbuf = db;
1169	dr->dr_txg = tx->tx_txg;
1170	dr->dr_next = *drp;
1171	*drp = dr;
1172
1173	/*
1174	 * We could have been freed_in_flight between the dbuf_noread
1175	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
1176	 * happened after the free.
1177	 */
1178	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1179	    db->db_blkid != DMU_SPILL_BLKID) {
1180		mutex_enter(&dn->dn_mtx);
1181		if (dn->dn_free_ranges[txgoff] != NULL) {
1182			range_tree_clear(dn->dn_free_ranges[txgoff],
1183			    db->db_blkid, 1);
1184		}
1185		mutex_exit(&dn->dn_mtx);
1186		db->db_freed_in_flight = FALSE;
1187	}
1188
1189	/*
1190	 * This buffer is now part of this txg
1191	 */
1192	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1193	db->db_dirtycnt += 1;
1194	ASSERT3U(db->db_dirtycnt, <=, 3);
1195
1196	mutex_exit(&db->db_mtx);
1197
1198	if (db->db_blkid == DMU_BONUS_BLKID ||
1199	    db->db_blkid == DMU_SPILL_BLKID) {
1200		mutex_enter(&dn->dn_mtx);
1201		ASSERT(!list_link_active(&dr->dr_dirty_node));
1202		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1203		mutex_exit(&dn->dn_mtx);
1204		dnode_setdirty(dn, tx);
1205		DB_DNODE_EXIT(db);
1206		return (dr);
1207	} else if (do_free_accounting) {
1208		blkptr_t *bp = db->db_blkptr;
1209		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1210		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1211		/*
1212		 * This is only a guess -- if the dbuf is dirty
1213		 * in a previous txg, we don't know how much
1214		 * space it will use on disk yet.  We should
1215		 * really have the struct_rwlock to access
1216		 * db_blkptr, but since this is just a guess,
1217		 * it's OK if we get an odd answer.
1218		 */
1219		ddt_prefetch(os->os_spa, bp);
1220		dnode_willuse_space(dn, -willfree, tx);
1221	}
1222
1223	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1224		rw_enter(&dn->dn_struct_rwlock, RW_READER);
1225		drop_struct_lock = TRUE;
1226	}
1227
1228	if (db->db_level == 0) {
1229		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1230		ASSERT(dn->dn_maxblkid >= db->db_blkid);
1231	}
1232
1233	if (db->db_level+1 < dn->dn_nlevels) {
1234		dmu_buf_impl_t *parent = db->db_parent;
1235		dbuf_dirty_record_t *di;
1236		int parent_held = FALSE;
1237
1238		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1239			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1240
1241			parent = dbuf_hold_level(dn, db->db_level+1,
1242			    db->db_blkid >> epbs, FTAG);
1243			ASSERT(parent != NULL);
1244			parent_held = TRUE;
1245		}
1246		if (drop_struct_lock)
1247			rw_exit(&dn->dn_struct_rwlock);
1248		ASSERT3U(db->db_level+1, ==, parent->db_level);
1249		di = dbuf_dirty(parent, tx);
1250		if (parent_held)
1251			dbuf_rele(parent, FTAG);
1252
1253		mutex_enter(&db->db_mtx);
1254		/*
1255		 * Since we've dropped the mutex, it's possible that
1256		 * dbuf_undirty() might have changed this out from under us.
1257		 */
1258		if (db->db_last_dirty == dr ||
1259		    dn->dn_object == DMU_META_DNODE_OBJECT) {
1260			mutex_enter(&di->dt.di.dr_mtx);
1261			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1262			ASSERT(!list_link_active(&dr->dr_dirty_node));
1263			list_insert_tail(&di->dt.di.dr_children, dr);
1264			mutex_exit(&di->dt.di.dr_mtx);
1265			dr->dr_parent = di;
1266		}
1267		mutex_exit(&db->db_mtx);
1268	} else {
1269		ASSERT(db->db_level+1 == dn->dn_nlevels);
1270		ASSERT(db->db_blkid < dn->dn_nblkptr);
1271		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1272		mutex_enter(&dn->dn_mtx);
1273		ASSERT(!list_link_active(&dr->dr_dirty_node));
1274		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1275		mutex_exit(&dn->dn_mtx);
1276		if (drop_struct_lock)
1277			rw_exit(&dn->dn_struct_rwlock);
1278	}
1279
1280	dnode_setdirty(dn, tx);
1281	DB_DNODE_EXIT(db);
1282	return (dr);
1283}
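
/*
 * Sketch of the dirty-record list invariant relied on above
 * (illustrative): db_last_dirty points at the newest record, and the
 * dr_next links run in strictly decreasing dr_txg order, e.g.
 *
 *	db_last_dirty -> { dr_txg = 12 } -> { dr_txg = 11 } -> NULL
 *
 * so the walks in dbuf_dirty() and dbuf_undirty() stop at the first
 * record with dr_txg <= tx->tx_txg.
 */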
1284
1285/*
1286 * Undirty a buffer in the transaction group referenced by the given
1287 * transaction.  Return whether this evicted the dbuf.
1288 */
1289static boolean_t
1290dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1291{
1292	dnode_t *dn;
1293	uint64_t txg = tx->tx_txg;
1294	dbuf_dirty_record_t *dr, **drp;
1295
1296	ASSERT(txg != 0);
1297	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1298	ASSERT0(db->db_level);
1299	ASSERT(MUTEX_HELD(&db->db_mtx));
1300
1301	/*
1302	 * If this buffer is not dirty, we're done.
1303	 */
1304	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1305		if (dr->dr_txg <= txg)
1306			break;
1307	if (dr == NULL || dr->dr_txg < txg)
1308		return (B_FALSE);
1309	ASSERT(dr->dr_txg == txg);
1310	ASSERT(dr->dr_dbuf == db);
1311
1312	DB_DNODE_ENTER(db);
1313	dn = DB_DNODE(db);
1314
1315	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1316
1317	ASSERT(db->db.db_size != 0);
1318
1319	/*
1320	 * Any space we accounted for in dp_dirty_* will be cleaned up by
1321	 * dsl_pool_sync().  This is relatively rare so the discrepancy
1322	 * is not a big deal.
1323	 */
1324
1325	*drp = dr->dr_next;
1326
1327	/*
1328	 * Note that there are three places in dbuf_dirty()
1329	 * where this dirty record may be put on a list.
1330	 * Make sure to do a list_remove corresponding to
1331	 * every one of those list_insert calls.
1332	 */
1333	if (dr->dr_parent) {
1334		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1335		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1336		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1337	} else if (db->db_blkid == DMU_SPILL_BLKID ||
1338	    db->db_level+1 == dn->dn_nlevels) {
1339		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1340		mutex_enter(&dn->dn_mtx);
1341		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1342		mutex_exit(&dn->dn_mtx);
1343	}
1344	DB_DNODE_EXIT(db);
1345
1346	if (db->db_state != DB_NOFILL) {
1347		dbuf_unoverride(dr);
1348
1349		ASSERT(db->db_buf != NULL);
1350		ASSERT(dr->dt.dl.dr_data != NULL);
1351		if (dr->dt.dl.dr_data != db->db_buf)
1352			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1353	}
1354	kmem_free(dr, sizeof (dbuf_dirty_record_t));
1355
1356	ASSERT(db->db_dirtycnt > 0);
1357	db->db_dirtycnt -= 1;
1358
1359	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1360		arc_buf_t *buf = db->db_buf;
1361
1362		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1363		dbuf_set_data(db, NULL);
1364		VERIFY(arc_buf_remove_ref(buf, db));
1365		dbuf_evict(db);
1366		return (B_TRUE);
1367	}
1368
1369	return (B_FALSE);
1370}
1371
1372void
1373dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1374{
1375	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1376	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1377
1378	ASSERT(tx->tx_txg != 0);
1379	ASSERT(!refcount_is_zero(&db->db_holds));
1380
1381	DB_DNODE_ENTER(db);
1382	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1383		rf |= DB_RF_HAVESTRUCT;
1384	DB_DNODE_EXIT(db);
1385	(void) dbuf_read(db, NULL, rf);
1386	(void) dbuf_dirty(db, tx);
1387}
1388
1389void
1390dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1391{
1392	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1393
1394	db->db_state = DB_NOFILL;
1395
1396	dmu_buf_will_fill(db_fake, tx);
1397}
1398
1399void
1400dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1401{
1402	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1403
1404	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1405	ASSERT(tx->tx_txg != 0);
1406	ASSERT(db->db_level == 0);
1407	ASSERT(!refcount_is_zero(&db->db_holds));
1408
1409	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1410	    dmu_tx_private_ok(tx));
1411
1412	dbuf_noread(db);
1413	(void) dbuf_dirty(db, tx);
1414}
1415
1416#pragma weak dmu_buf_fill_done = dbuf_fill_done
1417/* ARGSUSED */
1418void
1419dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1420{
1421	mutex_enter(&db->db_mtx);
1422	DBUF_VERIFY(db);
1423
1424	if (db->db_state == DB_FILL) {
1425		if (db->db_level == 0 && db->db_freed_in_flight) {
1426			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1427			/* we were freed while filling */
1428			/* XXX dbuf_undirty? */
1429			bzero(db->db.db_data, db->db.db_size);
1430			db->db_freed_in_flight = FALSE;
1431		}
1432		db->db_state = DB_CACHED;
1433		cv_broadcast(&db->db_changed);
1434	}
1435	mutex_exit(&db->db_mtx);
1436}
1437
1438/*
1439 * Directly assign a provided arc buf to a given dbuf if it's not referenced
1440 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1441 */
1442void
1443dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1444{
1445	ASSERT(!refcount_is_zero(&db->db_holds));
1446	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1447	ASSERT(db->db_level == 0);
1448	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1449	ASSERT(buf != NULL);
1450	ASSERT(arc_buf_size(buf) == db->db.db_size);
1451	ASSERT(tx->tx_txg != 0);
1452
1453	arc_return_buf(buf, db);
1454	ASSERT(arc_released(buf));
1455
1456	mutex_enter(&db->db_mtx);
1457
1458	while (db->db_state == DB_READ || db->db_state == DB_FILL)
1459		cv_wait(&db->db_changed, &db->db_mtx);
1460
1461	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1462
1463	if (db->db_state == DB_CACHED &&
1464	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1465		mutex_exit(&db->db_mtx);
1466		(void) dbuf_dirty(db, tx);
1467		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1468		VERIFY(arc_buf_remove_ref(buf, db));
1469		xuio_stat_wbuf_copied();
1470		return;
1471	}
1472
1473	xuio_stat_wbuf_nocopy();
1474	if (db->db_state == DB_CACHED) {
1475		dbuf_dirty_record_t *dr = db->db_last_dirty;
1476
1477		ASSERT(db->db_buf != NULL);
1478		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1479			ASSERT(dr->dt.dl.dr_data == db->db_buf);
1480			if (!arc_released(db->db_buf)) {
1481				ASSERT(dr->dt.dl.dr_override_state ==
1482				    DR_OVERRIDDEN);
1483				arc_release(db->db_buf, db);
1484			}
1485			dr->dt.dl.dr_data = buf;
1486			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1487		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1488			arc_release(db->db_buf, db);
1489			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1490		}
1491		db->db_buf = NULL;
1492	}
1493	ASSERT(db->db_buf == NULL);
1494	dbuf_set_data(db, buf);
1495	db->db_state = DB_FILL;
1496	mutex_exit(&db->db_mtx);
1497	(void) dbuf_dirty(db, tx);
1498	dmu_buf_fill_done(&db->db, tx);
1499}
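
/*
 * Illustrative caller pattern for dbuf_assign_arcbuf() above (a
 * sketch only; error handling elided, and the usual entry points are
 * the dmu.c wrappers):
 *
 *	abuf = dmu_request_arcbuf(&db->db, size);
 *	... fill abuf->b_data with the new contents ...
 *	dmu_assign_arcbuf(&db->db, offset, abuf, tx);
 *
 * If the caller's hold is the only reference, the arc_buf is adopted
 * in place; otherwise its contents are bcopy()ed into the dbuf and
 * the loaned buffer is released back to the ARC.
 */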
1500
1501/*
1502 * "Clear" the contents of this dbuf.  This will mark the dbuf
1503 * EVICTING and clear *most* of its references.  Unfortunately,
1504 * when we are not holding the dn_dbufs_mtx, we can't clear the
1505 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1506 * in this case.  For callers from the DMU we will usually see:
1507 *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1508 * For the arc callback, we will usually see:
1509 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1510 * Sometimes, though, we will get a mix of these two:
1511 *	DMU: dbuf_clear()->arc_buf_evict()
1512 *	ARC: dbuf_do_evict()->dbuf_destroy()
1513 */
1514void
1515dbuf_clear(dmu_buf_impl_t *db)
1516{
1517	dnode_t *dn;
1518	dmu_buf_impl_t *parent = db->db_parent;
1519	dmu_buf_impl_t *dndb;
1520	int dbuf_gone = FALSE;
1521
1522	ASSERT(MUTEX_HELD(&db->db_mtx));
1523	ASSERT(refcount_is_zero(&db->db_holds));
1524
1525	dbuf_evict_user(db);
1526
1527	if (db->db_state == DB_CACHED) {
1528		ASSERT(db->db.db_data != NULL);
1529		if (db->db_blkid == DMU_BONUS_BLKID) {
1530			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1531			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1532		}
1533		db->db.db_data = NULL;
1534		db->db_state = DB_UNCACHED;
1535	}
1536
1537	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1538	ASSERT(db->db_data_pending == NULL);
1539
1540	db->db_state = DB_EVICTING;
1541	db->db_blkptr = NULL;
1542
1543	DB_DNODE_ENTER(db);
1544	dn = DB_DNODE(db);
1545	dndb = dn->dn_dbuf;
1546	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1547		list_remove(&dn->dn_dbufs, db);
1548		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1549		membar_producer();
1550		DB_DNODE_EXIT(db);
1551		/*
1552		 * Decrementing the dbuf count means that the hold corresponding
1553		 * to the removed dbuf is no longer discounted in dnode_move(),
1554		 * so the dnode cannot be moved until after we release the hold.
1555		 * The membar_producer() ensures visibility of the decremented
1556		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1557		 * release any lock.
1558		 */
1559		dnode_rele(dn, db);
1560		db->db_dnode_handle = NULL;
1561	} else {
1562		DB_DNODE_EXIT(db);
1563	}
1564
1565	if (db->db_buf)
1566		dbuf_gone = arc_buf_evict(db->db_buf);
1567
1568	if (!dbuf_gone)
1569		mutex_exit(&db->db_mtx);
1570
1571	/*
1572	 * If this dbuf is referenced from an indirect dbuf,
1573	 * decrement the ref count on the indirect dbuf.
1574	 */
1575	if (parent && parent != dndb)
1576		dbuf_rele(parent, db);
1577}
1578
1579static int
1580dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1581    dmu_buf_impl_t **parentp, blkptr_t **bpp)
1582{
1583	int nlevels, epbs;
1584
1585	*parentp = NULL;
1586	*bpp = NULL;
1587
1588	ASSERT(blkid != DMU_BONUS_BLKID);
1589
1590	if (blkid == DMU_SPILL_BLKID) {
1591		mutex_enter(&dn->dn_mtx);
1592		if (dn->dn_have_spill &&
1593		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1594			*bpp = &dn->dn_phys->dn_spill;
1595		else
1596			*bpp = NULL;
1597		dbuf_add_ref(dn->dn_dbuf, NULL);
1598		*parentp = dn->dn_dbuf;
1599		mutex_exit(&dn->dn_mtx);
1600		return (0);
1601	}
1602
1603	if (dn->dn_phys->dn_nlevels == 0)
1604		nlevels = 1;
1605	else
1606		nlevels = dn->dn_phys->dn_nlevels;
1607
1608	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1609
1610	ASSERT3U(level * epbs, <, 64);
1611	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1612	if (level >= nlevels ||
1613	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1614		/* the buffer has no parent yet */
1615		return (SET_ERROR(ENOENT));
1616	} else if (level < nlevels-1) {
1617		/* this block is referenced from an indirect block */
1618		int err = dbuf_hold_impl(dn, level+1,
1619		    blkid >> epbs, fail_sparse, NULL, parentp);
1620		if (err)
1621			return (err);
1622		err = dbuf_read(*parentp, NULL,
1623		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1624		if (err) {
1625			dbuf_rele(*parentp, NULL);
1626			*parentp = NULL;
1627			return (err);
1628		}
1629		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1630		    (blkid & ((1ULL << epbs) - 1));
1631		return (0);
1632	} else {
1633		/* the block is referenced from the dnode */
1634		ASSERT3U(level, ==, nlevels-1);
1635		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1636		    blkid < dn->dn_phys->dn_nblkptr);
1637		if (dn->dn_dbuf) {
1638			dbuf_add_ref(dn->dn_dbuf, NULL);
1639			*parentp = dn->dn_dbuf;
1640		}
1641		*bpp = &dn->dn_phys->dn_blkptr[blkid];
1642		return (0);
1643	}
1644}
1645
1646static dmu_buf_impl_t *
1647dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1648    dmu_buf_impl_t *parent, blkptr_t *blkptr)
1649{
1650	objset_t *os = dn->dn_objset;
1651	dmu_buf_impl_t *db, *odb;
1652
1653	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1654	ASSERT(dn->dn_type != DMU_OT_NONE);
1655
1656	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1657
1658	db->db_objset = os;
1659	db->db.db_object = dn->dn_object;
1660	db->db_level = level;
1661	db->db_blkid = blkid;
1662	db->db_last_dirty = NULL;
1663	db->db_dirtycnt = 0;
1664	db->db_dnode_handle = dn->dn_handle;
1665	db->db_parent = parent;
1666	db->db_blkptr = blkptr;
1667
1668	db->db_user_ptr = NULL;
1669	db->db_user_data_ptr_ptr = NULL;
1670	db->db_evict_func = NULL;
1671	db->db_immediate_evict = 0;
1672	db->db_freed_in_flight = 0;
1673
1674	if (blkid == DMU_BONUS_BLKID) {
1675		ASSERT3P(parent, ==, dn->dn_dbuf);
1676		db->db.db_size = DN_MAX_BONUSLEN -
1677		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1678		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1679		db->db.db_offset = DMU_BONUS_BLKID;
1680		db->db_state = DB_UNCACHED;
1681		/* the bonus dbuf is not placed in the hash table */
1682		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1683		return (db);
1684	} else if (blkid == DMU_SPILL_BLKID) {
1685		db->db.db_size = (blkptr != NULL) ?
1686		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1687		db->db.db_offset = 0;
1688	} else {
1689		int blocksize =
1690		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1691		db->db.db_size = blocksize;
1692		db->db.db_offset = db->db_blkid * blocksize;
1693	}
1694
1695	/*
1696	 * Hold the dn_dbufs_mtx while we get the new dbuf
1697	 * in the hash table *and* added to the dbufs list.
1698	 * This prevents a possible deadlock with someone
1699	 * trying to look up this dbuf before it's added to the
1700	 * dn_dbufs list.
1701	 */
1702	mutex_enter(&dn->dn_dbufs_mtx);
1703	db->db_state = DB_EVICTING;
1704	if ((odb = dbuf_hash_insert(db)) != NULL) {
1705		/* someone else inserted it first */
1706		kmem_cache_free(dbuf_cache, db);
1707		mutex_exit(&dn->dn_dbufs_mtx);
1708		return (odb);
1709	}
1710	list_insert_head(&dn->dn_dbufs, db);
1711	if (db->db_level == 0 && db->db_blkid >=
1712	    dn->dn_unlisted_l0_blkid)
1713		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
1714	db->db_state = DB_UNCACHED;
1715	mutex_exit(&dn->dn_dbufs_mtx);
1716	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1717
1718	if (parent && parent != dn->dn_dbuf)
1719		dbuf_add_ref(parent, db);
1720
1721	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1722	    refcount_count(&dn->dn_holds) > 0);
1723	(void) refcount_add(&dn->dn_holds, db);
1724	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
1725
1726	dprintf_dbuf(db, "db=%p\n", db);
1727
1728	return (db);
1729}
1730
1731static int
1732dbuf_do_evict(void *private)
1733{
1734	arc_buf_t *buf = private;
1735	dmu_buf_impl_t *db = buf->b_private;
1736
1737	if (!MUTEX_HELD(&db->db_mtx))
1738		mutex_enter(&db->db_mtx);
1739
1740	ASSERT(refcount_is_zero(&db->db_holds));
1741
1742	if (db->db_state != DB_EVICTING) {
1743		ASSERT(db->db_state == DB_CACHED);
1744		DBUF_VERIFY(db);
1745		db->db_buf = NULL;
1746		dbuf_evict(db);
1747	} else {
1748		mutex_exit(&db->db_mtx);
1749		dbuf_destroy(db);
1750	}
1751	return (0);
1752}
1753
1754static void
1755dbuf_destroy(dmu_buf_impl_t *db)
1756{
1757	ASSERT(refcount_is_zero(&db->db_holds));
1758
1759	if (db->db_blkid != DMU_BONUS_BLKID) {
1760		/*
1761		 * If this dbuf is still on the dn_dbufs list,
1762		 * remove it from that list.
1763		 */
1764		if (db->db_dnode_handle != NULL) {
1765			dnode_t *dn;
1766
1767			DB_DNODE_ENTER(db);
1768			dn = DB_DNODE(db);
1769			mutex_enter(&dn->dn_dbufs_mtx);
1770			list_remove(&dn->dn_dbufs, db);
1771			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1772			mutex_exit(&dn->dn_dbufs_mtx);
1773			DB_DNODE_EXIT(db);
1774			/*
1775			 * Decrementing the dbuf count means that the hold
1776			 * corresponding to the removed dbuf is no longer
1777			 * discounted in dnode_move(), so the dnode cannot be
1778			 * moved until after we release the hold.
1779			 */
1780			dnode_rele(dn, db);
1781			db->db_dnode_handle = NULL;
1782		}
1783		dbuf_hash_remove(db);
1784	}
1785	db->db_parent = NULL;
1786	db->db_buf = NULL;
1787
1788	ASSERT(!list_link_active(&db->db_link));
1789	ASSERT(db->db.db_data == NULL);
1790	ASSERT(db->db_hash_next == NULL);
1791	ASSERT(db->db_blkptr == NULL);
1792	ASSERT(db->db_data_pending == NULL);
1793
1794	kmem_cache_free(dbuf_cache, db);
1795	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1796}
1797
1798void
1799dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
1800{
1801	dmu_buf_impl_t *db = NULL;
1802	blkptr_t *bp = NULL;
1803
1804	ASSERT(blkid != DMU_BONUS_BLKID);
1805	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1806
1807	if (dnode_block_freed(dn, blkid))
1808		return;
1809
1810	/* dbuf_find() returns with db_mtx held */
1811	if ((db = dbuf_find(dn, 0, blkid)) != NULL) {
1812		/*
1813		 * This dbuf is already in the cache.  We assume that
1814		 * it is already CACHED, or else about to be either
1815		 * read or filled.
1816		 */
1817		mutex_exit(&db->db_mtx);
1818		return;
1819	}
1820
1821	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1822		if (bp && !BP_IS_HOLE(bp)) {
1823			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1824			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1825			zbookmark_t zb;
1826
1827			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1828			    dn->dn_object, 0, blkid);
1829
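			/*
			 * Fire off the read with no done callback or
			 * private data; ZIO_FLAG_CANFAIL and
			 * ZIO_FLAG_SPECULATIVE make this a best-effort
			 * I/O whose failure is harmless.
			 */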
1830			(void) arc_read(NULL, dn->dn_objset->os_spa,
1831			    bp, NULL, NULL, prio,
1832			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1833			    &aflags, &zb);
1834		}
1835		if (db)
1836			dbuf_rele(db, NULL);
1837	}
1838}
1839
1840/*
1841 * Returns with db_holds incremented, and db_mtx not held.
1842 * Note: dn_struct_rwlock must be held.
1843 */
1844int
1845dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1846    void *tag, dmu_buf_impl_t **dbp)
1847{
1848	dmu_buf_impl_t *db, *parent = NULL;
1849
1850	ASSERT(blkid != DMU_BONUS_BLKID);
1851	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1852	ASSERT3U(dn->dn_nlevels, >, level);
1853
1854	*dbp = NULL;
1855top:
1856	/* dbuf_find() returns with db_mtx held */
1857	db = dbuf_find(dn, level, blkid);
1858
1859	if (db == NULL) {
1860		blkptr_t *bp = NULL;
1861		int err;
1862
1863		ASSERT3P(parent, ==, NULL);
1864		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
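		/*
		 * With fail_sparse, treat a hole as ENOENT, and release
		 * any parent hold that dbuf_findbp() handed back before
		 * returning the error.
		 */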
1865		if (fail_sparse) {
1866			if (err == 0 && bp && BP_IS_HOLE(bp))
1867				err = SET_ERROR(ENOENT);
1868			if (err) {
1869				if (parent)
1870					dbuf_rele(parent, NULL);
1871				return (err);
1872			}
1873		}
1874		if (err && err != ENOENT)
1875			return (err);
1876		db = dbuf_create(dn, level, blkid, parent, bp);
1877	}
1878
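	/*
	 * A cached dbuf with no holds does not pin its ARC buffer, so
	 * take a fresh ARC reference here.  If the ARC evicted the
	 * buffer in the meantime (b_data == NULL), clear this dbuf and
	 * retry the lookup.
	 */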
1879	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1880		arc_buf_add_ref(db->db_buf, db);
1881		if (db->db_buf->b_data == NULL) {
1882			dbuf_clear(db);
1883			if (parent) {
1884				dbuf_rele(parent, NULL);
1885				parent = NULL;
1886			}
1887			goto top;
1888		}
1889		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1890	}
1891
1892	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1893
1894	/*
1895	 * If this buffer is currently syncing out, and we are
1896	 * still referencing it from db_data, we need to make a copy
1897	 * of it in case we decide we want to dirty it again in this txg.
1898	 */
1899	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1900	    dn->dn_object != DMU_META_DNODE_OBJECT &&
1901	    db->db_state == DB_CACHED && db->db_data_pending) {
1902		dbuf_dirty_record_t *dr = db->db_data_pending;
1903
1904		if (dr->dt.dl.dr_data == db->db_buf) {
1905			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1906
1907			dbuf_set_data(db,
1908			    arc_buf_alloc(dn->dn_objset->os_spa,
1909			    db->db.db_size, db, type));
1910			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
1911			    db->db.db_size);
1912		}
1913	}
1914
1915	(void) refcount_add(&db->db_holds, tag);
1916	dbuf_update_data(db);
1917	DBUF_VERIFY(db);
1918	mutex_exit(&db->db_mtx);
1919
1920	/* NOTE: we can't rele the parent until after we drop the db_mtx */
1921	if (parent)
1922		dbuf_rele(parent, NULL);
1923
1924	ASSERT3P(DB_DNODE(db), ==, dn);
1925	ASSERT3U(db->db_blkid, ==, blkid);
1926	ASSERT3U(db->db_level, ==, level);
1927	*dbp = db;
1928
1929	return (0);
1930}
1931
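/* Convenience wrappers around dbuf_hold_impl() for the common cases. */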
1932dmu_buf_impl_t *
1933dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1934{
1935	dmu_buf_impl_t *db;
1936	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1937	return (err ? NULL : db);
1938}
1939
1940dmu_buf_impl_t *
1941dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1942{
1943	dmu_buf_impl_t *db;
1944	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1945	return (err ? NULL : db);
1946}
1947
1948void
1949dbuf_create_bonus(dnode_t *dn)
1950{
1951	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1952
1953	ASSERT(dn->dn_bonus == NULL);
1954	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
1955}
1956
1957int
1958dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
1959{
1960	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1961	dnode_t *dn;
1962
1963	if (db->db_blkid != DMU_SPILL_BLKID)
1964		return (SET_ERROR(ENOTSUP));
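	/*
	 * Clamp the new size to [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE],
	 * rounding up to a multiple of SPA_MINBLOCKSIZE.
	 */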
1965	if (blksz == 0)
1966		blksz = SPA_MINBLOCKSIZE;
1967	if (blksz > SPA_MAXBLOCKSIZE)
1968		blksz = SPA_MAXBLOCKSIZE;
1969	else
1970		blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
1971
1972	DB_DNODE_ENTER(db);
1973	dn = DB_DNODE(db);
1974	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1975	dbuf_new_size(db, blksz, tx);
1976	rw_exit(&dn->dn_struct_rwlock);
1977	DB_DNODE_EXIT(db);
1978
1979	return (0);
1980}
1981
1982void
1983dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
1984{
1985	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
1986}
1987
1988#pragma weak dmu_buf_add_ref = dbuf_add_ref
1989void
1990dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
1991{
1992	int64_t holds = refcount_add(&db->db_holds, tag);
1993	ASSERT(holds > 1);
1994}
1995
1996/*
1997 * If you call dbuf_rele() you had better not be referencing the dnode handle
1998 * unless you have some other direct or indirect hold on the dnode. (An indirect
1999 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2000 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2001 * dnode's parent dbuf evicting its dnode handles.
2002 */
2003void
2004dbuf_rele(dmu_buf_impl_t *db, void *tag)
2005{
2006	mutex_enter(&db->db_mtx);
2007	dbuf_rele_and_unlock(db, tag);
2008}
2009
2010void
2011dmu_buf_rele(dmu_buf_t *db, void *tag)
2012{
2013	dbuf_rele((dmu_buf_impl_t *)db, tag);
2014}
2015
2016/*
2017 * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
2018 * db_dirtycnt and db_holds to be updated atomically.
2019 */
2020void
2021dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2022{
2023	int64_t holds;
2024
2025	ASSERT(MUTEX_HELD(&db->db_mtx));
2026	DBUF_VERIFY(db);
2027
2028	/*
2029	 * Remove the reference to the dbuf before removing its hold on the
2030	 * dnode so we can guarantee in dnode_move() that a referenced bonus
2031	 * buffer has a corresponding dnode hold.
2032	 */
2033	holds = refcount_remove(&db->db_holds, tag);
2034	ASSERT(holds >= 0);
2035
2036	/*
2037	 * We can't freeze indirects if there is a possibility that they
2038	 * may be modified in the current syncing context.
2039	 */
2040	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2041		arc_buf_freeze(db->db_buf);
2042
2043	if (holds == db->db_dirtycnt &&
2044	    db->db_level == 0 && db->db_immediate_evict)
2045		dbuf_evict_user(db);
2046
2047	if (holds == 0) {
2048		if (db->db_blkid == DMU_BONUS_BLKID) {
2049			mutex_exit(&db->db_mtx);
2050
2051			/*
2052			 * If the dnode moves here, we cannot cross this barrier
2053			 * until the move completes.
2054			 */
2055			DB_DNODE_ENTER(db);
2056			(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
2057			DB_DNODE_EXIT(db);
2058			/*
2059			 * The bonus buffer's dnode hold is no longer discounted
2060			 * in dnode_move(). The dnode cannot move until after
2061			 * the dnode_rele().
2062			 */
2063			dnode_rele(DB_DNODE(db), db);
2064		} else if (db->db_buf == NULL) {
2065			/*
2066			 * This is a special case: we never associated this
2067			 * dbuf with any data allocated from the ARC.
2068			 */
2069			ASSERT(db->db_state == DB_UNCACHED ||
2070			    db->db_state == DB_NOFILL);
2071			dbuf_evict(db);
2072		} else if (arc_released(db->db_buf)) {
2073			arc_buf_t *buf = db->db_buf;
2074			/*
2075			 * This dbuf has anonymous data associated with it.
2076			 */
2077			dbuf_set_data(db, NULL);
2078			VERIFY(arc_buf_remove_ref(buf, db));
2079			dbuf_evict(db);
2080		} else {
2081			VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2082
2083			/*
2084			 * A dbuf will be eligible for eviction if either the
2085			 * 'primarycache' property excludes it or a duplicate
2086			 * copy of this buffer is already cached in the arc.
2087			 *
2088			 * In the case of the 'primarycache' property, a buffer
2089			 * is considered for eviction if it matches the
2090			 * criteria set in the property.
2091			 *
2092			 * To decide if our buffer is considered a
2093			 * duplicate, we must call into the arc to determine
2094			 * if multiple buffers are referencing the same
2095			 * block on-disk. If so, then we simply evict
2096			 * ourselves.
2097			 */
2098			if (!DBUF_IS_CACHEABLE(db) ||
2099			    arc_buf_eviction_needed(db->db_buf))
2100				dbuf_clear(db);
2101			else
2102				mutex_exit(&db->db_mtx);
2103		}
2104	} else {
2105		mutex_exit(&db->db_mtx);
2106	}
2107}
2108
2109#pragma weak dmu_buf_refcount = dbuf_refcount
2110uint64_t
2111dbuf_refcount(dmu_buf_impl_t *db)
2112{
2113	return (refcount_count(&db->db_holds));
2114}
2115
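/*
 * Attach user data and an eviction callback to a dbuf.  Returns NULL on
 * success, or the previously attached user pointer if one was already
 * set.  A hypothetical usage sketch (my_state, ms_data and my_evict_cb
 * stand in for a real consumer's state and callback):
 *
 *	if (dmu_buf_set_user(db, my_state, &my_state->ms_data,
 *	    my_evict_cb) != NULL)
 *		my_state_free(my_state);	(lost the race)
 */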
2116void *
2117dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2118    dmu_buf_evict_func_t *evict_func)
2119{
2120	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2121	    user_data_ptr_ptr, evict_func));
2122}
2123
2124void *
2125dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2126    dmu_buf_evict_func_t *evict_func)
2127{
2128	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2129
2130	db->db_immediate_evict = TRUE;
2131	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2132	    user_data_ptr_ptr, evict_func));
2133}
2134
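/*
 * Compare-and-swap the user data on a dbuf: the new values are installed
 * only if the current user pointer equals old_user_ptr.  Either way, the
 * user pointer that was current at the time of the call is returned, so
 * callers can detect a lost race.
 */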
2135void *
2136dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2137    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
2138{
2139	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2140	ASSERT(db->db_level == 0);
2141
2142	ASSERT((user_ptr == NULL) == (evict_func == NULL));
2143
2144	mutex_enter(&db->db_mtx);
2145
2146	if (db->db_user_ptr == old_user_ptr) {
2147		db->db_user_ptr = user_ptr;
2148		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
2149		db->db_evict_func = evict_func;
2150
2151		dbuf_update_data(db);
2152	} else {
2153		old_user_ptr = db->db_user_ptr;
2154	}
2155
2156	mutex_exit(&db->db_mtx);
2157	return (old_user_ptr);
2158}
2159
2160void *
2161dmu_buf_get_user(dmu_buf_t *db_fake)
2162{
2163	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2164	ASSERT(!refcount_is_zero(&db->db_holds));
2165
2166	return (db->db_user_ptr);
2167}
2168
2169boolean_t
2170dmu_buf_freeable(dmu_buf_t *dbuf)
2171{
2172	boolean_t res = B_FALSE;
2173	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2174
2175	if (db->db_blkptr)
2176		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2177		    db->db_blkptr, db->db_blkptr->blk_birth);
2178
2179	return (res);
2180}
2181
2182blkptr_t *
2183dmu_buf_get_blkptr(dmu_buf_t *db)
2184{
2185	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2186	return (dbi->db_blkptr);
2187}
2188
2189static void
2190dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2191{
2192	/* ASSERT(dmu_tx_is_syncing(tx)) */
2193	ASSERT(MUTEX_HELD(&db->db_mtx));
2194
2195	if (db->db_blkptr != NULL)
2196		return;
2197
2198	if (db->db_blkid == DMU_SPILL_BLKID) {
2199		db->db_blkptr = &dn->dn_phys->dn_spill;
2200		BP_ZERO(db->db_blkptr);
2201		return;
2202	}
2203	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2204		/*
2205		 * This buffer was allocated at a time when there were
2206		 * no available blkptrs from the dnode, or it was
2207		 * inappropriate to hook it in (i.e., nlevels mismatch).
2208		 */
2209		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2210		ASSERT(db->db_parent == NULL);
2211		db->db_parent = dn->dn_dbuf;
2212		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2213		DBUF_VERIFY(db);
2214	} else {
2215		dmu_buf_impl_t *parent = db->db_parent;
2216		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2217
2218		ASSERT(dn->dn_phys->dn_nlevels > 1);
2219		if (parent == NULL) {
2220			mutex_exit(&db->db_mtx);
2221			rw_enter(&dn->dn_struct_rwlock, RW_READER);
2222			(void) dbuf_hold_impl(dn, db->db_level+1,
2223			    db->db_blkid >> epbs, FALSE, db, &parent);
2224			rw_exit(&dn->dn_struct_rwlock);
2225			mutex_enter(&db->db_mtx);
2226			db->db_parent = parent;
2227		}
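		/*
		 * Each indirect block holds 1 << epbs block pointers, so
		 * mask the blkid to find our slot within the parent.
		 */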
2228		db->db_blkptr = (blkptr_t *)parent->db.db_data +
2229		    (db->db_blkid & ((1ULL << epbs) - 1));
2230		DBUF_VERIFY(db);
2231	}
2232}
2233
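/*
 * Sync a dirty indirect block: issue its write, then sync its dirty
 * children so their I/Os are chained beneath ours.
 */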
2234static void
2235dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2236{
2237	dmu_buf_impl_t *db = dr->dr_dbuf;
2238	dnode_t *dn;
2239	zio_t *zio;
2240
2241	ASSERT(dmu_tx_is_syncing(tx));
2242
2243	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2244
2245	mutex_enter(&db->db_mtx);
2246
2247	ASSERT(db->db_level > 0);
2248	DBUF_VERIFY(db);
2249
2250	/* Read the block if it hasn't been read yet. */
2251	if (db->db_buf == NULL) {
2252		mutex_exit(&db->db_mtx);
2253		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2254		mutex_enter(&db->db_mtx);
2255	}
2256	ASSERT3U(db->db_state, ==, DB_CACHED);
2257	ASSERT(db->db_buf != NULL);
2258
2259	DB_DNODE_ENTER(db);
2260	dn = DB_DNODE(db);
2261	/* Indirect block size must match what the dnode thinks it is. */
2262	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2263	dbuf_check_blkptr(dn, db);
2264	DB_DNODE_EXIT(db);
2265
2266	/* Provide the pending dirty record to child dbufs */
2267	db->db_data_pending = dr;
2268
2269	mutex_exit(&db->db_mtx);
2270	dbuf_write(dr, db->db_buf, tx);
2271
2272	zio = dr->dr_zio;
2273	mutex_enter(&dr->dt.di.dr_mtx);
2274	dbuf_sync_list(&dr->dt.di.dr_children, tx);
2275	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2276	mutex_exit(&dr->dt.di.dr_mtx);
2277	zio_nowait(zio);
2278}
2279
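/*
 * Sync a dirty level-0 block: bonus buffers are copied directly into
 * the dnode, while ordinary blocks have their writes issued here.
 */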
2280static void
2281dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2282{
2283	arc_buf_t **datap = &dr->dt.dl.dr_data;
2284	dmu_buf_impl_t *db = dr->dr_dbuf;
2285	dnode_t *dn;
2286	objset_t *os;
2287	uint64_t txg = tx->tx_txg;
2288
2289	ASSERT(dmu_tx_is_syncing(tx));
2290
2291	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2292
2293	mutex_enter(&db->db_mtx);
2294	/*
2295	 * To be synced, we must be dirtied.  But we
2296	 * might have been freed after being dirtied.
2297	 */
2298	if (db->db_state == DB_UNCACHED) {
2299		/* This buffer has been freed since it was dirtied */
2300		ASSERT(db->db.db_data == NULL);
2301	} else if (db->db_state == DB_FILL) {
2302		/* This buffer was freed and is now being re-filled */
2303		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2304	} else {
2305		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2306	}
2307	DBUF_VERIFY(db);
2308
2309	DB_DNODE_ENTER(db);
2310	dn = DB_DNODE(db);
2311
2312	if (db->db_blkid == DMU_SPILL_BLKID) {
2313		mutex_enter(&dn->dn_mtx);
2314		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2315		mutex_exit(&dn->dn_mtx);
2316	}
2317
2318	/*
2319	 * If this is a bonus buffer, simply copy the bonus data into the
2320	 * dnode.  It will be written out when the dnode is synced (and it
2321	 * will be synced, since it must have been dirty for dbuf_sync to
2322	 * be called).
2323	 */
2324	if (db->db_blkid == DMU_BONUS_BLKID) {
2325		dbuf_dirty_record_t **drp;
2326
2327		ASSERT(*datap != NULL);
2328		ASSERT0(db->db_level);
2329		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2330		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2331		DB_DNODE_EXIT(db);
2332
2333		if (*datap != db->db.db_data) {
2334			zio_buf_free(*datap, DN_MAX_BONUSLEN);
2335			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2336		}
2337		db->db_data_pending = NULL;
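		/*
		 * The bonus data now lives in dn_phys, so unlink and
		 * free this dirty record.
		 */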
2338		drp = &db->db_last_dirty;
2339		while (*drp != dr)
2340			drp = &(*drp)->dr_next;
2341		ASSERT(dr->dr_next == NULL);
2342		ASSERT(dr->dr_dbuf == db);
2343		*drp = dr->dr_next;
2344		if (dr->dr_dbuf->db_level != 0) {
2345			list_destroy(&dr->dt.di.dr_children);
2346			mutex_destroy(&dr->dt.di.dr_mtx);
2347		}
2348		kmem_free(dr, sizeof (dbuf_dirty_record_t));
2349		ASSERT(db->db_dirtycnt > 0);
2350		db->db_dirtycnt -= 1;
2351		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2352		return;
2353	}
2354
2355	os = dn->dn_objset;
2356
2357	/*
2358	 * This function may have dropped the db_mtx lock, allowing a dmu_sync
2359	 * operation to sneak in. As a result, we need to ensure that we
2360	 * don't check the dr_override_state until we have returned from
2361	 * dbuf_check_blkptr.
2362	 */
2363	dbuf_check_blkptr(dn, db);
2364
2365	/*
2366	 * If this buffer is in the middle of an immediate write,
2367	 * wait for the synchronous IO to complete.
2368	 */
2369	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2370		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2371		cv_wait(&db->db_changed, &db->db_mtx);
2372		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2373	}
2374
2375	if (db->db_state != DB_NOFILL &&
2376	    dn->dn_object != DMU_META_DNODE_OBJECT &&
2377	    refcount_count(&db->db_holds) > 1 &&
2378	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2379	    *datap == db->db_buf) {
2380		/*
2381		 * If this buffer is currently "in use" (i.e., there
2382		 * are active holds and db_data still references it),
2383		 * then make a copy before we start the write so that
2384		 * any modifications from the open txg will not leak
2385		 * into this write.
2386		 *
2387		 * NOTE: this copy does not need to be made for
2388		 * objects only modified in the syncing context (e.g.
2389		 * blocks of the meta-dnode object).
2390		 */
2391		int blksz = arc_buf_size(*datap);
2392		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2393		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2394		bcopy(db->db.db_data, (*datap)->b_data, blksz);
2395	}
2396	db->db_data_pending = dr;
2397
2398	mutex_exit(&db->db_mtx);
2399
2400	dbuf_write(dr, *datap, tx);
2401
2402	ASSERT(!list_link_active(&dr->dr_dirty_node));
2403	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2404		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2405		DB_DNODE_EXIT(db);
2406	} else {
2407		/*
2408		 * Although zio_nowait() does not "wait for an IO", it does
2409		 * initiate the IO. If this is an empty write it seems plausible
2410		 * that the IO could actually be completed before the nowait
2411		 * returns. We need to DB_DNODE_EXIT() first in case
2412		 * zio_nowait() invalidates the dbuf.
2413		 */
2414		DB_DNODE_EXIT(db);
2415		zio_nowait(dr->dr_zio);
2416	}
2417}
2418
2419void
2420dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2421{
2422	dbuf_dirty_record_t *dr;
2423
2424	while ((dr = list_head(list)) != NULL) {
2425		if (dr->dr_zio != NULL) {
2426			/*
2427			 * If we find an already initialized zio then we
2428			 * are processing the meta-dnode, and we have finished.
2429			 * The dbufs for all dnodes are put back on the list
2430			 * during processing, so that we can zio_wait()
2431			 * these IOs after initiating all child IOs.
2432			 */
2433			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2434			    DMU_META_DNODE_OBJECT);
2435			break;
2436		}
2437		list_remove(list, dr);
2438		if (dr->dr_dbuf->db_level > 0)
2439			dbuf_sync_indirect(dr, tx);
2440		else
2441			dbuf_sync_leaf(dr, tx);
2442	}
2443}
2444
2445/* ARGSUSED */
2446static void
2447dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2448{
2449	dmu_buf_impl_t *db = vdb;
2450	dnode_t *dn;
2451	blkptr_t *bp = zio->io_bp;
2452	blkptr_t *bp_orig = &zio->io_bp_orig;
2453	spa_t *spa = zio->io_spa;
2454	int64_t delta;
2455	uint64_t fill = 0;
2456	int i;
2457
2458	ASSERT(db->db_blkptr == bp);
2459
2460	DB_DNODE_ENTER(db);
2461	dn = DB_DNODE(db);
2462	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2463	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2464	zio->io_prev_space_delta = delta;
2465
2466	if (bp->blk_birth != 0) {
2467		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2468		    BP_GET_TYPE(bp) == dn->dn_type) ||
2469		    (db->db_blkid == DMU_SPILL_BLKID &&
2470		    BP_GET_TYPE(bp) == dn->dn_bonustype));
2471		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2472	}
2473
2474	mutex_enter(&db->db_mtx);
2475
2476#ifdef ZFS_DEBUG
2477	if (db->db_blkid == DMU_SPILL_BLKID) {
2478		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2479		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2480		    db->db_blkptr == &dn->dn_phys->dn_spill);
2481	}
2482#endif
2483
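	/*
	 * Recompute this block's fill count: the number of in-use dnodes
	 * in a dnode block, 0 or 1 for an ordinary data block, or the
	 * sum of the children's fill counts for an indirect block.
	 */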
2484	if (db->db_level == 0) {
2485		mutex_enter(&dn->dn_mtx);
2486		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2487		    db->db_blkid != DMU_SPILL_BLKID)
2488			dn->dn_phys->dn_maxblkid = db->db_blkid;
2489		mutex_exit(&dn->dn_mtx);
2490
2491		if (dn->dn_type == DMU_OT_DNODE) {
2492			dnode_phys_t *dnp = db->db.db_data;
2493			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2494			    i--, dnp++) {
2495				if (dnp->dn_type != DMU_OT_NONE)
2496					fill++;
2497			}
2498		} else {
2499			fill = BP_IS_HOLE(bp) ? 0 : 1;
2504		}
2505	} else {
2506		blkptr_t *ibp = db->db.db_data;
2507		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2508		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2509			if (BP_IS_HOLE(ibp))
2510				continue;
2511			fill += ibp->blk_fill;
2512		}
2513	}
2514	DB_DNODE_EXIT(db);
2515
2516	bp->blk_fill = fill;
2517
2518	mutex_exit(&db->db_mtx);
2519}
2520
2521/*
2522 * The SPA will call this callback several times for each zio - once
2523 * for every physical child i/o (zio->io_phys_children times).  This
2524 * allows the DMU to monitor the progress of each logical i/o.  For example,
2525 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
2526 * block.  There may be a long delay before all copies/fragments are completed,
2527 * so this callback allows us to retire dirty space gradually, as the physical
2528 * i/os complete.
2529 */
2530/* ARGSUSED */
2531static void
2532dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
2533{
2534	dmu_buf_impl_t *db = arg;
2535	objset_t *os = db->db_objset;
2536	dsl_pool_t *dp = dmu_objset_pool(os);
2537	dbuf_dirty_record_t *dr;
2538	int delta = 0;
2539
2540	dr = db->db_data_pending;
2541	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
2542
2543	/*
2544	 * The callback will be called io_phys_children times.  Retire one
2545	 * portion of our dirty space each time we are called.  Any rounding
2546	 * error will be cleaned up by dsl_pool_sync()'s call to
2547	 * dsl_pool_undirty_space().
2548	 */
2549	delta = dr->dr_accounted / zio->io_phys_children;
2550	dsl_pool_undirty_space(dp, delta, zio->io_txg);
2551}
2552
2553/* ARGSUSED */
2554static void
2555dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2556{
2557	dmu_buf_impl_t *db = vdb;
2558	blkptr_t *bp_orig = &zio->io_bp_orig;
2559	blkptr_t *bp = db->db_blkptr;
2560	objset_t *os = db->db_objset;
2561	dmu_tx_t *tx = os->os_synctx;
2562	dbuf_dirty_record_t **drp, *dr;
2563
2564	ASSERT0(zio->io_error);
2565	ASSERT(db->db_blkptr == bp);
2566
2567	/*
2568	 * For nopwrites and rewrites we ensure that the bp matches our
2569	 * original and bypass all the accounting.
2570	 */
2571	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2572		ASSERT(BP_EQUAL(bp, bp_orig));
2573	} else {
2574		dsl_dataset_t *ds = os->os_dsl_dataset;
2575		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
2576		dsl_dataset_block_born(ds, bp, tx);
2577	}
2578
2579	mutex_enter(&db->db_mtx);
2580
2581	DBUF_VERIFY(db);
2582
2583	drp = &db->db_last_dirty;
2584	while ((dr = *drp) != db->db_data_pending)
2585		drp = &dr->dr_next;
2586	ASSERT(!list_link_active(&dr->dr_dirty_node));
2587	ASSERT(dr->dr_dbuf == db);
2588	ASSERT(dr->dr_next == NULL);
2589	*drp = dr->dr_next;
2590
2591#ifdef ZFS_DEBUG
2592	if (db->db_blkid == DMU_SPILL_BLKID) {
2593		dnode_t *dn;
2594
2595		DB_DNODE_ENTER(db);
2596		dn = DB_DNODE(db);
2597		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2598		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2599		    db->db_blkptr == &dn->dn_phys->dn_spill);
2600		DB_DNODE_EXIT(db);
2601	}
2602#endif
2603
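	/*
	 * Release this dirty record's hold on the data: free any private
	 * copy, or hand the shared buffer back to the ARC by registering
	 * the eviction callback.
	 */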
2604	if (db->db_level == 0) {
2605		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2606		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2607		if (db->db_state != DB_NOFILL) {
2608			if (dr->dt.dl.dr_data != db->db_buf)
2609				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
2610				    db));
2611			else if (!arc_released(db->db_buf))
2612				arc_set_callback(db->db_buf, dbuf_do_evict, db);
2613		}
2614	} else {
2615		dnode_t *dn;
2616
2617		DB_DNODE_ENTER(db);
2618		dn = DB_DNODE(db);
2619		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2620		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
2621		if (!BP_IS_HOLE(db->db_blkptr)) {
2622			int epbs =
2623			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2624			ASSERT3U(db->db_blkid, <=,
2625			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
2626			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2627			    db->db.db_size);
2628			arc_set_callback(db->db_buf, dbuf_do_evict, db);
2629		}
2630		DB_DNODE_EXIT(db);
2631		mutex_destroy(&dr->dt.di.dr_mtx);
2632		list_destroy(&dr->dt.di.dr_children);
2633	}
2634	kmem_free(dr, sizeof (dbuf_dirty_record_t));
2635
2636	cv_broadcast(&db->db_changed);
2637	ASSERT(db->db_dirtycnt > 0);
2638	db->db_dirtycnt -= 1;
2639	db->db_data_pending = NULL;
2640	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
2641}
2642
2643static void
2644dbuf_write_nofill_ready(zio_t *zio)
2645{
2646	dbuf_write_ready(zio, NULL, zio->io_private);
2647}
2648
2649static void
2650dbuf_write_nofill_done(zio_t *zio)
2651{
2652	dbuf_write_done(zio, NULL, zio->io_private);
2653}
2654
2655static void
2656dbuf_write_override_ready(zio_t *zio)
2657{
2658	dbuf_dirty_record_t *dr = zio->io_private;
2659	dmu_buf_impl_t *db = dr->dr_dbuf;
2660
2661	dbuf_write_ready(zio, NULL, db);
2662}
2663
2664static void
2665dbuf_write_override_done(zio_t *zio)
2666{
2667	dbuf_dirty_record_t *dr = zio->io_private;
2668	dmu_buf_impl_t *db = dr->dr_dbuf;
2669	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2670
2671	mutex_enter(&db->db_mtx);
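	/*
	 * If the final block pointer differs from the one installed by
	 * dmu_sync(), the overridden copy is unused: free it (unless it
	 * is a hole) and release the ARC buffer for rewriting.
	 */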
2672	if (!BP_EQUAL(zio->io_bp, obp)) {
2673		if (!BP_IS_HOLE(obp))
2674			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2675		arc_release(dr->dt.dl.dr_data, db);
2676	}
2677	mutex_exit(&db->db_mtx);
2678
2679	dbuf_write_done(zio, NULL, db);
2680}
2681
2682/* Issue I/O to commit a dirty buffer to disk. */
2683static void
2684dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2685{
2686	dmu_buf_impl_t *db = dr->dr_dbuf;
2687	dnode_t *dn;
2688	objset_t *os;
2689	dmu_buf_impl_t *parent = db->db_parent;
2690	uint64_t txg = tx->tx_txg;
2691	zbookmark_t zb;
2692	zio_prop_t zp;
2693	zio_t *zio;
2694	int wp_flag = 0;
2695
2696	DB_DNODE_ENTER(db);
2697	dn = DB_DNODE(db);
2698	os = dn->dn_objset;
2699
2700	if (db->db_state != DB_NOFILL) {
2701		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2702			/*
2703			 * Private object buffers are released here rather
2704			 * than in dbuf_dirty() since they are only modified
2705			 * in the syncing context and we don't want the
2706			 * overhead of making multiple copies of the data.
2707			 */
2708			if (BP_IS_HOLE(db->db_blkptr)) {
2709				arc_buf_thaw(data);
2710			} else {
2711				dbuf_release_bp(db);
2712			}
2713		}
2714	}
2715
2716	if (parent != dn->dn_dbuf) {
2717		/* Our parent is an indirect block. */
2718		/* We have a dirty parent that has been scheduled for write. */
2719		ASSERT(parent && parent->db_data_pending);
2720		/* Our parent's buffer is one level closer to the dnode. */
2721		ASSERT(db->db_level == parent->db_level-1);
2722		/*
2723		 * We're about to modify our parent's db_data by modifying
2724		 * our block pointer, so the parent must be released.
2725		 */
2726		ASSERT(arc_released(parent->db_buf));
2727		zio = parent->db_data_pending->dr_zio;
2728	} else {
2729		/* Our parent is the dnode itself. */
2730		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2731		    db->db_blkid != DMU_SPILL_BLKID) ||
2732		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2733		if (db->db_blkid != DMU_SPILL_BLKID)
2734			ASSERT3P(db->db_blkptr, ==,
2735			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
2736		zio = dn->dn_zio;
2737	}
2738
2739	ASSERT(db->db_level == 0 || data == db->db_buf);
2740	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2741	ASSERT(zio);
2742
2743	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2744	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2745	    db->db.db_object, db->db_level, db->db_blkid);
2746
2747	if (db->db_blkid == DMU_SPILL_BLKID)
2748		wp_flag = WP_SPILL;
2749	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2750
2751	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2752	DB_DNODE_EXIT(db);
2753
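	/*
	 * Issue the write via one of three paths: an override write whose
	 * data was already written by dmu_sync(), a NOFILL write that
	 * allocates a block but carries no data, or a normal ARC-backed
	 * write.
	 */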
2754	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2755		ASSERT(db->db_state != DB_NOFILL);
2756		dr->dr_zio = zio_write(zio, os->os_spa, txg,
2757		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
2758		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
2759		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2760		mutex_enter(&db->db_mtx);
2761		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2762		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2763		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2764		mutex_exit(&db->db_mtx);
2765	} else if (db->db_state == DB_NOFILL) {
2766		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
2767		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
2768		dr->dr_zio = zio_write(zio, os->os_spa, txg,
2769		    db->db_blkptr, NULL, db->db.db_size, &zp,
2770		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
2771		    ZIO_PRIORITY_ASYNC_WRITE,
2772		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2773	} else {
2774		ASSERT(arc_released(data));
2775		dr->dr_zio = arc_write(zio, os->os_spa, txg,
2776		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
2777		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
2778		    dbuf_write_physdone, dbuf_write_done, db,
2779		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2780	}
2781}
2782