1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21236884Smm
22168404Spjd/*
23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24265740Sdelphij * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25168404Spjd */
26168404Spjd
27168404Spjd#include <sys/zfs_context.h>
28168404Spjd#include <sys/dbuf.h>
29168404Spjd#include <sys/dnode.h>
30168404Spjd#include <sys/dmu.h>
31168404Spjd#include <sys/dmu_tx.h>
32168404Spjd#include <sys/dmu_objset.h>
33168404Spjd#include <sys/dsl_dataset.h>
34168404Spjd#include <sys/spa.h>
35265740Sdelphij#include <sys/range_tree.h>
36263397Sdelphij#include <sys/zfeature.h>
37168404Spjd
/*
 * Add one level of indirection to the dnode in syncing context: hold the
 * dbuf that will become the new top-level indirect block, copy the dnode's
 * embedded block pointers into it, and repoint any cached child dbufs at
 * the new parent.  The pending new level count was recorded in
 * dn_next_nlevels[] for this txg by open context.
 */
static void
dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db;
	int txgoff = tx->tx_txg & TXG_MASK;
	int nblkptr = dn->dn_phys->dn_nblkptr;
	int old_toplvl = dn->dn_phys->dn_nlevels - 1;
	int new_level = dn->dn_next_nlevels[txgoff];
	int i;

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);

	/* this dnode can't be paged out because it's dirty */
	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
	ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);

	/* hold block 0 at the level that will become the new top */
	db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
	ASSERT(db != NULL);

	dn->dn_phys->dn_nlevels = new_level;
	dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
	    dn->dn_object, dn->dn_phys->dn_nlevels);

	/* check for existing blkptrs in the dnode */
	for (i = 0; i < nblkptr; i++)
		if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
			break;
	if (i != nblkptr) {
		/* transfer dnode's block pointers to new indirect block */
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
		ASSERT(db->db.db_data);
		ASSERT(arc_released(db->db_buf));
		ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
		bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
		    sizeof (blkptr_t) * nblkptr);
		arc_buf_freeze(db->db_buf);
	}

	/* set dbuf's parent pointers to new indirect buf */
	for (i = 0; i < nblkptr; i++) {
		/* NOTE: dbuf_find() returns with the child's db_mtx held */
		dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);

		if (child == NULL)
			continue;
#ifdef	DEBUG
		DB_DNODE_ENTER(child);
		ASSERT3P(DB_DNODE(child), ==, dn);
		DB_DNODE_EXIT(child);
#endif	/* DEBUG */
		if (child->db_parent && child->db_parent != dn->dn_dbuf) {
			/* child already points at an intermediate indirect */
			ASSERT(child->db_parent->db_level == db->db_level);
			ASSERT(child->db_blkptr !=
			    &dn->dn_phys->dn_blkptr[child->db_blkid]);
			mutex_exit(&child->db_mtx);
			continue;
		}
		ASSERT(child->db_parent == NULL ||
		    child->db_parent == dn->dn_dbuf);

		child->db_parent = db;
		dbuf_add_ref(db, child);
		if (db->db.db_data)
			child->db_blkptr = (blkptr_t *)db->db.db_data + i;
		else
			child->db_blkptr = NULL;
		dprintf_dbuf_bp(child, child->db_blkptr,
		    "changed db_blkptr to new indirect %s", "");

		mutex_exit(&child->db_mtx);
	}

	/* the old top-level pointers now live in the new indirect block */
	bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);

	dbuf_rele(db, FTAG);

	rw_exit(&dn->dn_struct_rwlock);
}
116168404Spjd
/*
 * Free the blocks referenced by the "num" block pointers starting at "bp",
 * crediting the freed bytes back to the dnode's space accounting.  When the
 * hole_birth feature is active, each zeroed bp retains its logical size,
 * type, level, and gains a birth txg so zfs send can tell new holes from
 * always-been-holes.
 */
static void
free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
	uint64_t bytesfreed = 0;

	dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);

	for (int i = 0; i < num; i++, bp++) {
		if (BP_IS_HOLE(bp))
			continue;

		bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
		ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));

		/*
		 * Save some useful information on the holes being
		 * punched, including logical size, type, and indirection
		 * level. Retaining birth time enables detection of when
		 * holes are punched for reducing the number of free
		 * records transmitted during a zfs send.
		 */

		/* capture before bzero() wipes the bp */
		uint64_t lsize = BP_GET_LSIZE(bp);
		dmu_object_type_t type = BP_GET_TYPE(bp);
		uint64_t lvl = BP_GET_LEVEL(bp);

		bzero(bp, sizeof (blkptr_t));

		if (spa_feature_is_active(dn->dn_objset->os_spa,
		    SPA_FEATURE_HOLE_BIRTH)) {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, type);
			BP_SET_LEVEL(bp, lvl);
			BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
		}
	}
	/* negative delta: we are giving space back */
	dnode_diduse_space(dn, -bytesfreed);
}
156168404Spjd
157168404Spjd#ifdef ZFS_DEBUG
158168404Spjdstatic void
159168404Spjdfree_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
160168404Spjd{
161168404Spjd	int off, num;
162168404Spjd	int i, err, epbs;
163168404Spjd	uint64_t txg = tx->tx_txg;
164219089Spjd	dnode_t *dn;
165168404Spjd
166219089Spjd	DB_DNODE_ENTER(db);
167219089Spjd	dn = DB_DNODE(db);
168219089Spjd	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
169168404Spjd	off = start - (db->db_blkid * 1<<epbs);
170168404Spjd	num = end - start + 1;
171168404Spjd
172168404Spjd	ASSERT3U(off, >=, 0);
173168404Spjd	ASSERT3U(num, >=, 0);
174168404Spjd	ASSERT3U(db->db_level, >, 0);
175219089Spjd	ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
176168404Spjd	ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
177168404Spjd	ASSERT(db->db_blkptr != NULL);
178168404Spjd
179168404Spjd	for (i = off; i < off+num; i++) {
180168404Spjd		uint64_t *buf;
181168404Spjd		dmu_buf_impl_t *child;
182168404Spjd		dbuf_dirty_record_t *dr;
183168404Spjd		int j;
184168404Spjd
185168404Spjd		ASSERT(db->db_level == 1);
186168404Spjd
187219089Spjd		rw_enter(&dn->dn_struct_rwlock, RW_READER);
188219089Spjd		err = dbuf_hold_impl(dn, db->db_level-1,
189185029Spjd		    (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
190219089Spjd		rw_exit(&dn->dn_struct_rwlock);
191168404Spjd		if (err == ENOENT)
192168404Spjd			continue;
193168404Spjd		ASSERT(err == 0);
194168404Spjd		ASSERT(child->db_level == 0);
195168404Spjd		dr = child->db_last_dirty;
196168404Spjd		while (dr && dr->dr_txg > txg)
197168404Spjd			dr = dr->dr_next;
198168404Spjd		ASSERT(dr == NULL || dr->dr_txg == txg);
199168404Spjd
200168404Spjd		/* data_old better be zeroed */
201168404Spjd		if (dr) {
202168404Spjd			buf = dr->dt.dl.dr_data->b_data;
203168404Spjd			for (j = 0; j < child->db.db_size >> 3; j++) {
204168404Spjd				if (buf[j] != 0) {
205168404Spjd					panic("freed data not zero: "
206168404Spjd					    "child=%p i=%d off=%d num=%d\n",
207185029Spjd					    (void *)child, i, off, num);
208168404Spjd				}
209168404Spjd			}
210168404Spjd		}
211168404Spjd
212168404Spjd		/*
213168404Spjd		 * db_data better be zeroed unless it's dirty in a
214168404Spjd		 * future txg.
215168404Spjd		 */
216168404Spjd		mutex_enter(&child->db_mtx);
217168404Spjd		buf = child->db.db_data;
218168404Spjd		if (buf != NULL && child->db_state != DB_FILL &&
219168404Spjd		    child->db_last_dirty == NULL) {
220168404Spjd			for (j = 0; j < child->db.db_size >> 3; j++) {
221168404Spjd				if (buf[j] != 0) {
222168404Spjd					panic("freed data not zero: "
223168404Spjd					    "child=%p i=%d off=%d num=%d\n",
224185029Spjd					    (void *)child, i, off, num);
225168404Spjd				}
226168404Spjd			}
227168404Spjd		}
228168404Spjd		mutex_exit(&child->db_mtx);
229168404Spjd
230168404Spjd		dbuf_rele(child, FTAG);
231168404Spjd	}
232219089Spjd	DB_DNODE_EXIT(db);
233168404Spjd}
234168404Spjd#endif
235168404Spjd
/*
 * Free the blocks in the range [blkid, blkid + nblks) that fall under the
 * indirect dbuf "db", recursing through lower-level indirect blocks.  If
 * afterwards every pointer in this indirect block is a hole, free the
 * indirect block itself.
 */
static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	blkptr_t *bp;
	dmu_buf_impl_t *subdb;
	uint64_t start, end, dbstart, dbend, i;
	int epbs, shift;

	/*
	 * There is a small possibility that this block will not be cached:
	 *   1 - if level > 1 and there are no children with level <= 1
	 *   2 - if this block was evicted since we read it from
	 *	 dmu_tx_hold_free().
	 */
	if (db->db_state != DB_CACHED)
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);

	dbuf_release_bp(db);
	bp = db->db.db_data;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	/* number of level-0 blocks spanned by each child of this dbuf */
	shift = (db->db_level - 1) * epbs;
	dbstart = db->db_blkid << epbs;
	/* clamp [start, end] (child block ids) to this dbuf's coverage */
	start = blkid >> shift;
	if (dbstart < start) {
		bp += start - dbstart;
	} else {
		start = dbstart;
	}
	dbend = ((db->db_blkid + 1) << epbs) - 1;
	end = (blkid + nblks - 1) >> shift;
	if (dbend <= end)
		end = dbend;

	ASSERT3U(start, <=, end);

	if (db->db_level == 1) {
		/* children are data blocks; free them directly */
		FREE_VERIFY(db, start, end, tx);
		free_blocks(dn, bp, end-start+1, tx);
	} else {
		/* children are indirect blocks; recurse into each */
		for (i = start; i <= end; i++, bp++) {
			if (BP_IS_HOLE(bp))
				continue;
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
			    i, B_TRUE, FTAG, &subdb));
			rw_exit(&dn->dn_struct_rwlock);
			ASSERT3P(bp, ==, subdb->db_blkptr);

			free_children(subdb, blkid, nblks, tx);
			dbuf_rele(subdb, FTAG);
		}
	}

	/* If this whole block is free, free ourself too. */
	for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
		if (!BP_IS_HOLE(bp))
			break;
	}
	if (i == 1 << epbs) {
		/* didn't find any non-holes */
		bzero(db->db.db_data, db->db.db_size);
		free_blocks(dn, db->db_blkptr, 1, tx);
	} else {
		/*
		 * Partial block free; must be marked dirty so that it
		 * will be written out.
		 */
		ASSERT(db->db_dirtycnt > 0);
	}

	DB_DNODE_EXIT(db);
	arc_buf_freeze(db->db_buf);
}
314168404Spjd
315168404Spjd/*
316251631Sdelphij * Traverse the indicated range of the provided file
317168404Spjd * and "free" all the blocks contained there.
318168404Spjd */
/*
 * Traverse the indicated range of the provided file
 * and "free" all the blocks contained there.  If the range extends past
 * the end of the file, shrink dn_maxblkid accordingly.
 */
static void
dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
    dmu_tx_t *tx)
{
	blkptr_t *bp = dn->dn_phys->dn_blkptr;
	int dnlevel = dn->dn_phys->dn_nlevels;
	boolean_t trunc = B_FALSE;

	/* entirely past the end of the file: nothing to do */
	if (blkid > dn->dn_phys->dn_maxblkid)
		return;

	ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
	/* clamp the range at the last block; remember that we truncated */
	if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
		nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
		trunc = B_TRUE;
	}

	/* There are no indirect blocks in the object */
	if (dnlevel == 1) {
		if (blkid >= dn->dn_phys->dn_nblkptr) {
			/* this range was never made persistent */
			return;
		}
		ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
		free_blocks(dn, bp + blkid, nblks, tx);
	} else {
		/* level-0 blocks covered by each top-level pointer */
		int shift = (dnlevel - 1) *
		    (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
		int start = blkid >> shift;
		int end = (blkid + nblks - 1) >> shift;
		dmu_buf_impl_t *db;

		ASSERT(start < dn->dn_phys->dn_nblkptr);
		bp += start;
		for (int i = start; i <= end; i++, bp++) {
			if (BP_IS_HOLE(bp))
				continue;
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
			    TRUE, FTAG, &db));
			rw_exit(&dn->dn_struct_rwlock);

			free_children(db, blkid, nblks, tx);
			dbuf_rele(db, FTAG);
		}
	}

	if (trunc) {
		/* new last block is the one just before the freed range */
		dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;

		/* verify no allocated data remains past the new end */
		uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
		    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
		ASSERT(off < dn->dn_phys->dn_maxblkid ||
		    dn->dn_phys->dn_maxblkid == 0 ||
		    dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
	}
}
376168404Spjd
/* Argument bundle for dnode_sync_free_range(), the range_tree callback. */
typedef struct dnode_sync_free_range_arg {
	dnode_t *dsfra_dnode;	/* dnode whose range is being freed */
	dmu_tx_t *dsfra_tx;	/* syncing transaction */
} dnode_sync_free_range_arg_t;
381265740Sdelphij
/*
 * range_tree_vacate() callback: free one range of blocks.  The caller
 * holds dn_mtx; drop it around the actual free (which takes other locks
 * and does real work) and reacquire it before returning, as the
 * range-tree iteration expects.
 */
static void
dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
{
	dnode_sync_free_range_arg_t *dsfra = arg;
	dnode_t *dn = dsfra->dsfra_dnode;

	mutex_exit(&dn->dn_mtx);
	dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
	mutex_enter(&dn->dn_mtx);
}
392265740Sdelphij
393168404Spjd/*
394251631Sdelphij * Try to kick all the dnode's dbufs out of the cache...
395168404Spjd */
/*
 * Try to kick all the dnode's dbufs out of the cache...  Makes repeated
 * passes over dn_dbufs, clearing hold-free dbufs, until a pass makes no
 * progress; finally evicts the bonus buffer if it is unheld.
 */
void
dnode_evict_dbufs(dnode_t *dn)
{
	int progress;
	int pass = 0;

	do {
		dmu_buf_impl_t *db, *db_next;
		int evicting = FALSE;

		progress = FALSE;
		mutex_enter(&dn->dn_dbufs_mtx);
		for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
			/* grab the successor before db can go away */
			db_next = AVL_NEXT(&dn->dn_dbufs, db);
#ifdef	DEBUG
			DB_DNODE_ENTER(db);
			ASSERT3P(DB_DNODE(db), ==, dn);
			DB_DNODE_EXIT(db);
#endif	/* DEBUG */

			mutex_enter(&db->db_mtx);
			if (db->db_state == DB_EVICTING) {
				/* another thread is mid-eviction; wait below */
				progress = TRUE;
				evicting = TRUE;
				mutex_exit(&db->db_mtx);
			} else if (refcount_is_zero(&db->db_holds)) {
				progress = TRUE;
				dbuf_clear(db); /* exits db_mtx for us */
			} else {
				mutex_exit(&db->db_mtx);
			}

		}
		/*
		 * NB: we need to drop dn_dbufs_mtx between passes so
		 * that any DB_EVICTING dbufs can make progress.
		 * Ideally, we would have some cv we could wait on, but
		 * since we don't, just wait a bit to give the other
		 * thread a chance to run.
		 */
		mutex_exit(&dn->dn_dbufs_mtx);
		if (evicting)
			delay(1);
		pass++;
		ASSERT(pass < 100); /* sanity check */
	} while (progress);

	/* lastly, evict the bonus buffer if nobody holds it */
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
		mutex_enter(&dn->dn_bonus->db_mtx);
		dbuf_evict(dn->dn_bonus);
		dn->dn_bonus = NULL;
	}
	rw_exit(&dn->dn_struct_rwlock);
}
451168404Spjd
452168404Spjdstatic void
453168404Spjddnode_undirty_dbufs(list_t *list)
454168404Spjd{
455168404Spjd	dbuf_dirty_record_t *dr;
456168404Spjd
457168404Spjd	while (dr = list_head(list)) {
458168404Spjd		dmu_buf_impl_t *db = dr->dr_dbuf;
459168404Spjd		uint64_t txg = dr->dr_txg;
460168404Spjd
461219089Spjd		if (db->db_level != 0)
462219089Spjd			dnode_undirty_dbufs(&dr->dt.di.dr_children);
463219089Spjd
464168404Spjd		mutex_enter(&db->db_mtx);
465168404Spjd		/* XXX - use dbuf_undirty()? */
466168404Spjd		list_remove(list, dr);
467168404Spjd		ASSERT(db->db_last_dirty == dr);
468168404Spjd		db->db_last_dirty = NULL;
469168404Spjd		db->db_dirtycnt -= 1;
470168404Spjd		if (db->db_level == 0) {
471219089Spjd			ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
472168404Spjd			    dr->dt.dl.dr_data == db->db_buf);
473168404Spjd			dbuf_unoverride(dr);
474168404Spjd		} else {
475269218Sdelphij			mutex_destroy(&dr->dt.di.dr_mtx);
476169325Spjd			list_destroy(&dr->dt.di.dr_children);
477168404Spjd		}
478168404Spjd		kmem_free(dr, sizeof (dbuf_dirty_record_t));
479219089Spjd		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
480168404Spjd	}
481168404Spjd}
482168404Spjd
/*
 * Syncing-context teardown of a dnode whose object has been freed:
 * discard dirty state, evict dbufs, zero the on-disk dnode, and reset the
 * in-core dnode so the object number can be reallocated.
 */
static void
dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
{
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(dmu_tx_is_syncing(tx));

	/*
	 * Our contents should have been freed in dnode_sync() by the
	 * free range record inserted by the caller of dnode_free().
	 */
	ASSERT0(DN_USED_BYTES(dn->dn_phys));
	ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));

	dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
	dnode_evict_dbufs(dn);
	ASSERT(avl_is_empty(&dn->dn_dbufs));
	ASSERT3P(dn->dn_bonus, ==, NULL);

	/*
	 * XXX - It would be nice to assert this, but we may still
	 * have residual holds from async evictions from the arc...
	 *
	 * zfs_obj_to_path() also depends on this being
	 * commented out.
	 *
	 * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
	 */

	/* Undirty next bits */
	dn->dn_next_nlevels[txgoff] = 0;
	dn->dn_next_indblkshift[txgoff] = 0;
	dn->dn_next_blksz[txgoff] = 0;

	/* ASSERT(blkptrs are zero); */
	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
	ASSERT(dn->dn_type != DMU_OT_NONE);

	ASSERT(dn->dn_free_txg > 0);
	/* if freed in a later txg than allocated, the dnode block is dirty */
	if (dn->dn_allocated_txg != dn->dn_free_txg)
		dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
	bzero(dn->dn_phys, sizeof (dnode_phys_t));

	mutex_enter(&dn->dn_mtx);
	dn->dn_type = DMU_OT_NONE;
	dn->dn_maxblkid = 0;
	dn->dn_allocated_txg = 0;
	dn->dn_free_txg = 0;
	dn->dn_have_spill = B_FALSE;
	mutex_exit(&dn->dn_mtx);

	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);

	dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
	/*
	 * Now that we've released our hold, the dnode may
	 * be evicted, so we musn't access it.
	 */
}
542168404Spjd
543168404Spjd/*
544168404Spjd * Write out the dnode's dirty buffers.
545168404Spjd */
/*
 * Write out the dnode's dirty buffers.  Applies every pending open-context
 * change recorded in the dn_next_*[txgoff] arrays to the on-disk dnode
 * (dn_phys), frees any ranges queued in dn_free_ranges, handles object
 * and spill-block removal, then syncs the dirty record list.
 */
void
dnode_sync(dnode_t *dn, dmu_tx_t *tx)
{
	dnode_phys_t *dnp = dn->dn_phys;
	int txgoff = tx->tx_txg & TXG_MASK;
	list_t *list = &dn->dn_dirty_records[txgoff];
	static const dnode_phys_t zerodn = { 0 };
	boolean_t kill_spill = B_FALSE;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
	ASSERT(dnp->dn_type != DMU_OT_NONE ||
	    bcmp(dnp, &zerodn, DNODE_SIZE) == 0);
	DNODE_VERIFY(dn);

	ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));

	/* capture old space/flags for user-used accounting updates */
	if (dmu_objset_userused_enabled(dn->dn_objset) &&
	    !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
		mutex_enter(&dn->dn_mtx);
		dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
		dn->dn_oldflags = dn->dn_phys->dn_flags;
		dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
		mutex_exit(&dn->dn_mtx);
		dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
	} else {
		/* Once we account for it, we should always account for it. */
		ASSERT(!(dn->dn_phys->dn_flags &
		    DNODE_FLAG_USERUSED_ACCOUNTED));
	}

	mutex_enter(&dn->dn_mtx);
	if (dn->dn_allocated_txg == tx->tx_txg) {
		/* The dnode is newly allocated or reallocated */
		if (dnp->dn_type == DMU_OT_NONE) {
			/* this is a first alloc, not a realloc */
			dnp->dn_nlevels = 1;
			dnp->dn_nblkptr = dn->dn_nblkptr;
		}

		dnp->dn_type = dn->dn_type;
		dnp->dn_bonustype = dn->dn_bonustype;
		dnp->dn_bonuslen = dn->dn_bonuslen;
	}
	/* sanity: blkptr[0] lsize must match the data block size... */
	ASSERT(dnp->dn_nlevels > 1 ||
	    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
	    BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
	    BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
	/* ... or the indirect block size when there are indirect levels */
	ASSERT(dnp->dn_nlevels < 2 ||
	    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
	    BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);

	/* apply a pending type change */
	if (dn->dn_next_type[txgoff] != 0) {
		dnp->dn_type = dn->dn_type;
		dn->dn_next_type[txgoff] = 0;
	}

	/* apply a pending data block size change */
	if (dn->dn_next_blksz[txgoff] != 0) {
		ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
		    SPA_MINBLOCKSIZE) == 0);
		ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
		    dn->dn_maxblkid == 0 || list_head(list) != NULL ||
		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
		    dnp->dn_datablkszsec ||
		    range_tree_space(dn->dn_free_ranges[txgoff]) != 0);
		dnp->dn_datablkszsec =
		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
		dn->dn_next_blksz[txgoff] = 0;
	}

	/* apply a pending bonus length change (DN_ZERO_BONUSLEN means 0) */
	if (dn->dn_next_bonuslen[txgoff] != 0) {
		if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
			dnp->dn_bonuslen = 0;
		else
			dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
		ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
		dn->dn_next_bonuslen[txgoff] = 0;
	}

	/* apply a pending bonus type change */
	if (dn->dn_next_bonustype[txgoff] != 0) {
		ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
		dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
		dn->dn_next_bonustype[txgoff] = 0;
	}

	boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
	    dn->dn_free_txg <= tx->tx_txg;

	/*
	 * We will either remove a spill block when a file is being removed
	 * or we have been asked to remove it.
	 */
	if (dn->dn_rm_spillblk[txgoff] ||
	    ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && freeing_dnode)) {
		if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
			kill_spill = B_TRUE;
		dn->dn_rm_spillblk[txgoff] = 0;
	}

	/* apply a pending indirect block shift change (no indirects yet) */
	if (dn->dn_next_indblkshift[txgoff] != 0) {
		ASSERT(dnp->dn_nlevels == 1);
		dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
		dn->dn_next_indblkshift[txgoff] = 0;
	}

	/*
	 * Just take the live (open-context) values for checksum and compress.
	 * Strictly speaking it's a future leak, but nothing bad happens if we
	 * start using the new checksum or compress algorithm a little early.
	 */
	dnp->dn_checksum = dn->dn_checksum;
	dnp->dn_compress = dn->dn_compress;

	mutex_exit(&dn->dn_mtx);

	if (kill_spill) {
		free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
		mutex_enter(&dn->dn_mtx);
		dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/* process all the "freed" ranges in the file */
	if (dn->dn_free_ranges[txgoff] != NULL) {
		dnode_sync_free_range_arg_t dsfra;
		dsfra.dsfra_dnode = dn;
		dsfra.dsfra_tx = tx;
		/* dn_mtx is dropped per-range inside the callback */
		mutex_enter(&dn->dn_mtx);
		range_tree_vacate(dn->dn_free_ranges[txgoff],
		    dnode_sync_free_range, &dsfra);
		range_tree_destroy(dn->dn_free_ranges[txgoff]);
		dn->dn_free_ranges[txgoff] = NULL;
		mutex_exit(&dn->dn_mtx);
	}

	if (freeing_dnode) {
		/* dnode_sync_free() releases our hold; don't touch dn after */
		dnode_sync_free(dn, tx);
		return;
	}

	if (dn->dn_next_nlevels[txgoff]) {
		dnode_increase_indirection(dn, tx);
		dn->dn_next_nlevels[txgoff] = 0;
	}

	if (dn->dn_next_nblkptr[txgoff]) {
		/* this should only happen on a realloc */
		ASSERT(dn->dn_allocated_txg == tx->tx_txg);
		if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
			/* zero the new blkptrs we are gaining */
			bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
			    sizeof (blkptr_t) *
			    (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
#ifdef ZFS_DEBUG
		} else {
			int i;
			ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
			/* the blkptrs we are losing better be unallocated */
			for (i = dn->dn_next_nblkptr[txgoff];
			    i < dnp->dn_nblkptr; i++)
				ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i]));
#endif
		}
		mutex_enter(&dn->dn_mtx);
		dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff];
		dn->dn_next_nblkptr[txgoff] = 0;
		mutex_exit(&dn->dn_mtx);
	}

	dbuf_sync_list(list, tx);

	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
		ASSERT3P(list_head(list), ==, NULL);
		dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
	}

	/*
	 * Although we have dropped our reference to the dnode, it
	 * can't be evicted until its written, and we haven't yet
	 * initiated the IO for the dnode's dbuf.
	 */
}
729