dsl_dataset.c revision 330590
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
25 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
26 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
27 * Copyright (c) 2014 RackTop Systems.
28 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
29 * Copyright (c) 2014 Integros [integros.com]
30 * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved.
31 * Copyright 2017 Nexenta Systems, Inc.
32 */
33
34#include <sys/dmu_objset.h>
35#include <sys/dsl_dataset.h>
36#include <sys/dsl_dir.h>
37#include <sys/dsl_prop.h>
38#include <sys/dsl_synctask.h>
39#include <sys/dmu_traverse.h>
40#include <sys/dmu_impl.h>
41#include <sys/dmu_send.h>
42#include <sys/dmu_tx.h>
43#include <sys/arc.h>
44#include <sys/zio.h>
45#include <sys/zap.h>
46#include <sys/zfeature.h>
47#include <sys/unique.h>
48#include <sys/zfs_context.h>
49#include <sys/zfs_ioctl.h>
50#include <sys/spa.h>
51#include <sys/zfs_znode.h>
52#include <sys/zfs_onexit.h>
53#include <sys/zvol.h>
54#include <sys/dsl_scan.h>
55#include <sys/dsl_deadlist.h>
56#include <sys/dsl_destroy.h>
57#include <sys/dsl_userhold.h>
58#include <sys/dsl_bookmark.h>
59#include <sys/dmu_send.h>
60#include <sys/zio_checksum.h>
61#include <sys/zio_compress.h>
62#include <zfs_fletcher.h>
63
/* Declare the _vfs_zfs sysctl node that the tunable below is attached to. */
SYSCTL_DECL(_vfs_zfs);

/*
 * The SPA supports block sizes up to 16MB.  However, very large blocks
 * can have an impact on i/o latency (e.g. tying up a spinning disk for
 * ~300ms), and also potentially on the memory allocator.  Therefore,
 * we do not allow the recordsize to be set larger than zfs_max_recordsize
 * (default 1MB).  Larger blocks can be created by changing this tunable,
 * and pools with larger blocks can always be imported and used, regardless
 * of this setting.
 */
int zfs_max_recordsize = 1 * 1024 * 1024;
/* Publish zfs_max_recordsize as the "max_recordsize" entry under _vfs_zfs. */
SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
    &zfs_max_recordsize, 0,
    "Maximum block size.  Expect dragons when tuning this.");
79
/*
 * Exchange the values of two uint64_t lvalues.
 *
 * Both arguments are expanded twice, so they must not have side effects.
 * The body is wrapped in do { } while (0) so that "SWITCH64(a, b);" is a
 * single statement and can be used safely in an unbraced if/else.
 */
#define	SWITCH64(x, y) \
	do { \
		uint64_t switch64_tmp = (x); \
		(x) = (y); \
		(y) = switch64_tmp; \
	} while (0)
86
/* Upper bound constant (2^62); its users are not visible in this part of the file. */
#define	DS_REF_MAX	(1ULL << 62)

/*
 * NOTE(review): presumably this requests an out-of-line definition of the
 * inline accessor declared in the dsl_dataset header -- confirm.
 */
extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);

/* Defined elsewhere; not referenced in the visible part of this file. */
extern int spa_asize_inflation;

/*
 * Zero-initialized ZIL header that dsl_dataset_zero_zil() compares against
 * to detect a non-empty header.
 */
static zil_header_t zero_zil;
94
95/*
96 * Figure out how much of this delta should be propogated to the dsl_dir
97 * layer.  If there's a refreservation, that space has already been
98 * partially accounted for in our ancestors.
99 */
100static int64_t
101parent_delta(dsl_dataset_t *ds, int64_t delta)
102{
103	dsl_dataset_phys_t *ds_phys;
104	uint64_t old_bytes, new_bytes;
105
106	if (ds->ds_reserved == 0)
107		return (delta);
108
109	ds_phys = dsl_dataset_phys(ds);
110	old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
111	new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
112
113	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
114	return (new_bytes - old_bytes);
115}
116
117void
118dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
119{
120	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
121	int compressed = BP_GET_PSIZE(bp);
122	int uncompressed = BP_GET_UCSIZE(bp);
123	int64_t delta;
124
125	dprintf_bp(bp, "ds=%p", ds);
126
127	ASSERT(dmu_tx_is_syncing(tx));
128	/* It could have been compressed away to nothing */
129	if (BP_IS_HOLE(bp))
130		return;
131	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
132	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
133	if (ds == NULL) {
134		dsl_pool_mos_diduse_space(tx->tx_pool,
135		    used, compressed, uncompressed);
136		return;
137	}
138
139	ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
140	dmu_buf_will_dirty(ds->ds_dbuf, tx);
141	mutex_enter(&ds->ds_lock);
142	delta = parent_delta(ds, used);
143	dsl_dataset_phys(ds)->ds_referenced_bytes += used;
144	dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
145	dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
146	dsl_dataset_phys(ds)->ds_unique_bytes += used;
147
148	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
149		ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
150		    B_TRUE;
151	}
152
153	spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
154	if (f != SPA_FEATURE_NONE)
155		ds->ds_feature_activation_needed[f] = B_TRUE;
156
157	mutex_exit(&ds->ds_lock);
158	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
159	    compressed, uncompressed, tx);
160	dsl_dir_transfer_space(ds->ds_dir, used - delta,
161	    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
162}
163
164int
165dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
166    boolean_t async)
167{
168	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
169	int compressed = BP_GET_PSIZE(bp);
170	int uncompressed = BP_GET_UCSIZE(bp);
171
172	if (BP_IS_HOLE(bp))
173		return (0);
174
175	ASSERT(dmu_tx_is_syncing(tx));
176	ASSERT(bp->blk_birth <= tx->tx_txg);
177
178	if (ds == NULL) {
179		dsl_free(tx->tx_pool, tx->tx_txg, bp);
180		dsl_pool_mos_diduse_space(tx->tx_pool,
181		    -used, -compressed, -uncompressed);
182		return (used);
183	}
184	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
185
186	ASSERT(!ds->ds_is_snapshot);
187	dmu_buf_will_dirty(ds->ds_dbuf, tx);
188
189	if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
190		int64_t delta;
191
192		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
193		dsl_free(tx->tx_pool, tx->tx_txg, bp);
194
195		mutex_enter(&ds->ds_lock);
196		ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
197		    !DS_UNIQUE_IS_ACCURATE(ds));
198		delta = parent_delta(ds, -used);
199		dsl_dataset_phys(ds)->ds_unique_bytes -= used;
200		mutex_exit(&ds->ds_lock);
201		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
202		    delta, -compressed, -uncompressed, tx);
203		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
204		    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
205	} else {
206		dprintf_bp(bp, "putting on dead list: %s", "");
207		if (async) {
208			/*
209			 * We are here as part of zio's write done callback,
210			 * which means we're a zio interrupt thread.  We can't
211			 * call dsl_deadlist_insert() now because it may block
212			 * waiting for I/O.  Instead, put bp on the deferred
213			 * queue and let dsl_pool_sync() finish the job.
214			 */
215			bplist_append(&ds->ds_pending_deadlist, bp);
216		} else {
217			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
218		}
219		ASSERT3U(ds->ds_prev->ds_object, ==,
220		    dsl_dataset_phys(ds)->ds_prev_snap_obj);
221		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
222		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
223		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
224		    ds->ds_object && bp->blk_birth >
225		    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
226			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
227			mutex_enter(&ds->ds_prev->ds_lock);
228			dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
229			mutex_exit(&ds->ds_prev->ds_lock);
230		}
231		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
232			dsl_dir_transfer_space(ds->ds_dir, used,
233			    DD_USED_HEAD, DD_USED_SNAP, tx);
234		}
235	}
236	mutex_enter(&ds->ds_lock);
237	ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
238	dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
239	ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
240	dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
241	ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
242	dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
243	mutex_exit(&ds->ds_lock);
244
245	return (used);
246}
247
248/*
249 * We have to release the fsid syncronously or we risk that a subsequent
250 * mount of the same dataset will fail to unique_insert the fsid.  This
251 * failure would manifest itself as the fsid of this dataset changing
252 * between mounts which makes NFS clients quite unhappy.
253 */
254static void
255dsl_dataset_evict_sync(void *dbu)
256{
257	dsl_dataset_t *ds = dbu;
258
259	ASSERT(ds->ds_owner == NULL);
260
261	unique_remove(ds->ds_fsid_guid);
262}
263
264static void
265dsl_dataset_evict_async(void *dbu)
266{
267	dsl_dataset_t *ds = dbu;
268
269	ASSERT(ds->ds_owner == NULL);
270
271	ds->ds_dbuf = NULL;
272
273	if (ds->ds_objset != NULL)
274		dmu_objset_evict(ds->ds_objset);
275
276	if (ds->ds_prev) {
277		dsl_dataset_rele(ds->ds_prev, ds);
278		ds->ds_prev = NULL;
279	}
280
281	bplist_destroy(&ds->ds_pending_deadlist);
282	if (ds->ds_deadlist.dl_os != NULL)
283		dsl_deadlist_close(&ds->ds_deadlist);
284	if (ds->ds_dir)
285		dsl_dir_async_rele(ds->ds_dir, ds);
286
287	ASSERT(!list_link_active(&ds->ds_synced_link));
288
289	list_destroy(&ds->ds_prop_cbs);
290	if (mutex_owned(&ds->ds_lock))
291		mutex_exit(&ds->ds_lock);
292	mutex_destroy(&ds->ds_lock);
293	if (mutex_owned(&ds->ds_opening_lock))
294		mutex_exit(&ds->ds_opening_lock);
295	mutex_destroy(&ds->ds_opening_lock);
296	mutex_destroy(&ds->ds_sendstream_lock);
297	refcount_destroy(&ds->ds_longholds);
298	rrw_destroy(&ds->ds_bp_rwlock);
299
300	kmem_free(ds, sizeof (dsl_dataset_t));
301}
302
303int
304dsl_dataset_get_snapname(dsl_dataset_t *ds)
305{
306	dsl_dataset_phys_t *headphys;
307	int err;
308	dmu_buf_t *headdbuf;
309	dsl_pool_t *dp = ds->ds_dir->dd_pool;
310	objset_t *mos = dp->dp_meta_objset;
311
312	if (ds->ds_snapname[0])
313		return (0);
314	if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
315		return (0);
316
317	err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
318	    FTAG, &headdbuf);
319	if (err != 0)
320		return (err);
321	headphys = headdbuf->db_data;
322	err = zap_value_search(dp->dp_meta_objset,
323	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
324	dmu_buf_rele(headdbuf, FTAG);
325	return (err);
326}
327
328int
329dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
330{
331	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
332	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
333	matchtype_t mt = 0;
334	int err;
335
336	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
337		mt = MT_NORMALIZE;
338
339	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
340	    value, mt, NULL, 0, NULL);
341	if (err == ENOTSUP && (mt & MT_NORMALIZE))
342		err = zap_lookup(mos, snapobj, name, 8, 1, value);
343	return (err);
344}
345
346int
347dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
348    boolean_t adj_cnt)
349{
350	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
351	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
352	matchtype_t mt = 0;
353	int err;
354
355	dsl_dir_snap_cmtime_update(ds->ds_dir);
356
357	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
358		mt = MT_NORMALIZE;
359
360	err = zap_remove_norm(mos, snapobj, name, mt, tx);
361	if (err == ENOTSUP && (mt & MT_NORMALIZE))
362		err = zap_remove(mos, snapobj, name, tx);
363
364	if (err == 0 && adj_cnt)
365		dsl_fs_ss_count_adjust(ds->ds_dir, -1,
366		    DD_FIELD_SNAPSHOT_COUNT, tx);
367
368	return (err);
369}
370
371boolean_t
372dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
373{
374	dmu_buf_t *dbuf = ds->ds_dbuf;
375	boolean_t result = B_FALSE;
376
377	if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
378	    ds->ds_object, DMU_BONUS_BLKID, tag)) {
379
380		if (ds == dmu_buf_get_user(dbuf))
381			result = B_TRUE;
382		else
383			dmu_buf_rele(dbuf, tag);
384	}
385
386	return (result);
387}
388
389int
390dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
391    dsl_dataset_t **dsp)
392{
393	objset_t *mos = dp->dp_meta_objset;
394	dmu_buf_t *dbuf;
395	dsl_dataset_t *ds;
396	int err;
397	dmu_object_info_t doi;
398
399	ASSERT(dsl_pool_config_held(dp));
400
401	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
402	if (err != 0)
403		return (err);
404
405	/* Make sure dsobj has the correct object type. */
406	dmu_object_info_from_db(dbuf, &doi);
407	if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
408		dmu_buf_rele(dbuf, tag);
409		return (SET_ERROR(EINVAL));
410	}
411
412	ds = dmu_buf_get_user(dbuf);
413	if (ds == NULL) {
414		dsl_dataset_t *winner = NULL;
415
416		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
417		ds->ds_dbuf = dbuf;
418		ds->ds_object = dsobj;
419		ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
420
421		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
422		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
423		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
424		rrw_init(&ds->ds_bp_rwlock, B_FALSE);
425		refcount_create(&ds->ds_longholds);
426
427		bplist_create(&ds->ds_pending_deadlist);
428		dsl_deadlist_open(&ds->ds_deadlist,
429		    mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
430
431		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
432		    offsetof(dmu_sendarg_t, dsa_link));
433
434		list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
435		    offsetof(dsl_prop_cb_record_t, cbr_ds_node));
436
437		if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
438			for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
439				if (!(spa_feature_table[f].fi_flags &
440				    ZFEATURE_FLAG_PER_DATASET))
441					continue;
442				err = zap_contains(mos, dsobj,
443				    spa_feature_table[f].fi_guid);
444				if (err == 0) {
445					ds->ds_feature_inuse[f] = B_TRUE;
446				} else {
447					ASSERT3U(err, ==, ENOENT);
448					err = 0;
449				}
450			}
451		}
452
453		err = dsl_dir_hold_obj(dp,
454		    dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir);
455		if (err != 0) {
456			mutex_destroy(&ds->ds_lock);
457			mutex_destroy(&ds->ds_opening_lock);
458			mutex_destroy(&ds->ds_sendstream_lock);
459			refcount_destroy(&ds->ds_longholds);
460			bplist_destroy(&ds->ds_pending_deadlist);
461			dsl_deadlist_close(&ds->ds_deadlist);
462			kmem_free(ds, sizeof (dsl_dataset_t));
463			dmu_buf_rele(dbuf, tag);
464			return (err);
465		}
466
467		if (!ds->ds_is_snapshot) {
468			ds->ds_snapname[0] = '\0';
469			if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
470				err = dsl_dataset_hold_obj(dp,
471				    dsl_dataset_phys(ds)->ds_prev_snap_obj,
472				    ds, &ds->ds_prev);
473			}
474			if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
475				int zaperr = zap_lookup(mos, ds->ds_object,
476				    DS_FIELD_BOOKMARK_NAMES,
477				    sizeof (ds->ds_bookmarks), 1,
478				    &ds->ds_bookmarks);
479				if (zaperr != ENOENT)
480					VERIFY0(zaperr);
481			}
482		} else {
483			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
484				err = dsl_dataset_get_snapname(ds);
485			if (err == 0 &&
486			    dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
487				err = zap_count(
488				    ds->ds_dir->dd_pool->dp_meta_objset,
489				    dsl_dataset_phys(ds)->ds_userrefs_obj,
490				    &ds->ds_userrefs);
491			}
492		}
493
494		if (err == 0 && !ds->ds_is_snapshot) {
495			err = dsl_prop_get_int_ds(ds,
496			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
497			    &ds->ds_reserved);
498			if (err == 0) {
499				err = dsl_prop_get_int_ds(ds,
500				    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
501				    &ds->ds_quota);
502			}
503		} else {
504			ds->ds_reserved = ds->ds_quota = 0;
505		}
506
507		dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync,
508		    dsl_dataset_evict_async, &ds->ds_dbuf);
509		if (err == 0)
510			winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
511
512		if (err != 0 || winner != NULL) {
513			bplist_destroy(&ds->ds_pending_deadlist);
514			dsl_deadlist_close(&ds->ds_deadlist);
515			if (ds->ds_prev)
516				dsl_dataset_rele(ds->ds_prev, ds);
517			dsl_dir_rele(ds->ds_dir, ds);
518			mutex_destroy(&ds->ds_lock);
519			mutex_destroy(&ds->ds_opening_lock);
520			mutex_destroy(&ds->ds_sendstream_lock);
521			refcount_destroy(&ds->ds_longholds);
522			kmem_free(ds, sizeof (dsl_dataset_t));
523			if (err != 0) {
524				dmu_buf_rele(dbuf, tag);
525				return (err);
526			}
527			ds = winner;
528		} else {
529			ds->ds_fsid_guid =
530			    unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
531			if (ds->ds_fsid_guid !=
532			    dsl_dataset_phys(ds)->ds_fsid_guid) {
533				zfs_dbgmsg("ds_fsid_guid changed from "
534				    "%llx to %llx for pool %s dataset id %llu",
535				    (long long)
536				    dsl_dataset_phys(ds)->ds_fsid_guid,
537				    (long long)ds->ds_fsid_guid,
538				    spa_name(dp->dp_spa),
539				    dsobj);
540			}
541		}
542	}
543	ASSERT3P(ds->ds_dbuf, ==, dbuf);
544	ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
545	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
546	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
547	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
548	*dsp = ds;
549	return (0);
550}
551
552int
553dsl_dataset_hold(dsl_pool_t *dp, const char *name,
554    void *tag, dsl_dataset_t **dsp)
555{
556	dsl_dir_t *dd;
557	const char *snapname;
558	uint64_t obj;
559	int err = 0;
560	dsl_dataset_t *ds;
561
562	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
563	if (err != 0)
564		return (err);
565
566	ASSERT(dsl_pool_config_held(dp));
567	obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
568	if (obj != 0)
569		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
570	else
571		err = SET_ERROR(ENOENT);
572
573	/* we may be looking for a snapshot */
574	if (err == 0 && snapname != NULL) {
575		dsl_dataset_t *snap_ds;
576
577		if (*snapname++ != '@') {
578			dsl_dataset_rele(ds, tag);
579			dsl_dir_rele(dd, FTAG);
580			return (SET_ERROR(ENOENT));
581		}
582
583		dprintf("looking for snapshot '%s'\n", snapname);
584		err = dsl_dataset_snap_lookup(ds, snapname, &obj);
585		if (err == 0)
586			err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
587		dsl_dataset_rele(ds, tag);
588
589		if (err == 0) {
590			mutex_enter(&snap_ds->ds_lock);
591			if (snap_ds->ds_snapname[0] == 0)
592				(void) strlcpy(snap_ds->ds_snapname, snapname,
593				    sizeof (snap_ds->ds_snapname));
594			mutex_exit(&snap_ds->ds_lock);
595			ds = snap_ds;
596		}
597	}
598	if (err == 0)
599		*dsp = ds;
600	dsl_dir_rele(dd, FTAG);
601	return (err);
602}
603
604int
605dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
606    void *tag, dsl_dataset_t **dsp)
607{
608	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
609	if (err != 0)
610		return (err);
611	if (!dsl_dataset_tryown(*dsp, tag)) {
612		dsl_dataset_rele(*dsp, tag);
613		*dsp = NULL;
614		return (SET_ERROR(EBUSY));
615	}
616	return (0);
617}
618
619int
620dsl_dataset_own(dsl_pool_t *dp, const char *name,
621    void *tag, dsl_dataset_t **dsp)
622{
623	int err = dsl_dataset_hold(dp, name, tag, dsp);
624	if (err != 0)
625		return (err);
626	if (!dsl_dataset_tryown(*dsp, tag)) {
627		dsl_dataset_rele(*dsp, tag);
628		return (SET_ERROR(EBUSY));
629	}
630	return (0);
631}
632
633/*
634 * See the comment above dsl_pool_hold() for details.  In summary, a long
635 * hold is used to prevent destruction of a dataset while the pool hold
636 * is dropped, allowing other concurrent operations (e.g. spa_sync()).
637 *
638 * The dataset and pool must be held when this function is called.  After it
639 * is called, the pool hold may be released while the dataset is still held
640 * and accessed.
641 */
642void
643dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
644{
645	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
646	(void) refcount_add(&ds->ds_longholds, tag);
647}
648
649void
650dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
651{
652	(void) refcount_remove(&ds->ds_longholds, tag);
653}
654
655/* Return B_TRUE if there are any long holds on this dataset. */
656boolean_t
657dsl_dataset_long_held(dsl_dataset_t *ds)
658{
659	return (!refcount_is_zero(&ds->ds_longholds));
660}
661
662void
663dsl_dataset_name(dsl_dataset_t *ds, char *name)
664{
665	if (ds == NULL) {
666		(void) strcpy(name, "mos");
667	} else {
668		dsl_dir_name(ds->ds_dir, name);
669		VERIFY0(dsl_dataset_get_snapname(ds));
670		if (ds->ds_snapname[0]) {
671			VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN),
672			    <, ZFS_MAX_DATASET_NAME_LEN);
673			/*
674			 * We use a "recursive" mutex so that we
675			 * can call dprintf_ds() with ds_lock held.
676			 */
677			if (!MUTEX_HELD(&ds->ds_lock)) {
678				mutex_enter(&ds->ds_lock);
679				VERIFY3U(strlcat(name, ds->ds_snapname,
680				    ZFS_MAX_DATASET_NAME_LEN), <,
681				    ZFS_MAX_DATASET_NAME_LEN);
682				mutex_exit(&ds->ds_lock);
683			} else {
684				VERIFY3U(strlcat(name, ds->ds_snapname,
685				    ZFS_MAX_DATASET_NAME_LEN), <,
686				    ZFS_MAX_DATASET_NAME_LEN);
687			}
688		}
689	}
690}
691
692int
693dsl_dataset_namelen(dsl_dataset_t *ds)
694{
695	VERIFY0(dsl_dataset_get_snapname(ds));
696	mutex_enter(&ds->ds_lock);
697	int len = dsl_dir_namelen(ds->ds_dir) + 1 + strlen(ds->ds_snapname);
698	mutex_exit(&ds->ds_lock);
699	return (len);
700}
701
702void
703dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
704{
705	dmu_buf_rele(ds->ds_dbuf, tag);
706}
707
708void
709dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
710{
711	ASSERT3P(ds->ds_owner, ==, tag);
712	ASSERT(ds->ds_dbuf != NULL);
713
714	mutex_enter(&ds->ds_lock);
715	ds->ds_owner = NULL;
716	mutex_exit(&ds->ds_lock);
717	dsl_dataset_long_rele(ds, tag);
718	dsl_dataset_rele(ds, tag);
719}
720
721boolean_t
722dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
723{
724	boolean_t gotit = FALSE;
725
726	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
727	mutex_enter(&ds->ds_lock);
728	if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
729		ds->ds_owner = tag;
730		dsl_dataset_long_hold(ds, tag);
731		gotit = TRUE;
732	}
733	mutex_exit(&ds->ds_lock);
734	return (gotit);
735}
736
737boolean_t
738dsl_dataset_has_owner(dsl_dataset_t *ds)
739{
740	boolean_t rv;
741	mutex_enter(&ds->ds_lock);
742	rv = (ds->ds_owner != NULL);
743	mutex_exit(&ds->ds_lock);
744	return (rv);
745}
746
747static void
748dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
749{
750	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
751	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
752	uint64_t zero = 0;
753
754	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
755
756	spa_feature_incr(spa, f, tx);
757	dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
758
759	VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
760	    sizeof (zero), 1, &zero, tx));
761}
762
763void
764dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
765{
766	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
767	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
768
769	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
770
771	VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
772	spa_feature_decr(spa, f, tx);
773}
774
775uint64_t
776dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
777    uint64_t flags, dmu_tx_t *tx)
778{
779	dsl_pool_t *dp = dd->dd_pool;
780	dmu_buf_t *dbuf;
781	dsl_dataset_phys_t *dsphys;
782	uint64_t dsobj;
783	objset_t *mos = dp->dp_meta_objset;
784
785	if (origin == NULL)
786		origin = dp->dp_origin_snap;
787
788	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
789	ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
790	ASSERT(dmu_tx_is_syncing(tx));
791	ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
792
793	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
794	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
795	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
796	dmu_buf_will_dirty(dbuf, tx);
797	dsphys = dbuf->db_data;
798	bzero(dsphys, sizeof (dsl_dataset_phys_t));
799	dsphys->ds_dir_obj = dd->dd_object;
800	dsphys->ds_flags = flags;
801	dsphys->ds_fsid_guid = unique_create();
802	do {
803		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
804		    sizeof (dsphys->ds_guid));
805	} while (dsphys->ds_guid == 0);
806	dsphys->ds_snapnames_zapobj =
807	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
808	    DMU_OT_NONE, 0, tx);
809	dsphys->ds_creation_time = gethrestime_sec();
810	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
811
812	if (origin == NULL) {
813		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
814	} else {
815		dsl_dataset_t *ohds; /* head of the origin snapshot */
816
817		dsphys->ds_prev_snap_obj = origin->ds_object;
818		dsphys->ds_prev_snap_txg =
819		    dsl_dataset_phys(origin)->ds_creation_txg;
820		dsphys->ds_referenced_bytes =
821		    dsl_dataset_phys(origin)->ds_referenced_bytes;
822		dsphys->ds_compressed_bytes =
823		    dsl_dataset_phys(origin)->ds_compressed_bytes;
824		dsphys->ds_uncompressed_bytes =
825		    dsl_dataset_phys(origin)->ds_uncompressed_bytes;
826		rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG);
827		dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
828		rrw_exit(&origin->ds_bp_rwlock, FTAG);
829
830		/*
831		 * Inherit flags that describe the dataset's contents
832		 * (INCONSISTENT) or properties (Case Insensitive).
833		 */
834		dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
835		    (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
836
837		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
838			if (origin->ds_feature_inuse[f])
839				dsl_dataset_activate_feature(dsobj, f, tx);
840		}
841
842		dmu_buf_will_dirty(origin->ds_dbuf, tx);
843		dsl_dataset_phys(origin)->ds_num_children++;
844
845		VERIFY0(dsl_dataset_hold_obj(dp,
846		    dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
847		    FTAG, &ohds));
848		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
849		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
850		dsl_dataset_rele(ohds, FTAG);
851
852		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
853			if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
854				dsl_dataset_phys(origin)->ds_next_clones_obj =
855				    zap_create(mos,
856				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
857			}
858			VERIFY0(zap_add_int(mos,
859			    dsl_dataset_phys(origin)->ds_next_clones_obj,
860			    dsobj, tx));
861		}
862
863		dmu_buf_will_dirty(dd->dd_dbuf, tx);
864		dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
865		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
866			if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
867				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
868				dsl_dir_phys(origin->ds_dir)->dd_clones =
869				    zap_create(mos,
870				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
871			}
872			VERIFY0(zap_add_int(mos,
873			    dsl_dir_phys(origin->ds_dir)->dd_clones,
874			    dsobj, tx));
875		}
876	}
877
878	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
879		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
880
881	dmu_buf_rele(dbuf, FTAG);
882
883	dmu_buf_will_dirty(dd->dd_dbuf, tx);
884	dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
885
886	return (dsobj);
887}
888
889static void
890dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
891{
892	objset_t *os;
893
894	VERIFY0(dmu_objset_from_ds(ds, &os));
895	if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) {
896		dsl_pool_t *dp = ds->ds_dir->dd_pool;
897		zio_t *zio;
898
899		bzero(&os->os_zil_header, sizeof (os->os_zil_header));
900
901		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
902		dsl_dataset_sync(ds, zio, tx);
903		VERIFY0(zio_wait(zio));
904
905		/* dsl_dataset_sync_done will drop this reference. */
906		dmu_buf_add_ref(ds->ds_dbuf, ds);
907		dsl_dataset_sync_done(ds, tx);
908	}
909}
910
911uint64_t
912dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
913    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
914{
915	dsl_pool_t *dp = pdd->dd_pool;
916	uint64_t dsobj, ddobj;
917	dsl_dir_t *dd;
918
919	ASSERT(dmu_tx_is_syncing(tx));
920	ASSERT(lastname[0] != '@');
921
922	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
923	VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
924
925	dsobj = dsl_dataset_create_sync_dd(dd, origin,
926	    flags & ~DS_CREATE_FLAG_NODIRTY, tx);
927
928	dsl_deleg_set_create_perms(dd, tx, cr);
929
930	/*
931	 * Since we're creating a new node we know it's a leaf, so we can
932	 * initialize the counts if the limit feature is active.
933	 */
934	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
935		uint64_t cnt = 0;
936		objset_t *os = dd->dd_pool->dp_meta_objset;
937
938		dsl_dir_zapify(dd, tx);
939		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
940		    sizeof (cnt), 1, &cnt, tx));
941		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
942		    sizeof (cnt), 1, &cnt, tx));
943	}
944
945	dsl_dir_rele(dd, FTAG);
946
947	/*
948	 * If we are creating a clone, make sure we zero out any stale
949	 * data from the origin snapshots zil header.
950	 */
951	if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
952		dsl_dataset_t *ds;
953
954		VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
955		dsl_dataset_zero_zil(ds, tx);
956		dsl_dataset_rele(ds, FTAG);
957	}
958
959	return (dsobj);
960}
961
962#ifdef __FreeBSD__
963/* FreeBSD ioctl compat begin */
/* Argument passed to dsl_check_snap_cb() through dmu_objset_find(). */
struct destroyarg {
	nvlist_t *nvl;		/* list that "<fs>@<snapname>" names go into */
	const char *snapname;	/* snapshot name appended to each dataset */
};
968
969static int
970dsl_check_snap_cb(const char *name, void *arg)
971{
972	struct destroyarg *da = arg;
973	dsl_dataset_t *ds;
974	char *dsname;
975
976	dsname = kmem_asprintf("%s@%s", name, da->snapname);
977	fnvlist_add_boolean(da->nvl, dsname);
978	kmem_free(dsname, strlen(dsname) + 1);
979
980	return (0);
981}
982
983int
984dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
985    nvlist_t *snaps)
986{
987	struct destroyarg *da;
988	int err;
989
990	da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP);
991	da->nvl = snaps;
992	da->snapname = snapname;
993	err = dmu_objset_find(fsname, dsl_check_snap_cb, da,
994	    DS_FIND_CHILDREN);
995	kmem_free(da, sizeof (struct destroyarg));
996
997	return (err);
998}
999/* FreeBSD ioctl compat end */
1000#endif /* __FreeBSD__ */
1001
1002/*
1003 * The unique space in the head dataset can be calculated by subtracting
1004 * the space used in the most recent snapshot, that is still being used
1005 * in this file system, from the space currently in use.  To figure out
1006 * the space in the most recent snapshot still in use, we need to take
1007 * the total space used in the snapshot and subtract out the space that
1008 * has been freed up since the snapshot was taken.
1009 */
1010void
1011dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
1012{
1013	uint64_t mrs_used;
1014	uint64_t dlused, dlcomp, dluncomp;
1015
1016	ASSERT(!ds->ds_is_snapshot);
1017
1018	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
1019		mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
1020	else
1021		mrs_used = 0;
1022
1023	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
1024
1025	ASSERT3U(dlused, <=, mrs_used);
1026	dsl_dataset_phys(ds)->ds_unique_bytes =
1027	    dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
1028
1029	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
1030	    SPA_VERSION_UNIQUE_ACCURATE)
1031		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1032}
1033
1034void
1035dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
1036    dmu_tx_t *tx)
1037{
1038	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1039	uint64_t count;
1040	int err;
1041
1042	ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
1043	err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
1044	    obj, tx);
1045	/*
1046	 * The err should not be ENOENT, but a bug in a previous version
1047	 * of the code could cause upgrade_clones_cb() to not set
1048	 * ds_next_snap_obj when it should, leading to a missing entry.
1049	 * If we knew that the pool was created after
1050	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
1051	 * ENOENT.  However, at least we can check that we don't have
1052	 * too many entries in the next_clones_obj even after failing to
1053	 * remove this one.
1054	 */
1055	if (err != ENOENT)
1056		VERIFY0(err);
1057	ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
1058	    &count));
1059	ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
1060}
1061
1062
1063blkptr_t *
1064dsl_dataset_get_blkptr(dsl_dataset_t *ds)
1065{
1066	return (&dsl_dataset_phys(ds)->ds_bp);
1067}
1068
1069spa_t *
1070dsl_dataset_get_spa(dsl_dataset_t *ds)
1071{
1072	return (ds->ds_dir->dd_pool->dp_spa);
1073}
1074
1075void
1076dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
1077{
1078	dsl_pool_t *dp;
1079
1080	if (ds == NULL) /* this is the meta-objset */
1081		return;
1082
1083	ASSERT(ds->ds_objset != NULL);
1084
1085	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
1086		panic("dirtying snapshot!");
1087
1088	/* Must not dirty a dataset in the same txg where it got snapshotted. */
1089	ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
1090
1091	dp = ds->ds_dir->dd_pool;
1092	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
1093		/* up the hold count until we can be written out */
1094		dmu_buf_add_ref(ds->ds_dbuf, ds);
1095	}
1096}
1097
1098boolean_t
1099dsl_dataset_is_dirty(dsl_dataset_t *ds)
1100{
1101	for (int t = 0; t < TXG_SIZE; t++) {
1102		if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
1103		    ds, t))
1104			return (B_TRUE);
1105	}
1106	return (B_FALSE);
1107}
1108
1109static int
1110dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1111{
1112	uint64_t asize;
1113
1114	if (!dmu_tx_is_syncing(tx))
1115		return (0);
1116
1117	/*
1118	 * If there's an fs-only reservation, any blocks that might become
1119	 * owned by the snapshot dataset must be accommodated by space
1120	 * outside of the reservation.
1121	 */
1122	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
1123	asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
1124	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
1125		return (SET_ERROR(ENOSPC));
1126
1127	/*
1128	 * Propagate any reserved space for this snapshot to other
1129	 * snapshot checks in this sync group.
1130	 */
1131	if (asize > 0)
1132		dsl_dir_willuse_space(ds->ds_dir, asize, tx);
1133
1134	return (0);
1135}
1136
1137int
1138dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
1139    dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
1140{
1141	int error;
1142	uint64_t value;
1143
1144	ds->ds_trysnap_txg = tx->tx_txg;
1145
1146	if (!dmu_tx_is_syncing(tx))
1147		return (0);
1148
1149	/*
1150	 * We don't allow multiple snapshots of the same txg.  If there
1151	 * is already one, try again.
1152	 */
1153	if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
1154		return (SET_ERROR(EAGAIN));
1155
1156	/*
1157	 * Check for conflicting snapshot name.
1158	 */
1159	error = dsl_dataset_snap_lookup(ds, snapname, &value);
1160	if (error == 0)
1161		return (SET_ERROR(EEXIST));
1162	if (error != ENOENT)
1163		return (error);
1164
1165	/*
1166	 * We don't allow taking snapshots of inconsistent datasets, such as
1167	 * those into which we are currently receiving.  However, if we are
1168	 * creating this snapshot as part of a receive, this check will be
1169	 * executed atomically with respect to the completion of the receive
1170	 * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
1171	 * case we ignore this, knowing it will be fixed up for us shortly in
1172	 * dmu_recv_end_sync().
1173	 */
1174	if (!recv && DS_IS_INCONSISTENT(ds))
1175		return (SET_ERROR(EBUSY));
1176
1177	/*
1178	 * Skip the check for temporary snapshots or if we have already checked
1179	 * the counts in dsl_dataset_snapshot_check. This means we really only
1180	 * check the count here when we're receiving a stream.
1181	 */
1182	if (cnt != 0 && cr != NULL) {
1183		error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
1184		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
1185		if (error != 0)
1186			return (error);
1187	}
1188
1189	error = dsl_dataset_snapshot_reserve_space(ds, tx);
1190	if (error != 0)
1191		return (error);
1192
1193	return (0);
1194}
1195
/*
 * Check function of the snapshot sync task that dsl_dataset_snapshot()
 * hands to dsl_sync_task().
 *
 *	arg	dsl_dataset_snapshot_arg_t naming the requested snapshots
 *	tx	transaction the check runs in
 *
 * Returns 0 when every check passed, otherwise the most recently
 * encountered error.  When ddsa_errors is non-NULL each failure is also
 * recorded there as an int32 keyed by the offending name.
 */
int
dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_snapshot_arg_t *ddsa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	nvpair_t *pair;
	int rv = 0;

	/*
	 * Pre-compute how many total new snapshots will be created for each
	 * level in the tree and below. This is needed for validating the
	 * snapshot limit when either taking a recursive snapshot or when
	 * taking multiple snapshots.
	 *
	 * The problem is that the counts are not actually adjusted when
	 * we are checking, only when we finally sync. For a single snapshot,
	 * this is easy, the count will increase by 1 at each node up the tree,
	 * but it's more complicated for the recursive/multiple snapshot case.
	 *
	 * The dsl_fs_ss_limit_check function does recursively check the count
	 * at each level up the tree but since it is validating each snapshot
	 * independently we need to be sure that we are validating the complete
	 * count for the entire set of snapshots. We do this by rolling up the
	 * counts for each component of the name into an nvlist and then
	 * checking each of those cases with the aggregated count.
	 *
	 * This approach properly handles not only the recursive snapshot
	 * case (where we get all of those on the ddsa_snaps list) but also
	 * the sibling case (e.g. snapshot a/b and a/c so that we will also
	 * validate the limit on 'a' using a count of 2).
	 *
	 * We validate the snapshot names in the third loop and only report
	 * name errors once.
	 */
	if (dmu_tx_is_syncing(tx)) {
		nvlist_t *cnt_track = NULL;
		cnt_track = fnvlist_alloc();

		/* Rollup aggregated counts into the cnt_track list */
		for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
		    pair != NULL;
		    pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
			char *pdelim;
			uint64_t val;
			char nm[MAXPATHLEN];

			/* Work on a private copy; it is truncated in place. */
			(void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
			pdelim = strchr(nm, '@');
			/* Malformed names are reported by the third loop. */
			if (pdelim == NULL)
				continue;
			*pdelim = '\0';

			/*
			 * Count this snapshot against the dataset itself and
			 * then against each ancestor, stripping one trailing
			 * "/component" per iteration.
			 */
			do {
				if (nvlist_lookup_uint64(cnt_track, nm,
				    &val) == 0) {
					/* update existing entry */
					fnvlist_add_uint64(cnt_track, nm,
					    val + 1);
				} else {
					/* add to list */
					fnvlist_add_uint64(cnt_track, nm, 1);
				}

				pdelim = strrchr(nm, '/');
				if (pdelim != NULL)
					*pdelim = '\0';
			} while (pdelim != NULL);
		}

		/* Check aggregated counts at each level */
		for (pair = nvlist_next_nvpair(cnt_track, NULL);
		    pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
			int error = 0;
			char *name;
			uint64_t cnt = 0;
			dsl_dataset_t *ds;

			name = nvpair_name(pair);
			cnt = fnvpair_value_uint64(pair);
			ASSERT(cnt > 0);

			/* A failed hold is reported like a limit failure. */
			error = dsl_dataset_hold(dp, name, FTAG, &ds);
			if (error == 0) {
				error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
				    ZFS_PROP_SNAPSHOT_LIMIT, NULL,
				    ddsa->ddsa_cr);
				dsl_dataset_rele(ds, FTAG);
			}

			if (error != 0) {
				if (ddsa->ddsa_errors != NULL)
					fnvlist_add_int32(ddsa->ddsa_errors,
					    name, error);
				rv = error;
				/* only report one error for this check */
				break;
			}
		}
		nvlist_free(cnt_track);
	}

	/*
	 * Third loop: validate each requested "<dataset>@<snapshot>" name
	 * and run the per-snapshot checks.  Unlike the loop above this one
	 * keeps going after a failure so that every bad name is recorded.
	 */
	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
		int error = 0;
		dsl_dataset_t *ds;
		char *name, *atp;
		char dsname[ZFS_MAX_DATASET_NAME_LEN];

		name = nvpair_name(pair);
		if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN)
			error = SET_ERROR(ENAMETOOLONG);
		if (error == 0) {
			atp = strchr(name, '@');
			if (atp == NULL)
				error = SET_ERROR(EINVAL);
			/* dsname = the part of name before the '@' */
			if (error == 0)
				(void) strlcpy(dsname, name, atp - name + 1);
		}
		if (error == 0)
			error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
		if (error == 0) {
			/* passing 0/NULL skips dsl_fs_ss_limit_check */
			error = dsl_dataset_snapshot_check_impl(ds,
			    atp + 1, tx, B_FALSE, 0, NULL);
			dsl_dataset_rele(ds, FTAG);
		}

		if (error != 0) {
			if (ddsa->ddsa_errors != NULL) {
				fnvlist_add_int32(ddsa->ddsa_errors,
				    name, error);
			}
			rv = error;
		}
	}

	return (rv);
}
1334
/*
 * Create snapshot 'snapname' of head dataset 'ds' in transaction 'tx'.
 *
 * The caller must hold dp_config_rwlock as writer (asserted below).  A new
 * dataset object is allocated in the MOS, initialized from the head's
 * current phys, and linked in between the head and its previous snapshot.
 * On return ds->ds_prev holds the newly created snapshot.
 */
void
dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
    dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj, crtxg;
	objset_t *mos = dp->dp_meta_objset;
	objset_t *os;

	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));

	/*
	 * If we are on an old pool, the zil must not be active, in which
	 * case it will be zeroed.  Usually zil_suspend() accomplishes this.
	 */
	ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
	    dmu_objset_from_ds(ds, &os) != 0 ||
	    bcmp(&os->os_phys->os_zil_header, &zero_zil,
	    sizeof (zero_zil)) == 0);

	/* Should not snapshot a dirty dataset. */
	ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
	    ds, tx->tx_txg));

	/* Account for one more snapshot in the directory's count. */
	dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);

	/*
	 * The origin's ds_creation_txg has to be < TXG_INITIAL
	 */
	if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
		crtxg = 1;
	else
		crtxg = tx->tx_txg;

	/*
	 * Allocate the snapshot's dataset object and fill in its phys,
	 * mostly by copying the head's current values.
	 */
	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	bzero(dsphys, sizeof (dsl_dataset_phys_t));
	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
	dsphys->ds_fsid_guid = unique_create();
	/* Draw random guids until a non-zero one comes up. */
	do {
		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
		    sizeof (dsphys->ds_guid));
	} while (dsphys->ds_guid == 0);
	dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
	dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
	dsphys->ds_next_snap_obj = ds->ds_object;
	dsphys->ds_num_children = 1;
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = crtxg;
	/* The snapshot takes over the head's current deadlist object. */
	dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
	dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
	dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
	dsphys->ds_uncompressed_bytes =
	    dsl_dataset_phys(ds)->ds_uncompressed_bytes;
	dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
	/* ds_bp is copied under ds_bp_rwlock held as reader. */
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
	rrw_exit(&ds->ds_bp_rwlock, FTAG);
	dmu_buf_rele(dbuf, FTAG);

	/* Features in use on the head are activated on the snapshot too. */
	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
		if (ds->ds_feature_inuse[f])
			dsl_dataset_activate_feature(dsobj, f, tx);
	}

	/*
	 * Re-point the previous snapshot: either its next-snapshot link
	 * (when the head directly follows it) or its next-clones entry
	 * (when the head is a clone of it) now refers to the new snapshot.
	 */
	ASSERT3U(ds->ds_prev != 0, ==,
	    dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
	if (ds->ds_prev) {
		uint64_t next_clones_obj =
		    dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
		    ds->ds_object ||
		    dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
		    ds->ds_object) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
			    dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
			dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
		} else if (next_clones_obj != 0) {
			/* dsphys->ds_next_snap_obj was set to ds->ds_object */
			dsl_dataset_remove_from_next_clones(ds->ds_prev,
			    dsphys->ds_next_snap_obj, tx);
			VERIFY0(zap_add_int(mos,
			    next_clones_obj, dsobj, tx));
		}
	}

	/*
	 * If we have a reference-reservation on this dataset, we will
	 * need to increase the amount of refreservation being charged
	 * since our unique space is going to zero.
	 */
	if (ds->ds_reserved) {
		int64_t delta;
		ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
		delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
		    ds->ds_reserved);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
		    delta, 0, 0, tx);
	}

	/*
	 * Give the head the deadlist object returned by
	 * dsl_deadlist_clone(), reopen ds_deadlist on it, and add a key
	 * for the (still old) ds_prev_snap_txg.
	 */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	dsl_dataset_phys(ds)->ds_deadlist_obj =
	    dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
	    dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_open(&ds->ds_deadlist, mos,
	    dsl_dataset_phys(ds)->ds_deadlist_obj);
	dsl_deadlist_add_key(&ds->ds_deadlist,
	    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);

	/* The head now follows the new snapshot and shares everything. */
	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
	dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
	dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
	dsl_dataset_phys(ds)->ds_unique_bytes = 0;
	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

	/* Register the snapshot under its name. */
	VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
	    snapname, 8, 1, &dsobj, tx));

	/* Swap the in-core ds_prev hold over to the new snapshot. */
	if (ds->ds_prev)
		dsl_dataset_rele(ds->ds_prev, ds);
	VERIFY0(dsl_dataset_hold_obj(dp,
	    dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));

	dsl_scan_ds_snapshotted(ds, tx);

	dsl_dir_snap_cmtime_update(ds->ds_dir);

	spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
}
1472
1473void
1474dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
1475{
1476	dsl_dataset_snapshot_arg_t *ddsa = arg;
1477	dsl_pool_t *dp = dmu_tx_pool(tx);
1478	nvpair_t *pair;
1479
1480	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1481	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1482		dsl_dataset_t *ds;
1483		char *name, *atp;
1484		char dsname[ZFS_MAX_DATASET_NAME_LEN];
1485
1486		name = nvpair_name(pair);
1487		atp = strchr(name, '@');
1488		(void) strlcpy(dsname, name, atp - name + 1);
1489		VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
1490
1491		dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
1492		if (ddsa->ddsa_props != NULL) {
1493			dsl_props_set_sync_impl(ds->ds_prev,
1494			    ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
1495		}
1496		dsl_dataset_rele(ds, FTAG);
1497	}
1498}
1499
1500/*
1501 * The snapshots must all be in the same pool.
1502 * All-or-nothing: if there are any failures, nothing will be modified.
1503 */
1504int
1505dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
1506{
1507	dsl_dataset_snapshot_arg_t ddsa;
1508	nvpair_t *pair;
1509	boolean_t needsuspend;
1510	int error;
1511	spa_t *spa;
1512	char *firstname;
1513	nvlist_t *suspended = NULL;
1514
1515	pair = nvlist_next_nvpair(snaps, NULL);
1516	if (pair == NULL)
1517		return (0);
1518	firstname = nvpair_name(pair);
1519
1520	error = spa_open(firstname, &spa, FTAG);
1521	if (error != 0)
1522		return (error);
1523	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1524	spa_close(spa, FTAG);
1525
1526	if (needsuspend) {
1527		suspended = fnvlist_alloc();
1528		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
1529		    pair = nvlist_next_nvpair(snaps, pair)) {
1530			char fsname[ZFS_MAX_DATASET_NAME_LEN];
1531			char *snapname = nvpair_name(pair);
1532			char *atp;
1533			void *cookie;
1534
1535			atp = strchr(snapname, '@');
1536			if (atp == NULL) {
1537				error = SET_ERROR(EINVAL);
1538				break;
1539			}
1540			(void) strlcpy(fsname, snapname, atp - snapname + 1);
1541
1542			error = zil_suspend(fsname, &cookie);
1543			if (error != 0)
1544				break;
1545			fnvlist_add_uint64(suspended, fsname,
1546			    (uintptr_t)cookie);
1547		}
1548	}
1549
1550	ddsa.ddsa_snaps = snaps;
1551	ddsa.ddsa_props = props;
1552	ddsa.ddsa_errors = errors;
1553	ddsa.ddsa_cr = CRED();
1554
1555	if (error == 0) {
1556		error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
1557		    dsl_dataset_snapshot_sync, &ddsa,
1558		    fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
1559	}
1560
1561	if (suspended != NULL) {
1562		for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
1563		    pair = nvlist_next_nvpair(suspended, pair)) {
1564			zil_resume((void *)(uintptr_t)
1565			    fnvpair_value_uint64(pair));
1566		}
1567		fnvlist_free(suspended);
1568	}
1569
1570#ifdef __FreeBSD__
1571#ifdef _KERNEL
1572	if (error == 0) {
1573		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
1574		    pair = nvlist_next_nvpair(snaps, pair)) {
1575			char *snapname = nvpair_name(pair);
1576			zvol_create_minors(snapname);
1577		}
1578	}
1579#endif
1580#endif
1581	return (error);
1582}
1583
/*
 * Argument block shared by dsl_dataset_snapshot_tmp_check() and
 * dsl_dataset_snapshot_tmp_sync(); filled in by dsl_dataset_snapshot_tmp().
 */
typedef struct dsl_dataset_snapshot_tmp_arg {
	/* name of the dataset to snapshot (passed to dsl_dataset_hold()) */
	const char *ddsta_fsname;
	/* name of the snapshot to create */
	const char *ddsta_snapname;
	/* minor number forwarded to dsl_dataset_user_hold_sync_one() */
	minor_t ddsta_cleanup_minor;
	/* tag of the user hold placed on the new snapshot */
	const char *ddsta_htag;
} dsl_dataset_snapshot_tmp_arg_t;
1590
1591static int
1592dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
1593{
1594	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1595	dsl_pool_t *dp = dmu_tx_pool(tx);
1596	dsl_dataset_t *ds;
1597	int error;
1598
1599	error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
1600	if (error != 0)
1601		return (error);
1602
1603	/* NULL cred means no limit check for tmp snapshot */
1604	error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
1605	    tx, B_FALSE, 0, NULL);
1606	if (error != 0) {
1607		dsl_dataset_rele(ds, FTAG);
1608		return (error);
1609	}
1610
1611	if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
1612		dsl_dataset_rele(ds, FTAG);
1613		return (SET_ERROR(ENOTSUP));
1614	}
1615	error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
1616	    B_TRUE, tx);
1617	if (error != 0) {
1618		dsl_dataset_rele(ds, FTAG);
1619		return (error);
1620	}
1621
1622	dsl_dataset_rele(ds, FTAG);
1623	return (0);
1624}
1625
1626static void
1627dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
1628{
1629	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1630	dsl_pool_t *dp = dmu_tx_pool(tx);
1631	dsl_dataset_t *ds;
1632
1633	VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
1634
1635	dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
1636	dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
1637	    ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
1638	dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
1639
1640	dsl_dataset_rele(ds, FTAG);
1641}
1642
1643int
1644dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
1645    minor_t cleanup_minor, const char *htag)
1646{
1647	dsl_dataset_snapshot_tmp_arg_t ddsta;
1648	int error;
1649	spa_t *spa;
1650	boolean_t needsuspend;
1651	void *cookie;
1652
1653	ddsta.ddsta_fsname = fsname;
1654	ddsta.ddsta_snapname = snapname;
1655	ddsta.ddsta_cleanup_minor = cleanup_minor;
1656	ddsta.ddsta_htag = htag;
1657
1658	error = spa_open(fsname, &spa, FTAG);
1659	if (error != 0)
1660		return (error);
1661	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1662	spa_close(spa, FTAG);
1663
1664	if (needsuspend) {
1665		error = zil_suspend(fsname, &cookie);
1666		if (error != 0)
1667			return (error);
1668	}
1669
1670	error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
1671	    dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
1672
1673	if (needsuspend)
1674		zil_resume(cookie);
1675	return (error);
1676}
1677
1678void
1679dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
1680{
1681	ASSERT(dmu_tx_is_syncing(tx));
1682	ASSERT(ds->ds_objset != NULL);
1683	ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
1684
1685	/*
1686	 * in case we had to change ds_fsid_guid when we opened it,
1687	 * sync it out now.
1688	 */
1689	dmu_buf_will_dirty(ds->ds_dbuf, tx);
1690	dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
1691
1692	if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
1693		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
1694		    ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
1695		    &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
1696		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
1697		    ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
1698		    &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
1699		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
1700		    ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
1701		    &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
1702		ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
1703		ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
1704		ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
1705	}
1706
1707	dmu_objset_sync(ds->ds_objset, zio, tx);
1708
1709	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
1710		if (ds->ds_feature_activation_needed[f]) {
1711			if (ds->ds_feature_inuse[f])
1712				continue;
1713			dsl_dataset_activate_feature(ds->ds_object, f, tx);
1714			ds->ds_feature_inuse[f] = B_TRUE;
1715		}
1716	}
1717}
1718
1719static int
1720deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1721{
1722	dsl_deadlist_t *dl = arg;
1723	dsl_deadlist_insert(dl, bp, tx);
1724	return (0);
1725}
1726
1727void
1728dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
1729{
1730	objset_t *os = ds->ds_objset;
1731
1732	bplist_iterate(&ds->ds_pending_deadlist,
1733	    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
1734
1735	if (os->os_synced_dnodes != NULL) {
1736		multilist_destroy(os->os_synced_dnodes);
1737		os->os_synced_dnodes = NULL;
1738	}
1739
1740	ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));
1741
1742	dmu_buf_rele(ds->ds_dbuf, ds);
1743}
1744
1745int
1746get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val)
1747{
1748	uint64_t count = 0;
1749	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1750	zap_cursor_t zc;
1751	zap_attribute_t za;
1752
1753	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
1754
1755	/*
1756	 * There may be missing entries in ds_next_clones_obj
1757	 * due to a bug in a previous version of the code.
1758	 * Only trust it if it has the right number of entries.
1759	 */
1760	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
1761		VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
1762		    &count));
1763	}
1764	if (count != dsl_dataset_phys(ds)->ds_num_children - 1) {
1765		return (ENOENT);
1766	}
1767	for (zap_cursor_init(&zc, mos,
1768	    dsl_dataset_phys(ds)->ds_next_clones_obj);
1769	    zap_cursor_retrieve(&zc, &za) == 0;
1770	    zap_cursor_advance(&zc)) {
1771		dsl_dataset_t *clone;
1772		char buf[ZFS_MAX_DATASET_NAME_LEN];
1773		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
1774		    za.za_first_integer, FTAG, &clone));
1775		dsl_dir_name(clone->ds_dir, buf);
1776		fnvlist_add_boolean(val, buf);
1777		dsl_dataset_rele(clone, FTAG);
1778	}
1779	zap_cursor_fini(&zc);
1780	return (0);
1781}
1782
1783void
1784get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
1785{
1786	nvlist_t *propval = fnvlist_alloc();
1787	nvlist_t *val;
1788
1789	/*
1790	 * We use nvlist_alloc() instead of fnvlist_alloc() because the
1791	 * latter would allocate the list with NV_UNIQUE_NAME flag.
1792	 * As a result, every time a clone name is appended to the list
1793	 * it would be (linearly) searched for for a duplicate name.
1794	 * We already know that all clone names must be unique and we
1795	 * want avoid the quadratic complexity of double-checking that
1796	 * because we can have a large number of clones.
1797	 */
1798	VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP));
1799
1800	if (get_clones_stat_impl(ds, val) == 0) {
1801		fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
1802		fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
1803		    propval);
1804	}
1805
1806	nvlist_free(val);
1807	nvlist_free(propval);
1808}
1809
1810/*
1811 * Returns a string that represents the receive resume stats token. It should
1812 * be freed with strfree().
1813 */
1814char *
1815get_receive_resume_stats_impl(dsl_dataset_t *ds)
1816{
1817	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1818
1819	if (dsl_dataset_has_resume_receive_state(ds)) {
1820		char *str;
1821		void *packed;
1822		uint8_t *compressed;
1823		uint64_t val;
1824		nvlist_t *token_nv = fnvlist_alloc();
1825		size_t packed_size, compressed_size;
1826
1827		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1828		    DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
1829			fnvlist_add_uint64(token_nv, "fromguid", val);
1830		}
1831		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1832		    DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
1833			fnvlist_add_uint64(token_nv, "object", val);
1834		}
1835		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1836		    DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
1837			fnvlist_add_uint64(token_nv, "offset", val);
1838		}
1839		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1840		    DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
1841			fnvlist_add_uint64(token_nv, "bytes", val);
1842		}
1843		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1844		    DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
1845			fnvlist_add_uint64(token_nv, "toguid", val);
1846		}
1847		char buf[256];
1848		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1849		    DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
1850			fnvlist_add_string(token_nv, "toname", buf);
1851		}
1852		if (zap_contains(dp->dp_meta_objset, ds->ds_object,
1853		    DS_FIELD_RESUME_LARGEBLOCK) == 0) {
1854			fnvlist_add_boolean(token_nv, "largeblockok");
1855		}
1856		if (zap_contains(dp->dp_meta_objset, ds->ds_object,
1857		    DS_FIELD_RESUME_EMBEDOK) == 0) {
1858			fnvlist_add_boolean(token_nv, "embedok");
1859		}
1860		if (zap_contains(dp->dp_meta_objset, ds->ds_object,
1861		    DS_FIELD_RESUME_COMPRESSOK) == 0) {
1862			fnvlist_add_boolean(token_nv, "compressok");
1863		}
1864		packed = fnvlist_pack(token_nv, &packed_size);
1865		fnvlist_free(token_nv);
1866		compressed = kmem_alloc(packed_size, KM_SLEEP);
1867
1868		compressed_size = gzip_compress(packed, compressed,
1869		    packed_size, packed_size, 6);
1870
1871		zio_cksum_t cksum;
1872		fletcher_4_native(compressed, compressed_size, NULL, &cksum);
1873
1874		str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP);
1875		for (int i = 0; i < compressed_size; i++) {
1876			(void) sprintf(str + i * 2, "%02x", compressed[i]);
1877		}
1878		str[compressed_size * 2] = '\0';
1879		char *propval = kmem_asprintf("%u-%llx-%llx-%s",
1880		    ZFS_SEND_RESUME_TOKEN_VERSION,
1881		    (longlong_t)cksum.zc_word[0],
1882		    (longlong_t)packed_size, str);
1883		kmem_free(packed, packed_size);
1884		kmem_free(str, compressed_size * 2 + 1);
1885		kmem_free(compressed, packed_size);
1886		return (propval);
1887	}
1888	return (spa_strdup(""));
1889}
1890
1891/*
1892 * Returns a string that represents the receive resume stats token of the
1893 * dataset's child. It should be freed with strfree().
1894 */
1895char *
1896get_child_receive_stats(dsl_dataset_t *ds)
1897{
1898	char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
1899	dsl_dataset_t *recv_ds;
1900	dsl_dataset_name(ds, recvname);
1901	if (strlcat(recvname, "/", sizeof (recvname)) <
1902	    sizeof (recvname) &&
1903	    strlcat(recvname, recv_clone_name, sizeof (recvname)) <
1904	    sizeof (recvname) &&
1905	    dsl_dataset_hold(ds->ds_dir->dd_pool, recvname, FTAG,
1906	    &recv_ds)  == 0) {
1907		char *propval = get_receive_resume_stats_impl(recv_ds);
1908		dsl_dataset_rele(recv_ds, FTAG);
1909		return (propval);
1910	}
1911	return (spa_strdup(""));
1912}
1913
1914static void
1915get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
1916{
1917	char *propval = get_receive_resume_stats_impl(ds);
1918	if (strcmp(propval, "") != 0) {
1919		dsl_prop_nvlist_add_string(nv,
1920		    ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
1921	} else {
1922		char *childval = get_child_receive_stats(ds);
1923		if (strcmp(childval, "") != 0) {
1924			dsl_prop_nvlist_add_string(nv,
1925			    ZFS_PROP_RECEIVE_RESUME_TOKEN, childval);
1926		}
1927		strfree(childval);
1928	}
1929	strfree(propval);
1930}
1931
1932uint64_t
1933dsl_get_refratio(dsl_dataset_t *ds)
1934{
1935	uint64_t ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
1936	    (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
1937	    dsl_dataset_phys(ds)->ds_compressed_bytes);
1938	return (ratio);
1939}
1940
1941uint64_t
1942dsl_get_logicalreferenced(dsl_dataset_t *ds)
1943{
1944	return (dsl_dataset_phys(ds)->ds_uncompressed_bytes);
1945}
1946
1947uint64_t
1948dsl_get_compressratio(dsl_dataset_t *ds)
1949{
1950	if (ds->ds_is_snapshot) {
1951		return (dsl_get_refratio(ds));
1952	} else {
1953		dsl_dir_t *dd = ds->ds_dir;
1954		mutex_enter(&dd->dd_lock);
1955		uint64_t val = dsl_dir_get_compressratio(dd);
1956		mutex_exit(&dd->dd_lock);
1957		return (val);
1958	}
1959}
1960
1961uint64_t
1962dsl_get_used(dsl_dataset_t *ds)
1963{
1964	if (ds->ds_is_snapshot) {
1965		return (dsl_dataset_phys(ds)->ds_unique_bytes);
1966	} else {
1967		dsl_dir_t *dd = ds->ds_dir;
1968		mutex_enter(&dd->dd_lock);
1969		uint64_t val = dsl_dir_get_used(dd);
1970		mutex_exit(&dd->dd_lock);
1971		return (val);
1972	}
1973}
1974
1975uint64_t
1976dsl_get_creation(dsl_dataset_t *ds)
1977{
1978	return (dsl_dataset_phys(ds)->ds_creation_time);
1979}
1980
1981uint64_t
1982dsl_get_creationtxg(dsl_dataset_t *ds)
1983{
1984	return (dsl_dataset_phys(ds)->ds_creation_txg);
1985}
1986
1987uint64_t
1988dsl_get_refquota(dsl_dataset_t *ds)
1989{
1990	return (ds->ds_quota);
1991}
1992
1993uint64_t
1994dsl_get_refreservation(dsl_dataset_t *ds)
1995{
1996	return (ds->ds_reserved);
1997}
1998
1999uint64_t
2000dsl_get_guid(dsl_dataset_t *ds)
2001{
2002	return (dsl_dataset_phys(ds)->ds_guid);
2003}
2004
2005uint64_t
2006dsl_get_unique(dsl_dataset_t *ds)
2007{
2008	return (dsl_dataset_phys(ds)->ds_unique_bytes);
2009}
2010
2011uint64_t
2012dsl_get_objsetid(dsl_dataset_t *ds)
2013{
2014	return (ds->ds_object);
2015}
2016
2017uint64_t
2018dsl_get_userrefs(dsl_dataset_t *ds)
2019{
2020	return (ds->ds_userrefs);
2021}
2022
2023uint64_t
2024dsl_get_defer_destroy(dsl_dataset_t *ds)
2025{
2026	return (DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
2027}
2028
2029uint64_t
2030dsl_get_referenced(dsl_dataset_t *ds)
2031{
2032	return (dsl_dataset_phys(ds)->ds_referenced_bytes);
2033}
2034
2035uint64_t
2036dsl_get_numclones(dsl_dataset_t *ds)
2037{
2038	ASSERT(ds->ds_is_snapshot);
2039	return (dsl_dataset_phys(ds)->ds_num_children - 1);
2040}
2041
2042uint64_t
2043dsl_get_inconsistent(dsl_dataset_t *ds)
2044{
2045	return ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) ?
2046	    1 : 0);
2047}
2048
2049uint64_t
2050dsl_get_available(dsl_dataset_t *ds)
2051{
2052	uint64_t refdbytes = dsl_get_referenced(ds);
2053	uint64_t availbytes = dsl_dir_space_available(ds->ds_dir,
2054	    NULL, 0, TRUE);
2055	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
2056		availbytes +=
2057		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
2058	}
2059	if (ds->ds_quota != 0) {
2060		/*
2061		 * Adjust available bytes according to refquota
2062		 */
2063		if (refdbytes < ds->ds_quota) {
2064			availbytes = MIN(availbytes,
2065			    ds->ds_quota - refdbytes);
2066		} else {
2067			availbytes = 0;
2068		}
2069	}
2070	return (availbytes);
2071}
2072
2073int
2074dsl_get_written(dsl_dataset_t *ds, uint64_t *written)
2075{
2076	dsl_pool_t *dp = ds->ds_dir->dd_pool;
2077	dsl_dataset_t *prev;
2078	int err = dsl_dataset_hold_obj(dp,
2079	    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
2080	if (err == 0) {
2081		uint64_t comp, uncomp;
2082		err = dsl_dataset_space_written(prev, ds, written,
2083		    &comp, &uncomp);
2084		dsl_dataset_rele(prev, FTAG);
2085	}
2086	return (err);
2087}
2088
2089/*
2090 * 'snap' should be a buffer of size ZFS_MAX_DATASET_NAME_LEN.
2091 */
2092int
2093dsl_get_prev_snap(dsl_dataset_t *ds, char *snap)
2094{
2095	dsl_pool_t *dp = ds->ds_dir->dd_pool;
2096	if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
2097		dsl_dataset_name(ds->ds_prev, snap);
2098		return (0);
2099	} else {
2100		return (ENOENT);
2101	}
2102}
2103
2104/*
2105 * Returns the mountpoint property and source for the given dataset in the value
2106 * and source buffers. The value buffer must be at least as large as MAXPATHLEN
2107 * and the source buffer as least as large a ZFS_MAX_DATASET_NAME_LEN.
2108 * Returns 0 on success and an error on failure.
2109 */
2110int
2111dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
2112    char *source)
2113{
2114	int error;
2115	dsl_pool_t *dp = ds->ds_dir->dd_pool;
2116
2117	/* Retrieve the mountpoint value stored in the zap opbject */
2118	error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1,
2119	    ZAP_MAXVALUELEN, value, source);
2120	if (error != 0) {
2121		return (error);
2122	}
2123
2124	/* Process the dsname and source to find the full mountpoint string */
2125	if (value[0] == '/') {
2126		char *buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
2127		char *root = buf;
2128		const char *relpath;
2129
2130		/*
2131		 * If we inherit the mountpoint, even from a dataset
2132		 * with a received value, the source will be the path of
2133		 * the dataset we inherit from. If source is
2134		 * ZPROP_SOURCE_VAL_RECVD, the received value is not
2135		 * inherited.
2136		 */
2137		if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
2138			relpath = "";
2139		} else {
2140			ASSERT0(strncmp(dsname, source, strlen(source)));
2141			relpath = dsname + strlen(source);
2142			if (relpath[0] == '/')
2143				relpath++;
2144		}
2145
2146		spa_altroot(dp->dp_spa, root, ZAP_MAXVALUELEN);
2147
2148		/*
2149		 * Special case an alternate root of '/'. This will
2150		 * avoid having multiple leading slashes in the
2151		 * mountpoint path.
2152		 */
2153		if (strcmp(root, "/") == 0)
2154			root++;
2155
2156		/*
2157		 * If the mountpoint is '/' then skip over this
2158		 * if we are obtaining either an alternate root or
2159		 * an inherited mountpoint.
2160		 */
2161		char *mnt = value;
2162		if (value[1] == '\0' && (root[0] != '\0' ||
2163		    relpath[0] != '\0'))
2164			mnt = value + 1;
2165
2166		if (relpath[0] == '\0') {
2167			(void) snprintf(value, ZAP_MAXVALUELEN, "%s%s",
2168			    root, mnt);
2169		} else {
2170			(void) snprintf(value, ZAP_MAXVALUELEN, "%s%s%s%s",
2171			    root, mnt, relpath[0] == '@' ? "" : "/",
2172			    relpath);
2173		}
2174		kmem_free(buf, ZAP_MAXVALUELEN);
2175	} else {
2176		/* 'legacy' or 'none' */
2177		(void) snprintf(value, ZAP_MAXVALUELEN, "%s", value);
2178	}
2179	return (0);
2180}
2181
2182void
2183dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
2184{
2185	dsl_pool_t *dp = ds->ds_dir->dd_pool;
2186
2187	ASSERT(dsl_pool_config_held(dp));
2188
2189	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO,
2190	    dsl_get_refratio(ds));
2191	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
2192	    dsl_get_logicalreferenced(ds));
2193	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
2194	    dsl_get_compressratio(ds));
2195	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
2196	    dsl_get_used(ds));
2197
2198	if (ds->ds_is_snapshot) {
2199		get_clones_stat(ds, nv);
2200	} else {
2201		char buf[ZFS_MAX_DATASET_NAME_LEN];
2202		if (dsl_get_prev_snap(ds, buf) == 0)
2203			dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP,
2204			    buf);
2205		dsl_dir_stats(ds->ds_dir, nv);
2206	}
2207
2208	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
2209	    dsl_get_available(ds));
2210	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
2211	    dsl_get_referenced(ds));
2212	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
2213	    dsl_get_creation(ds));
2214	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
2215	    dsl_get_creationtxg(ds));
2216	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
2217	    dsl_get_refquota(ds));
2218	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
2219	    dsl_get_refreservation(ds));
2220	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
2221	    dsl_get_guid(ds));
2222	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
2223	    dsl_get_unique(ds));
2224	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
2225	    dsl_get_objsetid(ds));
2226	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
2227	    dsl_get_userrefs(ds));
2228	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
2229	    dsl_get_defer_destroy(ds));
2230
2231	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
2232		uint64_t written;
2233		if (dsl_get_written(ds, &written) == 0) {
2234			dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
2235			    written);
2236		}
2237	}
2238
2239	if (!dsl_dataset_is_snapshot(ds)) {
2240		/*
2241		 * A failed "newfs" (e.g. full) resumable receive leaves
2242		 * the stats set on this dataset.  Check here for the prop.
2243		 */
2244		get_receive_resume_stats(ds, nv);
2245
2246		/*
2247		 * A failed incremental resumable receive leaves the
2248		 * stats set on our child named "%recv".  Check the child
2249		 * for the prop.
2250		 */
2251		/* 6 extra bytes for /%recv */
2252		char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
2253		dsl_dataset_t *recv_ds;
2254		dsl_dataset_name(ds, recvname);
2255		if (strlcat(recvname, "/", sizeof (recvname)) <
2256		    sizeof (recvname) &&
2257		    strlcat(recvname, recv_clone_name, sizeof (recvname)) <
2258		    sizeof (recvname) &&
2259		    dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
2260			get_receive_resume_stats(recv_ds, nv);
2261			dsl_dataset_rele(recv_ds, FTAG);
2262		}
2263	}
2264}
2265
2266void
2267dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
2268{
2269	dsl_pool_t *dp = ds->ds_dir->dd_pool;
2270	ASSERT(dsl_pool_config_held(dp));
2271
2272	stat->dds_creation_txg = dsl_get_creationtxg(ds);
2273	stat->dds_inconsistent = dsl_get_inconsistent(ds);
2274	stat->dds_guid = dsl_get_guid(ds);
2275	stat->dds_origin[0] = '\0';
2276	if (ds->ds_is_snapshot) {
2277		stat->dds_is_snapshot = B_TRUE;
2278		stat->dds_num_clones = dsl_get_numclones(ds);
2279	} else {
2280		stat->dds_is_snapshot = B_FALSE;
2281		stat->dds_num_clones = 0;
2282
2283		if (dsl_dir_is_clone(ds->ds_dir)) {
2284			dsl_dir_get_origin(ds->ds_dir, stat->dds_origin);
2285		}
2286	}
2287}
2288
2289uint64_t
2290dsl_dataset_fsid_guid(dsl_dataset_t *ds)
2291{
2292	return (ds->ds_fsid_guid);
2293}
2294
2295void
2296dsl_dataset_space(dsl_dataset_t *ds,
2297    uint64_t *refdbytesp, uint64_t *availbytesp,
2298    uint64_t *usedobjsp, uint64_t *availobjsp)
2299{
2300	*refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
2301	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
2302	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
2303		*availbytesp +=
2304		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
2305	if (ds->ds_quota != 0) {
2306		/*
2307		 * Adjust available bytes according to refquota
2308		 */
2309		if (*refdbytesp < ds->ds_quota)
2310			*availbytesp = MIN(*availbytesp,
2311			    ds->ds_quota - *refdbytesp);
2312		else
2313			*availbytesp = 0;
2314	}
2315	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
2316	*usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
2317	rrw_exit(&ds->ds_bp_rwlock, FTAG);
2318	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
2319}
2320
2321boolean_t
2322dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
2323{
2324	dsl_pool_t *dp = ds->ds_dir->dd_pool;
2325	uint64_t birth;
2326
2327	ASSERT(dsl_pool_config_held(dp));
2328	if (snap == NULL)
2329		return (B_FALSE);
2330	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
2331	birth = dsl_dataset_get_blkptr(ds)->blk_birth;
2332	rrw_exit(&ds->ds_bp_rwlock, FTAG);
2333	if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
2334		objset_t *os, *os_snap;
2335		/*
2336		 * It may be that only the ZIL differs, because it was
2337		 * reset in the head.  Don't count that as being
2338		 * modified.
2339		 */
2340		if (dmu_objset_from_ds(ds, &os) != 0)
2341			return (B_TRUE);
2342		if (dmu_objset_from_ds(snap, &os_snap) != 0)
2343			return (B_TRUE);
2344		return (bcmp(&os->os_phys->os_meta_dnode,
2345		    &os_snap->os_phys->os_meta_dnode,
2346		    sizeof (os->os_phys->os_meta_dnode)) != 0);
2347	}
2348	return (B_FALSE);
2349}
2350
/*
 * Argument bundle passed from dsl_dataset_rename_snapshot() to the
 * rename-snapshot check and sync callbacks.
 */
typedef struct dsl_dataset_rename_snapshot_arg {
	/* name of the dataset that is held to find the snapshot */
	const char *ddrsa_fsname;
	/* current snapshot name (the part after the '@') */
	const char *ddrsa_oldsnapname;
	/* new snapshot name (the part after the '@') */
	const char *ddrsa_newsnapname;
	/* if set, the callbacks also visit children via DS_FIND_CHILDREN */
	boolean_t ddrsa_recursive;
	/* filled in by the sync callback before it calls the sync worker */
	dmu_tx_t *ddrsa_tx;
} dsl_dataset_rename_snapshot_arg_t;
2358
2359/* ARGSUSED */
2360static int
2361dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
2362    dsl_dataset_t *hds, void *arg)
2363{
2364	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2365	int error;
2366	uint64_t val;
2367
2368	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
2369	if (error != 0) {
2370		/* ignore nonexistent snapshots */
2371		return (error == ENOENT ? 0 : error);
2372	}
2373
2374	/* new name should not exist */
2375	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
2376	if (error == 0)
2377		error = SET_ERROR(EEXIST);
2378	else if (error == ENOENT)
2379		error = 0;
2380
2381	/* dataset name + 1 for the "@" + the new snapshot name must fit */
2382	if (dsl_dir_namelen(hds->ds_dir) + 1 +
2383	    strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN)
2384		error = SET_ERROR(ENAMETOOLONG);
2385
2386	return (error);
2387}
2388
2389static int
2390dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
2391{
2392	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2393	dsl_pool_t *dp = dmu_tx_pool(tx);
2394	dsl_dataset_t *hds;
2395	int error;
2396
2397	error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
2398	if (error != 0)
2399		return (error);
2400
2401	if (ddrsa->ddrsa_recursive) {
2402		error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
2403		    dsl_dataset_rename_snapshot_check_impl, ddrsa,
2404		    DS_FIND_CHILDREN);
2405	} else {
2406		error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
2407	}
2408	dsl_dataset_rele(hds, FTAG);
2409	return (error);
2410}
2411
2412static int
2413dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
2414    dsl_dataset_t *hds, void *arg)
2415{
2416#ifdef __FreeBSD__
2417#ifdef _KERNEL
2418	char *oldname, *newname;
2419#endif
2420#endif
2421	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2422	dsl_dataset_t *ds;
2423	uint64_t val;
2424	dmu_tx_t *tx = ddrsa->ddrsa_tx;
2425	int error;
2426
2427	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
2428	ASSERT(error == 0 || error == ENOENT);
2429	if (error == ENOENT) {
2430		/* ignore nonexistent snapshots */
2431		return (0);
2432	}
2433
2434	VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
2435
2436	/* log before we change the name */
2437	spa_history_log_internal_ds(ds, "rename", tx,
2438	    "-> @%s", ddrsa->ddrsa_newsnapname);
2439
2440	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
2441	    B_FALSE));
2442	mutex_enter(&ds->ds_lock);
2443	(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
2444	mutex_exit(&ds->ds_lock);
2445	VERIFY0(zap_add(dp->dp_meta_objset,
2446	    dsl_dataset_phys(hds)->ds_snapnames_zapobj,
2447	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));
2448
2449#ifdef __FreeBSD__
2450#ifdef _KERNEL
2451	oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2452	newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2453	snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
2454	    ddrsa->ddrsa_oldsnapname);
2455	snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
2456	    ddrsa->ddrsa_newsnapname);
2457	zfsvfs_update_fromname(oldname, newname);
2458	zvol_rename_minors(oldname, newname);
2459	kmem_free(newname, MAXPATHLEN);
2460	kmem_free(oldname, MAXPATHLEN);
2461#endif
2462#endif
2463	dsl_dataset_rele(ds, FTAG);
2464
2465	return (0);
2466}
2467
2468static void
2469dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
2470{
2471	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2472	dsl_pool_t *dp = dmu_tx_pool(tx);
2473	dsl_dataset_t *hds;
2474
2475	VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
2476	ddrsa->ddrsa_tx = tx;
2477	if (ddrsa->ddrsa_recursive) {
2478		VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
2479		    dsl_dataset_rename_snapshot_sync_impl, ddrsa,
2480		    DS_FIND_CHILDREN));
2481	} else {
2482		VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
2483	}
2484	dsl_dataset_rele(hds, FTAG);
2485}
2486
2487int
2488dsl_dataset_rename_snapshot(const char *fsname,
2489    const char *oldsnapname, const char *newsnapname, boolean_t recursive)
2490{
2491	dsl_dataset_rename_snapshot_arg_t ddrsa;
2492
2493	ddrsa.ddrsa_fsname = fsname;
2494	ddrsa.ddrsa_oldsnapname = oldsnapname;
2495	ddrsa.ddrsa_newsnapname = newsnapname;
2496	ddrsa.ddrsa_recursive = recursive;
2497
2498	return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
2499	    dsl_dataset_rename_snapshot_sync, &ddrsa,
2500	    1, ZFS_SPACE_CHECK_RESERVED));
2501}
2502
2503/*
2504 * If we're doing an ownership handoff, we need to make sure that there is
2505 * only one long hold on the dataset.  We're not allowed to change anything here
2506 * so we don't permanently release the long hold or regular hold here.  We want
2507 * to do this only when syncing to avoid the dataset unexpectedly going away
2508 * when we release the long hold.
2509 */
2510static int
2511dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
2512{
2513	boolean_t held;
2514
2515	if (!dmu_tx_is_syncing(tx))
2516		return (0);
2517
2518	if (owner != NULL) {
2519		VERIFY3P(ds->ds_owner, ==, owner);
2520		dsl_dataset_long_rele(ds, owner);
2521	}
2522
2523	held = dsl_dataset_long_held(ds);
2524
2525	if (owner != NULL)
2526		dsl_dataset_long_hold(ds, owner);
2527
2528	if (held)
2529		return (SET_ERROR(EBUSY));
2530
2531	return (0);
2532}
2533
2534int
2535dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
2536{
2537	dsl_dataset_rollback_arg_t *ddra = arg;
2538	dsl_pool_t *dp = dmu_tx_pool(tx);
2539	dsl_dataset_t *ds;
2540	int64_t unused_refres_delta;
2541	int error;
2542
2543	error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
2544	if (error != 0)
2545		return (error);
2546
2547	/* must not be a snapshot */
2548	if (ds->ds_is_snapshot) {
2549		dsl_dataset_rele(ds, FTAG);
2550		return (SET_ERROR(EINVAL));
2551	}
2552
2553	/* must have a most recent snapshot */
2554	if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
2555		dsl_dataset_rele(ds, FTAG);
2556		return (SET_ERROR(ESRCH));
2557	}
2558
2559	/*
2560	 * No rollback to a snapshot created in the current txg, because
2561	 * the rollback may dirty the dataset and create blocks that are
2562	 * not reachable from the rootbp while having a birth txg that
2563	 * falls into the snapshot's range.
2564	 */
2565	if (dmu_tx_is_syncing(tx) &&
2566	    dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) {
2567		dsl_dataset_rele(ds, FTAG);
2568		return (SET_ERROR(EAGAIN));
2569	}
2570
2571	/*
2572	 * If the expected target snapshot is specified, then check that
2573	 * the latest snapshot is it.
2574	 */
2575	if (ddra->ddra_tosnap != NULL) {
2576		dsl_dataset_t *snapds;
2577
2578		/* Check if the target snapshot exists at all. */
2579		error = dsl_dataset_hold(dp, ddra->ddra_tosnap, FTAG, &snapds);
2580		if (error != 0) {
2581			/*
2582			 * ESRCH is used to signal that the target snapshot does
2583			 * not exist, while ENOENT is used to report that
2584			 * the rolled back dataset does not exist.
2585			 * ESRCH is also used to cover other cases where the
2586			 * target snapshot is not related to the dataset being
2587			 * rolled back such as being in a different pool.
2588			 */
2589			if (error == ENOENT || error == EXDEV)
2590				error = SET_ERROR(ESRCH);
2591			dsl_dataset_rele(ds, FTAG);
2592			return (error);
2593		}
2594		ASSERT(snapds->ds_is_snapshot);
2595
2596		/* Check if the snapshot is the latest snapshot indeed. */
2597		if (snapds != ds->ds_prev) {
2598			/*
2599			 * Distinguish between the case where the only problem
2600			 * is intervening snapshots (EEXIST) vs the snapshot
2601			 * not being a valid target for rollback (ESRCH).
2602			 */
2603			if (snapds->ds_dir == ds->ds_dir ||
2604			    (dsl_dir_is_clone(ds->ds_dir) &&
2605			    dsl_dir_phys(ds->ds_dir)->dd_origin_obj ==
2606			    snapds->ds_object)) {
2607				error = SET_ERROR(EEXIST);
2608			} else {
2609				error = SET_ERROR(ESRCH);
2610			}
2611			dsl_dataset_rele(snapds, FTAG);
2612			dsl_dataset_rele(ds, FTAG);
2613			return (error);
2614		}
2615		dsl_dataset_rele(snapds, FTAG);
2616	}
2617
2618	/* must not have any bookmarks after the most recent snapshot */
2619	nvlist_t *proprequest = fnvlist_alloc();
2620	fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
2621	nvlist_t *bookmarks = fnvlist_alloc();
2622	error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
2623	fnvlist_free(proprequest);
2624	if (error != 0) {
2625		dsl_dataset_rele(ds, FTAG);
2626		return (error);
2627	}
2628	for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
2629	    pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
2630		nvlist_t *valuenv =
2631		    fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
2632		    zfs_prop_to_name(ZFS_PROP_CREATETXG));
2633		uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
2634		if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
2635			fnvlist_free(bookmarks);
2636			dsl_dataset_rele(ds, FTAG);
2637			return (SET_ERROR(EEXIST));
2638		}
2639	}
2640	fnvlist_free(bookmarks);
2641
2642	error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
2643	if (error != 0) {
2644		dsl_dataset_rele(ds, FTAG);
2645		return (error);
2646	}
2647
2648	/*
2649	 * Check if the snap we are rolling back to uses more than
2650	 * the refquota.
2651	 */
2652	if (ds->ds_quota != 0 &&
2653	    dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
2654		dsl_dataset_rele(ds, FTAG);
2655		return (SET_ERROR(EDQUOT));
2656	}
2657
2658	/*
2659	 * When we do the clone swap, we will temporarily use more space
2660	 * due to the refreservation (the head will no longer have any
2661	 * unique space, so the entire amount of the refreservation will need
2662	 * to be free).  We will immediately destroy the clone, freeing
2663	 * this space, but the freeing happens over many txg's.
2664	 */
2665	unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
2666	    dsl_dataset_phys(ds)->ds_unique_bytes);
2667
2668	if (unused_refres_delta > 0 &&
2669	    unused_refres_delta >
2670	    dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
2671		dsl_dataset_rele(ds, FTAG);
2672		return (SET_ERROR(ENOSPC));
2673	}
2674
2675	dsl_dataset_rele(ds, FTAG);
2676	return (0);
2677}
2678
2679void
2680dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
2681{
2682	dsl_dataset_rollback_arg_t *ddra = arg;
2683	dsl_pool_t *dp = dmu_tx_pool(tx);
2684	dsl_dataset_t *ds, *clone;
2685	uint64_t cloneobj;
2686	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
2687
2688	VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
2689
2690	dsl_dataset_name(ds->ds_prev, namebuf);
2691	fnvlist_add_string(ddra->ddra_result, "target", namebuf);
2692
2693	cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
2694	    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
2695
2696	VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
2697
2698	dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
2699	dsl_dataset_zero_zil(ds, tx);
2700
2701	dsl_destroy_head_sync_impl(clone, tx);
2702
2703	dsl_dataset_rele(clone, FTAG);
2704	dsl_dataset_rele(ds, FTAG);
2705}
2706
2707/*
2708 * Rolls back the given filesystem or volume to the most recent snapshot.
2709 * The name of the most recent snapshot will be returned under key "target"
2710 * in the result nvlist.
2711 *
2712 * If owner != NULL:
2713 * - The existing dataset MUST be owned by the specified owner at entry
2714 * - Upon return, dataset will still be held by the same owner, whether we
2715 *   succeed or not.
2716 *
2717 * This mode is required any time the existing filesystem is mounted.  See
2718 * notes above zfs_suspend_fs() for further details.
2719 */
2720int
2721dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,
2722    nvlist_t *result)
2723{
2724	dsl_dataset_rollback_arg_t ddra;
2725
2726	ddra.ddra_fsname = fsname;
2727	ddra.ddra_tosnap = tosnap;
2728	ddra.ddra_owner = owner;
2729	ddra.ddra_result = result;
2730
2731	return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
2732	    dsl_dataset_rollback_sync, &ddra,
2733	    1, ZFS_SPACE_CHECK_RESERVED));
2734}
2735
/*
 * Element of the snapshot lists used by the promote code (for example
 * ddpa->shared_snaps, ddpa->origin_snaps and ddpa->clone_snaps).
 */
struct promotenode {
	list_node_t link;	/* list linkage */
	dsl_dataset_t *ds;	/* the dataset this element refers to */
};
2740
/*
 * Forward declarations of static helpers used by the promote check and sync
 * functions below; their definitions are elsewhere in this file.
 */
static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
    void *tag);
static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
2745
/*
 * Sync-task check callback for promoting a clone ('arg' is a
 * dsl_dataset_promote_arg_t).
 *
 * The datasets involved are held with promote_hold() and released with
 * promote_rele() on every return path after the hold succeeds.
 *
 * Returns EXDEV if the clone has DS_FLAG_NOPROMOTE set.  In open context
 * (tx not syncing) no further checking is done and 0 is returned.  In
 * syncing context it returns:
 *   EBUSY        - one of the shared snapshots has a long hold
 *   ENAMETOOLONG - a shared snapshot's name is too long for the clone
 *   EEXIST       - one or more shared snapshot names already exist in the
 *                  clone; each conflicting name is added to ddpa->err_ds
 * or an error from the snapshot lookup, dsl_dir_transfer_possible() or
 * snaplist_space().
 *
 * As a side effect (syncing context) the space accounting fields of ddpa
 * are computed: unique, used, comp, uncomp and, when the respective dsl_dir
 * has DD_FLAG_USED_BREAKDOWN, cloneusedsnap and originusedsnap.
 */
int
dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_promote_arg_t *ddpa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *hds;
	struct promotenode *snap;
	dsl_dataset_t *origin_ds;
	int err;
	uint64_t unused;
	uint64_t ss_mv_cnt;
	size_t max_snap_len;
	boolean_t conflicting_snaps;

	err = promote_hold(ddpa, dp, FTAG);
	if (err != 0)
		return (err);

	hds = ddpa->ddpa_clone;
	/* the head of shared_snaps is treated as the origin snapshot */
	snap = list_head(&ddpa->shared_snaps);
	origin_ds = snap->ds;
	max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;

	/*
	 * NOTE(review): this value of 'snap' is not read before 'snap' is
	 * reassigned below.
	 */
	snap = list_head(&ddpa->origin_snaps);

	if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
		promote_rele(ddpa, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * Compute and check the amount of space to transfer.  Since this is
	 * so expensive, don't do the preliminary check.
	 */
	if (!dmu_tx_is_syncing(tx)) {
		promote_rele(ddpa, FTAG);
		return (0);
	}

	/* compute origin's new unique space */
	snap = list_tail(&ddpa->clone_snaps);
	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
	    origin_ds->ds_object);
	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
	    dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
	    &ddpa->unique, &unused, &unused);

	/*
	 * Walk the snapshots that we are moving
	 *
	 * Compute space to transfer.  Consider the incremental changes
	 * to used by each snapshot:
	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
	 * So each snapshot gave birth to:
	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
	 * So a sequence would look like:
	 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
	 * Which simplifies to:
	 * uN + kN + kN-1 + ... + k1 + k0
	 * Note however, if we stop before we reach the ORIGIN we get:
	 * uN + kN + kN-1 + ... + kM - uM-1
	 */
	conflicting_snaps = B_FALSE;
	ss_mv_cnt = 0;
	ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
	ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
	ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
	for (snap = list_head(&ddpa->shared_snaps); snap;
	    snap = list_next(&ddpa->shared_snaps, snap)) {
		uint64_t val, dlused, dlcomp, dluncomp;
		dsl_dataset_t *ds = snap->ds;

		/* count of snapshots that will move to the clone's dir */
		ss_mv_cnt++;

		/*
		 * If there are long holds, we won't be able to evict
		 * the objset.
		 */
		if (dsl_dataset_long_held(ds)) {
			err = SET_ERROR(EBUSY);
			goto out;
		}

		/* Check that the snapshot name does not conflict */
		VERIFY0(dsl_dataset_get_snapname(ds));
		if (strlen(ds->ds_snapname) >= max_snap_len) {
			err = SET_ERROR(ENAMETOOLONG);
			goto out;
		}
		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
		if (err == 0) {
			/* remember the conflict but keep scanning */
			fnvlist_add_boolean(ddpa->err_ds,
			    snap->ds->ds_snapname);
			conflicting_snaps = B_TRUE;
		} else if (err != ENOENT) {
			goto out;
		}

		/* The very first snapshot does not have a deadlist */
		if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
			continue;

		dsl_deadlist_space(&ds->ds_deadlist,
		    &dlused, &dlcomp, &dluncomp);
		ddpa->used += dlused;
		ddpa->comp += dlcomp;
		ddpa->uncomp += dluncomp;
	}

	/*
	 * In order to return the full list of conflicting snapshots, we check
	 * whether there was a conflict after traversing all of them.
	 */
	if (conflicting_snaps) {
		err = SET_ERROR(EEXIST);
		goto out;
	}

	/*
	 * If we are a clone of a clone then we never reached ORIGIN,
	 * so we need to subtract out the clone origin's used space.
	 */
	if (ddpa->origin_origin) {
		ddpa->used -=
		    dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
		ddpa->comp -=
		    dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
		ddpa->uncomp -=
		    dsl_dataset_phys(ddpa->origin_origin)->
		    ds_uncompressed_bytes;
	}

	/* Check that there is enough space and limit headroom here */
	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
	    0, ss_mv_cnt, ddpa->used, ddpa->cr);
	if (err != 0)
		goto out;

	/*
	 * Compute the amounts of space that will be used by snapshots
	 * after the promotion (for both origin and clone).  For each,
	 * it is the amount of space that will be on all of their
	 * deadlists (that was not born before their new origin).
	 */
	if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		uint64_t space;

		/*
		 * Note, typically this will not be a clone of a clone,
		 * so dd_origin_txg will be < TXG_INITIAL, so
		 * these snaplist_space() -> dsl_deadlist_space_range()
		 * calls will be fast because they do not have to
		 * iterate over all bps.
		 */
		snap = list_head(&ddpa->origin_snaps);
		err = snaplist_space(&ddpa->shared_snaps,
		    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
		if (err != 0)
			goto out;

		err = snaplist_space(&ddpa->clone_snaps,
		    snap->ds->ds_dir->dd_origin_txg, &space);
		if (err != 0)
			goto out;
		ddpa->cloneusedsnap += space;
	}
	if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
	    DD_FLAG_USED_BREAKDOWN) {
		err = snaplist_space(&ddpa->origin_snaps,
		    dsl_dataset_phys(origin_ds)->ds_creation_txg,
		    &ddpa->originusedsnap);
		if (err != 0)
			goto out;
	}

out:
	/* common exit: drop the holds taken by promote_hold() */
	promote_rele(ddpa, FTAG);
	return (err);
}
2925
2926void
2927dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
2928{
2929	dsl_dataset_promote_arg_t *ddpa = arg;
2930	dsl_pool_t *dp = dmu_tx_pool(tx);
2931	dsl_dataset_t *hds;
2932	struct promotenode *snap;
2933	dsl_dataset_t *origin_ds;
2934	dsl_dataset_t *origin_head;
2935	dsl_dir_t *dd;
2936	dsl_dir_t *odd = NULL;
2937	uint64_t oldnext_obj;
2938	int64_t delta;
2939#if defined(__FreeBSD__) && defined(_KERNEL)
2940	char *oldname, *newname;
2941#endif
2942
2943	VERIFY0(promote_hold(ddpa, dp, FTAG));
2944	hds = ddpa->ddpa_clone;
2945
2946	ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
2947
2948	snap = list_head(&ddpa->shared_snaps);
2949	origin_ds = snap->ds;
2950	dd = hds->ds_dir;
2951
2952	snap = list_head(&ddpa->origin_snaps);
2953	origin_head = snap->ds;
2954
2955	/*
2956	 * We need to explicitly open odd, since origin_ds's dd will be
2957	 * changing.
2958	 */
2959	VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
2960	    NULL, FTAG, &odd));
2961
2962	/* change origin's next snap */
2963	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
2964	oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
2965	snap = list_tail(&ddpa->clone_snaps);
2966	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
2967	    origin_ds->ds_object);
2968	dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
2969
2970	/* change the origin's next clone */
2971	if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
2972		dsl_dataset_remove_from_next_clones(origin_ds,
2973		    snap->ds->ds_object, tx);
2974		VERIFY0(zap_add_int(dp->dp_meta_objset,
2975		    dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
2976		    oldnext_obj, tx));
2977	}
2978
2979	/* change origin */
2980	dmu_buf_will_dirty(dd->dd_dbuf, tx);
2981	ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
2982	dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
2983	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
2984	dmu_buf_will_dirty(odd->dd_dbuf, tx);
2985	dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
2986	origin_head->ds_dir->dd_origin_txg =
2987	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
2988
2989	/* change dd_clone entries */
2990	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2991		VERIFY0(zap_remove_int(dp->dp_meta_objset,
2992		    dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
2993		VERIFY0(zap_add_int(dp->dp_meta_objset,
2994		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
2995		    hds->ds_object, tx));
2996
2997		VERIFY0(zap_remove_int(dp->dp_meta_objset,
2998		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
2999		    origin_head->ds_object, tx));
3000		if (dsl_dir_phys(dd)->dd_clones == 0) {
3001			dsl_dir_phys(dd)->dd_clones =
3002			    zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
3003			    DMU_OT_NONE, 0, tx);
3004		}
3005		VERIFY0(zap_add_int(dp->dp_meta_objset,
3006		    dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
3007	}
3008
3009#if defined(__FreeBSD__) && defined(_KERNEL)
3010	/* Take the spa_namespace_lock early so zvol renames don't deadlock. */
3011	mutex_enter(&spa_namespace_lock);
3012
3013	oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3014	newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3015#endif
3016
3017	/* move snapshots to this dir */
3018	for (snap = list_head(&ddpa->shared_snaps); snap;
3019	    snap = list_next(&ddpa->shared_snaps, snap)) {
3020		dsl_dataset_t *ds = snap->ds;
3021
3022		/*
3023		 * Property callbacks are registered to a particular
3024		 * dsl_dir.  Since ours is changing, evict the objset
3025		 * so that they will be unregistered from the old dsl_dir.
3026		 */
3027		if (ds->ds_objset) {
3028			dmu_objset_evict(ds->ds_objset);
3029			ds->ds_objset = NULL;
3030		}
3031
3032		/* move snap name entry */
3033		VERIFY0(dsl_dataset_get_snapname(ds));
3034		VERIFY0(dsl_dataset_snap_remove(origin_head,
3035		    ds->ds_snapname, tx, B_TRUE));
3036		VERIFY0(zap_add(dp->dp_meta_objset,
3037		    dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
3038		    8, 1, &ds->ds_object, tx));
3039		dsl_fs_ss_count_adjust(hds->ds_dir, 1,
3040		    DD_FIELD_SNAPSHOT_COUNT, tx);
3041
3042		/* change containing dsl_dir */
3043		dmu_buf_will_dirty(ds->ds_dbuf, tx);
3044		ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
3045		dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
3046		ASSERT3P(ds->ds_dir, ==, odd);
3047		dsl_dir_rele(ds->ds_dir, ds);
3048		VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
3049		    NULL, ds, &ds->ds_dir));
3050
3051#if defined(__FreeBSD__) && defined(_KERNEL)
3052		dsl_dataset_name(ds, newname);
3053		zfsvfs_update_fromname(oldname, newname);
3054		zvol_rename_minors(oldname, newname);
3055#endif
3056
3057		/* move any clone references */
3058		if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
3059		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
3060			zap_cursor_t zc;
3061			zap_attribute_t za;
3062
3063			for (zap_cursor_init(&zc, dp->dp_meta_objset,
3064			    dsl_dataset_phys(ds)->ds_next_clones_obj);
3065			    zap_cursor_retrieve(&zc, &za) == 0;
3066			    zap_cursor_advance(&zc)) {
3067				dsl_dataset_t *cnds;
3068				uint64_t o;
3069
3070				if (za.za_first_integer == oldnext_obj) {
3071					/*
3072					 * We've already moved the
3073					 * origin's reference.
3074					 */
3075					continue;
3076				}
3077
3078				VERIFY0(dsl_dataset_hold_obj(dp,
3079				    za.za_first_integer, FTAG, &cnds));
3080				o = dsl_dir_phys(cnds->ds_dir)->
3081				    dd_head_dataset_obj;
3082
3083				VERIFY0(zap_remove_int(dp->dp_meta_objset,
3084				    dsl_dir_phys(odd)->dd_clones, o, tx));
3085				VERIFY0(zap_add_int(dp->dp_meta_objset,
3086				    dsl_dir_phys(dd)->dd_clones, o, tx));
3087				dsl_dataset_rele(cnds, FTAG);
3088			}
3089			zap_cursor_fini(&zc);
3090		}
3091
3092		ASSERT(!dsl_prop_hascb(ds));
3093	}
3094
3095#if defined(__FreeBSD__) && defined(_KERNEL)
3096	mutex_exit(&spa_namespace_lock);
3097
3098	kmem_free(newname, MAXPATHLEN);
3099	kmem_free(oldname, MAXPATHLEN);
3100#endif
3101	/*
3102	 * Change space accounting.
3103	 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
3104	 * both be valid, or both be 0 (resulting in delta == 0).  This
3105	 * is true for each of {clone,origin} independently.
3106	 */
3107
3108	delta = ddpa->cloneusedsnap -
3109	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
3110	ASSERT3S(delta, >=, 0);
3111	ASSERT3U(ddpa->used, >=, delta);
3112	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
3113	dsl_dir_diduse_space(dd, DD_USED_HEAD,
3114	    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
3115
3116	delta = ddpa->originusedsnap -
3117	    dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
3118	ASSERT3S(delta, <=, 0);
3119	ASSERT3U(ddpa->used, >=, -delta);
3120	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
3121	dsl_dir_diduse_space(odd, DD_USED_HEAD,
3122	    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
3123
3124	dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
3125
3126	/* log history record */
3127	spa_history_log_internal_ds(hds, "promote", tx, "");
3128
3129	dsl_dir_rele(odd, FTAG);
3130	promote_rele(ddpa, FTAG);
3131}
3132
3133/*
3134 * Make a list of dsl_dataset_t's for the snapshots between first_obj
3135 * (exclusive) and last_obj (inclusive).  The list will be in reverse
3136 * order (last_obj will be the list_head()).  If first_obj == 0, do all
3137 * snapshots back to this dataset's origin.
3138 */
3139static int
3140snaplist_make(dsl_pool_t *dp,
3141    uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
3142{
3143	uint64_t obj = last_obj;
3144
3145	list_create(l, sizeof (struct promotenode),
3146	    offsetof(struct promotenode, link));
3147
3148	while (obj != first_obj) {
3149		dsl_dataset_t *ds;
3150		struct promotenode *snap;
3151		int err;
3152
3153		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
3154		ASSERT(err != ENOENT);
3155		if (err != 0)
3156			return (err);
3157
3158		if (first_obj == 0)
3159			first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
3160
3161		snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
3162		snap->ds = ds;
3163		list_insert_tail(l, snap);
3164		obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
3165	}
3166
3167	return (0);
3168}
3169
3170static int
3171snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
3172{
3173	struct promotenode *snap;
3174
3175	*spacep = 0;
3176	for (snap = list_head(l); snap; snap = list_next(l, snap)) {
3177		uint64_t used, comp, uncomp;
3178		dsl_deadlist_space_range(&snap->ds->ds_deadlist,
3179		    mintxg, UINT64_MAX, &used, &comp, &uncomp);
3180		*spacep += used;
3181	}
3182	return (0);
3183}
3184
3185static void
3186snaplist_destroy(list_t *l, void *tag)
3187{
3188	struct promotenode *snap;
3189
3190	if (l == NULL || !list_link_active(&l->list_head))
3191		return;
3192
3193	while ((snap = list_tail(l)) != NULL) {
3194		list_remove(l, snap);
3195		dsl_dataset_rele(snap->ds, tag);
3196		kmem_free(snap, sizeof (*snap));
3197	}
3198	list_destroy(l);
3199}
3200
3201static int
3202promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
3203{
3204	int error;
3205	dsl_dir_t *dd;
3206	struct promotenode *snap;
3207
3208	error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
3209	    &ddpa->ddpa_clone);
3210	if (error != 0)
3211		return (error);
3212	dd = ddpa->ddpa_clone->ds_dir;
3213
3214	if (ddpa->ddpa_clone->ds_is_snapshot ||
3215	    !dsl_dir_is_clone(dd)) {
3216		dsl_dataset_rele(ddpa->ddpa_clone, tag);
3217		return (SET_ERROR(EINVAL));
3218	}
3219
3220	error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
3221	    &ddpa->shared_snaps, tag);
3222	if (error != 0)
3223		goto out;
3224
3225	error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
3226	    &ddpa->clone_snaps, tag);
3227	if (error != 0)
3228		goto out;
3229
3230	snap = list_head(&ddpa->shared_snaps);
3231	ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
3232	error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
3233	    dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
3234	    &ddpa->origin_snaps, tag);
3235	if (error != 0)
3236		goto out;
3237
3238	if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
3239		error = dsl_dataset_hold_obj(dp,
3240		    dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
3241		    tag, &ddpa->origin_origin);
3242		if (error != 0)
3243			goto out;
3244	}
3245out:
3246	if (error != 0)
3247		promote_rele(ddpa, tag);
3248	return (error);
3249}
3250
3251static void
3252promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
3253{
3254	snaplist_destroy(&ddpa->shared_snaps, tag);
3255	snaplist_destroy(&ddpa->clone_snaps, tag);
3256	snaplist_destroy(&ddpa->origin_snaps, tag);
3257	if (ddpa->origin_origin != NULL)
3258		dsl_dataset_rele(ddpa->origin_origin, tag);
3259	dsl_dataset_rele(ddpa->ddpa_clone, tag);
3260}
3261
3262/*
3263 * Promote a clone.
3264 *
3265 * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
3266 * in with the name.  (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.)
3267 */
3268int
3269dsl_dataset_promote(const char *name, char *conflsnap)
3270{
3271	dsl_dataset_promote_arg_t ddpa = { 0 };
3272	uint64_t numsnaps;
3273	int error;
3274	nvpair_t *snap_pair;
3275	objset_t *os;
3276
3277	/*
3278	 * We will modify space proportional to the number of
3279	 * snapshots.  Compute numsnaps.
3280	 */
3281	error = dmu_objset_hold(name, FTAG, &os);
3282	if (error != 0)
3283		return (error);
3284	error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
3285	    dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
3286	    &numsnaps);
3287	dmu_objset_rele(os, FTAG);
3288	if (error != 0)
3289		return (error);
3290
3291	ddpa.ddpa_clonename = name;
3292	ddpa.err_ds = fnvlist_alloc();
3293	ddpa.cr = CRED();
3294
3295	error = dsl_sync_task(name, dsl_dataset_promote_check,
3296	    dsl_dataset_promote_sync, &ddpa,
3297	    2 + numsnaps, ZFS_SPACE_CHECK_RESERVED);
3298
3299	/*
3300	 * Return the first conflicting snapshot found.
3301	 */
3302	snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL);
3303	if (snap_pair != NULL && conflsnap != NULL)
3304		(void) strcpy(conflsnap, nvpair_name(snap_pair));
3305
3306	fnvlist_free(ddpa.err_ds);
3307	return (error);
3308}
3309
3310int
3311dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
3312    dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
3313{
3314	/*
3315	 * "slack" factor for received datasets with refquota set on them.
3316	 * See the bottom of this function for details on its use.
3317	 */
3318	uint64_t refquota_slack = DMU_MAX_ACCESS * spa_asize_inflation;
3319	int64_t unused_refres_delta;
3320
3321	/* they should both be heads */
3322	if (clone->ds_is_snapshot ||
3323	    origin_head->ds_is_snapshot)
3324		return (SET_ERROR(EINVAL));
3325
3326	/* if we are not forcing, the branch point should be just before them */
3327	if (!force && clone->ds_prev != origin_head->ds_prev)
3328		return (SET_ERROR(EINVAL));
3329
3330	/* clone should be the clone (unless they are unrelated) */
3331	if (clone->ds_prev != NULL &&
3332	    clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
3333	    origin_head->ds_dir != clone->ds_prev->ds_dir)
3334		return (SET_ERROR(EINVAL));
3335
3336	/* the clone should be a child of the origin */
3337	if (clone->ds_dir->dd_parent != origin_head->ds_dir)
3338		return (SET_ERROR(EINVAL));
3339
3340	/* origin_head shouldn't be modified unless 'force' */
3341	if (!force &&
3342	    dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
3343		return (SET_ERROR(ETXTBSY));
3344
3345	/* origin_head should have no long holds (e.g. is not mounted) */
3346	if (dsl_dataset_handoff_check(origin_head, owner, tx))
3347		return (SET_ERROR(EBUSY));
3348
3349	/* check amount of any unconsumed refreservation */
3350	unused_refres_delta =
3351	    (int64_t)MIN(origin_head->ds_reserved,
3352	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
3353	    (int64_t)MIN(origin_head->ds_reserved,
3354	    dsl_dataset_phys(clone)->ds_unique_bytes);
3355
3356	if (unused_refres_delta > 0 &&
3357	    unused_refres_delta >
3358	    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
3359		return (SET_ERROR(ENOSPC));
3360
3361	/*
3362	 * The clone can't be too much over the head's refquota.
3363	 *
3364	 * To ensure that the entire refquota can be used, we allow one
3365	 * transaction to exceed the the refquota.  Therefore, this check
3366	 * needs to also allow for the space referenced to be more than the
3367	 * refquota.  The maximum amount of space that one transaction can use
3368	 * on disk is DMU_MAX_ACCESS * spa_asize_inflation.  Allowing this
3369	 * overage ensures that we are able to receive a filesystem that
3370	 * exceeds the refquota on the source system.
3371	 *
3372	 * So that overage is the refquota_slack we use below.
3373	 */
3374	if (origin_head->ds_quota != 0 &&
3375	    dsl_dataset_phys(clone)->ds_referenced_bytes >
3376	    origin_head->ds_quota + refquota_slack)
3377		return (SET_ERROR(EDQUOT));
3378
3379	return (0);
3380}
3381
3382void
3383dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
3384    dsl_dataset_t *origin_head, dmu_tx_t *tx)
3385{
3386	dsl_pool_t *dp = dmu_tx_pool(tx);
3387	int64_t unused_refres_delta;
3388
3389	ASSERT(clone->ds_reserved == 0);
3390	/*
3391	 * NOTE: On DEBUG kernels there could be a race between this and
3392	 * the check function if spa_asize_inflation is adjusted...
3393	 */
3394	ASSERT(origin_head->ds_quota == 0 ||
3395	    dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota +
3396	    DMU_MAX_ACCESS * spa_asize_inflation);
3397	ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
3398
3399	/*
3400	 * Swap per-dataset feature flags.
3401	 */
3402	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
3403		if (!(spa_feature_table[f].fi_flags &
3404		    ZFEATURE_FLAG_PER_DATASET)) {
3405			ASSERT(!clone->ds_feature_inuse[f]);
3406			ASSERT(!origin_head->ds_feature_inuse[f]);
3407			continue;
3408		}
3409
3410		boolean_t clone_inuse = clone->ds_feature_inuse[f];
3411		boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f];
3412
3413		if (clone_inuse) {
3414			dsl_dataset_deactivate_feature(clone->ds_object, f, tx);
3415			clone->ds_feature_inuse[f] = B_FALSE;
3416		}
3417		if (origin_head_inuse) {
3418			dsl_dataset_deactivate_feature(origin_head->ds_object,
3419			    f, tx);
3420			origin_head->ds_feature_inuse[f] = B_FALSE;
3421		}
3422		if (clone_inuse) {
3423			dsl_dataset_activate_feature(origin_head->ds_object,
3424			    f, tx);
3425			origin_head->ds_feature_inuse[f] = B_TRUE;
3426		}
3427		if (origin_head_inuse) {
3428			dsl_dataset_activate_feature(clone->ds_object, f, tx);
3429			clone->ds_feature_inuse[f] = B_TRUE;
3430		}
3431	}
3432
3433	dmu_buf_will_dirty(clone->ds_dbuf, tx);
3434	dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
3435
3436	if (clone->ds_objset != NULL) {
3437		dmu_objset_evict(clone->ds_objset);
3438		clone->ds_objset = NULL;
3439	}
3440
3441	if (origin_head->ds_objset != NULL) {
3442		dmu_objset_evict(origin_head->ds_objset);
3443		origin_head->ds_objset = NULL;
3444	}
3445
3446	unused_refres_delta =
3447	    (int64_t)MIN(origin_head->ds_reserved,
3448	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
3449	    (int64_t)MIN(origin_head->ds_reserved,
3450	    dsl_dataset_phys(clone)->ds_unique_bytes);
3451
3452	/*
3453	 * Reset origin's unique bytes, if it exists.
3454	 */
3455	if (clone->ds_prev) {
3456		dsl_dataset_t *origin = clone->ds_prev;
3457		uint64_t comp, uncomp;
3458
3459		dmu_buf_will_dirty(origin->ds_dbuf, tx);
3460		dsl_deadlist_space_range(&clone->ds_deadlist,
3461		    dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
3462		    &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
3463	}
3464
3465	/* swap blkptrs */
3466	{
3467		rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG);
3468		rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG);
3469		blkptr_t tmp;
3470		tmp = dsl_dataset_phys(origin_head)->ds_bp;
3471		dsl_dataset_phys(origin_head)->ds_bp =
3472		    dsl_dataset_phys(clone)->ds_bp;
3473		dsl_dataset_phys(clone)->ds_bp = tmp;
3474		rrw_exit(&origin_head->ds_bp_rwlock, FTAG);
3475		rrw_exit(&clone->ds_bp_rwlock, FTAG);
3476	}
3477
3478	/* set dd_*_bytes */
3479	{
3480		int64_t dused, dcomp, duncomp;
3481		uint64_t cdl_used, cdl_comp, cdl_uncomp;
3482		uint64_t odl_used, odl_comp, odl_uncomp;
3483
3484		ASSERT3U(dsl_dir_phys(clone->ds_dir)->
3485		    dd_used_breakdown[DD_USED_SNAP], ==, 0);
3486
3487		dsl_deadlist_space(&clone->ds_deadlist,
3488		    &cdl_used, &cdl_comp, &cdl_uncomp);
3489		dsl_deadlist_space(&origin_head->ds_deadlist,
3490		    &odl_used, &odl_comp, &odl_uncomp);
3491
3492		dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
3493		    cdl_used -
3494		    (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
3495		    odl_used);
3496		dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
3497		    cdl_comp -
3498		    (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
3499		    odl_comp);
3500		duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
3501		    cdl_uncomp -
3502		    (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
3503		    odl_uncomp);
3504
3505		dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
3506		    dused, dcomp, duncomp, tx);
3507		dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
3508		    -dused, -dcomp, -duncomp, tx);
3509
3510		/*
3511		 * The difference in the space used by snapshots is the
3512		 * difference in snapshot space due to the head's
3513		 * deadlist (since that's the only thing that's
3514		 * changing that affects the snapused).
3515		 */
3516		dsl_deadlist_space_range(&clone->ds_deadlist,
3517		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
3518		    &cdl_used, &cdl_comp, &cdl_uncomp);
3519		dsl_deadlist_space_range(&origin_head->ds_deadlist,
3520		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
3521		    &odl_used, &odl_comp, &odl_uncomp);
3522		dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
3523		    DD_USED_HEAD, DD_USED_SNAP, NULL);
3524	}
3525
3526	/* swap ds_*_bytes */
3527	SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
3528	    dsl_dataset_phys(clone)->ds_referenced_bytes);
3529	SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
3530	    dsl_dataset_phys(clone)->ds_compressed_bytes);
3531	SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
3532	    dsl_dataset_phys(clone)->ds_uncompressed_bytes);
3533	SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
3534	    dsl_dataset_phys(clone)->ds_unique_bytes);
3535
3536	/* apply any parent delta for change in unconsumed refreservation */
3537	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
3538	    unused_refres_delta, 0, 0, tx);
3539
3540	/*
3541	 * Swap deadlists.
3542	 */
3543	dsl_deadlist_close(&clone->ds_deadlist);
3544	dsl_deadlist_close(&origin_head->ds_deadlist);
3545	SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
3546	    dsl_dataset_phys(clone)->ds_deadlist_obj);
3547	dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
3548	    dsl_dataset_phys(clone)->ds_deadlist_obj);
3549	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
3550	    dsl_dataset_phys(origin_head)->ds_deadlist_obj);
3551
3552	dsl_scan_ds_clone_swapped(origin_head, clone, tx);
3553
3554	spa_history_log_internal_ds(clone, "clone swap", tx,
3555	    "parent=%s", origin_head->ds_dir->dd_myname);
3556}
3557
3558/*
3559 * Given a pool name and a dataset object number in that pool,
3560 * return the name of that dataset.
3561 */
3562int
3563dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
3564{
3565	dsl_pool_t *dp;
3566	dsl_dataset_t *ds;
3567	int error;
3568
3569	error = dsl_pool_hold(pname, FTAG, &dp);
3570	if (error != 0)
3571		return (error);
3572
3573	error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
3574	if (error == 0) {
3575		dsl_dataset_name(ds, buf);
3576		dsl_dataset_rele(ds, FTAG);
3577	}
3578	dsl_pool_rele(dp, FTAG);
3579
3580	return (error);
3581}
3582
3583int
3584dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
3585    uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
3586{
3587	int error = 0;
3588
3589	ASSERT3S(asize, >, 0);
3590
3591	/*
3592	 * *ref_rsrv is the portion of asize that will come from any
3593	 * unconsumed refreservation space.
3594	 */
3595	*ref_rsrv = 0;
3596
3597	mutex_enter(&ds->ds_lock);
3598	/*
3599	 * Make a space adjustment for reserved bytes.
3600	 */
3601	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
3602		ASSERT3U(*used, >=,
3603		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
3604		*used -=
3605		    (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
3606		*ref_rsrv =
3607		    asize - MIN(asize, parent_delta(ds, asize + inflight));
3608	}
3609
3610	if (!check_quota || ds->ds_quota == 0) {
3611		mutex_exit(&ds->ds_lock);
3612		return (0);
3613	}
3614	/*
3615	 * If they are requesting more space, and our current estimate
3616	 * is over quota, they get to try again unless the actual
3617	 * on-disk is over quota and there are no pending changes (which
3618	 * may free up space for us).
3619	 */
3620	if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
3621	    ds->ds_quota) {
3622		if (inflight > 0 ||
3623		    dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
3624			error = SET_ERROR(ERESTART);
3625		else
3626			error = SET_ERROR(EDQUOT);
3627	}
3628	mutex_exit(&ds->ds_lock);
3629
3630	return (error);
3631}
3632
/*
 * Argument bundle passed to the check and sync callbacks of the sync
 * tasks that set refquota and refreservation.
 */
typedef struct dsl_dataset_set_qr_arg {
	/* name of the dataset the callbacks hold */
	const char *ddsqra_name;
	/* property source handed to the dsl_prop routines */
	zprop_source_t ddsqra_source;
	/* requested property value */
	uint64_t ddsqra_value;
} dsl_dataset_set_qr_arg_t;
3638
3639
3640/* ARGSUSED */
3641static int
3642dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
3643{
3644	dsl_dataset_set_qr_arg_t *ddsqra = arg;
3645	dsl_pool_t *dp = dmu_tx_pool(tx);
3646	dsl_dataset_t *ds;
3647	int error;
3648	uint64_t newval;
3649
3650	if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
3651		return (SET_ERROR(ENOTSUP));
3652
3653	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
3654	if (error != 0)
3655		return (error);
3656
3657	if (ds->ds_is_snapshot) {
3658		dsl_dataset_rele(ds, FTAG);
3659		return (SET_ERROR(EINVAL));
3660	}
3661
3662	error = dsl_prop_predict(ds->ds_dir,
3663	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
3664	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
3665	if (error != 0) {
3666		dsl_dataset_rele(ds, FTAG);
3667		return (error);
3668	}
3669
3670	if (newval == 0) {
3671		dsl_dataset_rele(ds, FTAG);
3672		return (0);
3673	}
3674
3675	if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
3676	    newval < ds->ds_reserved) {
3677		dsl_dataset_rele(ds, FTAG);
3678		return (SET_ERROR(ENOSPC));
3679	}
3680
3681	dsl_dataset_rele(ds, FTAG);
3682	return (0);
3683}
3684
3685static void
3686dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
3687{
3688	dsl_dataset_set_qr_arg_t *ddsqra = arg;
3689	dsl_pool_t *dp = dmu_tx_pool(tx);
3690	dsl_dataset_t *ds;
3691	uint64_t newval;
3692
3693	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
3694
3695	dsl_prop_set_sync_impl(ds,
3696	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
3697	    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
3698	    &ddsqra->ddsqra_value, tx);
3699
3700	VERIFY0(dsl_prop_get_int_ds(ds,
3701	    zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
3702
3703	if (ds->ds_quota != newval) {
3704		dmu_buf_will_dirty(ds->ds_dbuf, tx);
3705		ds->ds_quota = newval;
3706	}
3707	dsl_dataset_rele(ds, FTAG);
3708}
3709
3710int
3711dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
3712    uint64_t refquota)
3713{
3714	dsl_dataset_set_qr_arg_t ddsqra;
3715
3716	ddsqra.ddsqra_name = dsname;
3717	ddsqra.ddsqra_source = source;
3718	ddsqra.ddsqra_value = refquota;
3719
3720	return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
3721	    dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
3722}
3723
3724static int
3725dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
3726{
3727	dsl_dataset_set_qr_arg_t *ddsqra = arg;
3728	dsl_pool_t *dp = dmu_tx_pool(tx);
3729	dsl_dataset_t *ds;
3730	int error;
3731	uint64_t newval, unique;
3732
3733	if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
3734		return (SET_ERROR(ENOTSUP));
3735
3736	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
3737	if (error != 0)
3738		return (error);
3739
3740	if (ds->ds_is_snapshot) {
3741		dsl_dataset_rele(ds, FTAG);
3742		return (SET_ERROR(EINVAL));
3743	}
3744
3745	error = dsl_prop_predict(ds->ds_dir,
3746	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
3747	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
3748	if (error != 0) {
3749		dsl_dataset_rele(ds, FTAG);
3750		return (error);
3751	}
3752
3753	/*
3754	 * If we are doing the preliminary check in open context, the
3755	 * space estimates may be inaccurate.
3756	 */
3757	if (!dmu_tx_is_syncing(tx)) {
3758		dsl_dataset_rele(ds, FTAG);
3759		return (0);
3760	}
3761
3762	mutex_enter(&ds->ds_lock);
3763	if (!DS_UNIQUE_IS_ACCURATE(ds))
3764		dsl_dataset_recalc_head_uniq(ds);
3765	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
3766	mutex_exit(&ds->ds_lock);
3767
3768	if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
3769		uint64_t delta = MAX(unique, newval) -
3770		    MAX(unique, ds->ds_reserved);
3771
3772		if (delta >
3773		    dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
3774		    (ds->ds_quota > 0 && newval > ds->ds_quota)) {
3775			dsl_dataset_rele(ds, FTAG);
3776			return (SET_ERROR(ENOSPC));
3777		}
3778	}
3779
3780	dsl_dataset_rele(ds, FTAG);
3781	return (0);
3782}
3783
3784void
3785dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
3786    zprop_source_t source, uint64_t value, dmu_tx_t *tx)
3787{
3788	uint64_t newval;
3789	uint64_t unique;
3790	int64_t delta;
3791
3792	dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
3793	    source, sizeof (value), 1, &value, tx);
3794
3795	VERIFY0(dsl_prop_get_int_ds(ds,
3796	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
3797
3798	dmu_buf_will_dirty(ds->ds_dbuf, tx);
3799	mutex_enter(&ds->ds_dir->dd_lock);
3800	mutex_enter(&ds->ds_lock);
3801	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
3802	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
3803	delta = MAX(0, (int64_t)(newval - unique)) -
3804	    MAX(0, (int64_t)(ds->ds_reserved - unique));
3805	ds->ds_reserved = newval;
3806	mutex_exit(&ds->ds_lock);
3807
3808	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
3809	mutex_exit(&ds->ds_dir->dd_lock);
3810}
3811
3812static void
3813dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
3814{
3815	dsl_dataset_set_qr_arg_t *ddsqra = arg;
3816	dsl_pool_t *dp = dmu_tx_pool(tx);
3817	dsl_dataset_t *ds;
3818
3819	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
3820	dsl_dataset_set_refreservation_sync_impl(ds,
3821	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
3822	dsl_dataset_rele(ds, FTAG);
3823}
3824
3825int
3826dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
3827    uint64_t refreservation)
3828{
3829	dsl_dataset_set_qr_arg_t ddsqra;
3830
3831	ddsqra.ddsqra_name = dsname;
3832	ddsqra.ddsqra_source = source;
3833	ddsqra.ddsqra_value = refreservation;
3834
3835	return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
3836	    dsl_dataset_set_refreservation_sync, &ddsqra,
3837	    0, ZFS_SPACE_CHECK_NONE));
3838}
3839
3840/*
3841 * Return (in *usedp) the amount of space written in new that is not
3842 * present in oldsnap.  New may be a snapshot or the head.  Old must be
3843 * a snapshot before new, in new's filesystem (or its origin).  If not then
3844 * fail and return EINVAL.
3845 *
3846 * The written space is calculated by considering two components:  First, we
3847 * ignore any freed space, and calculate the written as new's used space
3848 * minus old's used space.  Next, we add in the amount of space that was freed
3849 * between the two snapshots, thus reducing new's used space relative to old's.
3850 * Specifically, this is the space that was born before old->ds_creation_txg,
3851 * and freed before new (ie. on new's deadlist or a previous deadlist).
3852 *
3853 * space freed                         [---------------------]
3854 * snapshots                       ---O-------O--------O-------O------
3855 *                                         oldsnap            new
3856 */
3857int
3858dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
3859    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
3860{
3861	int err = 0;
3862	uint64_t snapobj;
3863	dsl_pool_t *dp = new->ds_dir->dd_pool;
3864
3865	ASSERT(dsl_pool_config_held(dp));
3866
3867	*usedp = 0;
3868	*usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
3869	*usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
3870
3871	*compp = 0;
3872	*compp += dsl_dataset_phys(new)->ds_compressed_bytes;
3873	*compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
3874
3875	*uncompp = 0;
3876	*uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
3877	*uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
3878
3879	snapobj = new->ds_object;
3880	while (snapobj != oldsnap->ds_object) {
3881		dsl_dataset_t *snap;
3882		uint64_t used, comp, uncomp;
3883
3884		if (snapobj == new->ds_object) {
3885			snap = new;
3886		} else {
3887			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
3888			if (err != 0)
3889				break;
3890		}
3891
3892		if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
3893		    dsl_dataset_phys(oldsnap)->ds_creation_txg) {
3894			/*
3895			 * The blocks in the deadlist can not be born after
3896			 * ds_prev_snap_txg, so get the whole deadlist space,
3897			 * which is more efficient (especially for old-format
3898			 * deadlists).  Unfortunately the deadlist code
3899			 * doesn't have enough information to make this
3900			 * optimization itself.
3901			 */
3902			dsl_deadlist_space(&snap->ds_deadlist,
3903			    &used, &comp, &uncomp);
3904		} else {
3905			dsl_deadlist_space_range(&snap->ds_deadlist,
3906			    0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
3907			    &used, &comp, &uncomp);
3908		}
3909		*usedp += used;
3910		*compp += comp;
3911		*uncompp += uncomp;
3912
3913		/*
3914		 * If we get to the beginning of the chain of snapshots
3915		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
3916		 * was not a snapshot of/before new.
3917		 */
3918		snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
3919		if (snap != new)
3920			dsl_dataset_rele(snap, FTAG);
3921		if (snapobj == 0) {
3922			err = SET_ERROR(EINVAL);
3923			break;
3924		}
3925
3926	}
3927	return (err);
3928}
3929
3930/*
3931 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
3932 * lastsnap, and all snapshots in between are deleted.
3933 *
3934 * blocks that would be freed            [---------------------------]
3935 * snapshots                       ---O-------O--------O-------O--------O
3936 *                                        firstsnap        lastsnap
3937 *
3938 * This is the set of blocks that were born after the snap before firstsnap,
3939 * (birth > firstsnap->prev_snap_txg) and died before the snap after the
3940 * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
3941 * We calculate this by iterating over the relevant deadlists (from the snap
3942 * after lastsnap, backward to the snap after firstsnap), summing up the
3943 * space on the deadlist that was born after the snap before firstsnap.
3944 */
3945int
3946dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
3947    dsl_dataset_t *lastsnap,
3948    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
3949{
3950	int err = 0;
3951	uint64_t snapobj;
3952	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
3953
3954	ASSERT(firstsnap->ds_is_snapshot);
3955	ASSERT(lastsnap->ds_is_snapshot);
3956
3957	/*
3958	 * Check that the snapshots are in the same dsl_dir, and firstsnap
3959	 * is before lastsnap.
3960	 */
3961	if (firstsnap->ds_dir != lastsnap->ds_dir ||
3962	    dsl_dataset_phys(firstsnap)->ds_creation_txg >
3963	    dsl_dataset_phys(lastsnap)->ds_creation_txg)
3964		return (SET_ERROR(EINVAL));
3965
3966	*usedp = *compp = *uncompp = 0;
3967
3968	snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
3969	while (snapobj != firstsnap->ds_object) {
3970		dsl_dataset_t *ds;
3971		uint64_t used, comp, uncomp;
3972
3973		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
3974		if (err != 0)
3975			break;
3976
3977		dsl_deadlist_space_range(&ds->ds_deadlist,
3978		    dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
3979		    &used, &comp, &uncomp);
3980		*usedp += used;
3981		*compp += comp;
3982		*uncompp += uncomp;
3983
3984		snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
3985		ASSERT3U(snapobj, !=, 0);
3986		dsl_dataset_rele(ds, FTAG);
3987	}
3988	return (err);
3989}
3990
3991/*
3992 * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
3993 * For example, they could both be snapshots of the same filesystem, and
3994 * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
3995 * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
3996 * filesystem.  Or 'earlier' could be the origin's origin.
3997 *
3998 * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
3999 */
4000boolean_t
4001dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
4002    uint64_t earlier_txg)
4003{
4004	dsl_pool_t *dp = later->ds_dir->dd_pool;
4005	int error;
4006	boolean_t ret;
4007
4008	ASSERT(dsl_pool_config_held(dp));
4009	ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
4010
4011	if (earlier_txg == 0)
4012		earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
4013
4014	if (later->ds_is_snapshot &&
4015	    earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
4016		return (B_FALSE);
4017
4018	if (later->ds_dir == earlier->ds_dir)
4019		return (B_TRUE);
4020	if (!dsl_dir_is_clone(later->ds_dir))
4021		return (B_FALSE);
4022
4023	if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
4024		return (B_TRUE);
4025	dsl_dataset_t *origin;
4026	error = dsl_dataset_hold_obj(dp,
4027	    dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
4028	if (error != 0)
4029		return (B_FALSE);
4030	ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
4031	dsl_dataset_rele(origin, FTAG);
4032	return (ret);
4033}
4034
4035void
4036dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
4037{
4038	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
4039	dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
4040}
4041
4042boolean_t
4043dsl_dataset_is_zapified(dsl_dataset_t *ds)
4044{
4045	dmu_object_info_t doi;
4046
4047	dmu_object_info_from_db(ds->ds_dbuf, &doi);
4048	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
4049}
4050
4051boolean_t
4052dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
4053{
4054	return (dsl_dataset_is_zapified(ds) &&
4055	    zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
4056	    ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
4057}
4058