dsl_dataset.c revision 290756
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
24 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
25 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 RackTop Systems.
27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28 */
29
30#include <sys/dmu_objset.h>
31#include <sys/dsl_dataset.h>
32#include <sys/dsl_dir.h>
33#include <sys/dsl_prop.h>
34#include <sys/dsl_synctask.h>
35#include <sys/dmu_traverse.h>
36#include <sys/dmu_impl.h>
37#include <sys/dmu_send.h>
38#include <sys/dmu_tx.h>
39#include <sys/arc.h>
40#include <sys/zio.h>
41#include <sys/zap.h>
42#include <sys/zfeature.h>
43#include <sys/unique.h>
44#include <sys/zfs_context.h>
45#include <sys/zfs_ioctl.h>
46#include <sys/spa.h>
47#include <sys/zfs_znode.h>
48#include <sys/zfs_onexit.h>
49#include <sys/zvol.h>
50#include <sys/dsl_scan.h>
51#include <sys/dsl_deadlist.h>
52#include <sys/dsl_destroy.h>
53#include <sys/dsl_userhold.h>
54#include <sys/dsl_bookmark.h>
55#include <sys/zio_compress.h>
56#include <zfs_fletcher.h>
57
58SYSCTL_DECL(_vfs_zfs);
59
60/*
61 * The SPA supports block sizes up to 16MB.  However, very large blocks
62 * can have an impact on i/o latency (e.g. tying up a spinning disk for
63 * ~300ms), and also potentially on the memory allocator.  Therefore,
64 * we do not allow the recordsize to be set larger than zfs_max_recordsize
65 * (default 1MB).  Larger blocks can be created by changing this tunable,
66 * and pools with larger blocks can always be imported and used, regardless
67 * of this setting.
68 */
69int zfs_max_recordsize = 1 * 1024 * 1024;
70SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
71    &zfs_max_recordsize, 0,
72    "Maximum block size.  Expect dragons when tuning this.");
73
/*
 * Exchange the contents of two uint64_t lvalues.  Each argument is
 * evaluated twice, so neither may have side effects.  The expansion is a
 * bare brace block, not a do/while statement.
 */
#define	SWITCH64(x, y) \
	{ \
		uint64_t switch64_saved = (x); \
		(x) = (y); \
		(y) = switch64_saved; \
	}
80
81#define	DS_REF_MAX	(1ULL << 62)
82
83extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
84
85/*
86 * Figure out how much of this delta should be propogated to the dsl_dir
87 * layer.  If there's a refreservation, that space has already been
88 * partially accounted for in our ancestors.
89 */
90static int64_t
91parent_delta(dsl_dataset_t *ds, int64_t delta)
92{
93	dsl_dataset_phys_t *ds_phys;
94	uint64_t old_bytes, new_bytes;
95
96	if (ds->ds_reserved == 0)
97		return (delta);
98
99	ds_phys = dsl_dataset_phys(ds);
100	old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
101	new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
102
103	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
104	return (new_bytes - old_bytes);
105}
106
107void
108dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
109{
110	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
111	int compressed = BP_GET_PSIZE(bp);
112	int uncompressed = BP_GET_UCSIZE(bp);
113	int64_t delta;
114
115	dprintf_bp(bp, "ds=%p", ds);
116
117	ASSERT(dmu_tx_is_syncing(tx));
118	/* It could have been compressed away to nothing */
119	if (BP_IS_HOLE(bp))
120		return;
121	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
122	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
123	if (ds == NULL) {
124		dsl_pool_mos_diduse_space(tx->tx_pool,
125		    used, compressed, uncompressed);
126		return;
127	}
128
129	dmu_buf_will_dirty(ds->ds_dbuf, tx);
130	mutex_enter(&ds->ds_lock);
131	delta = parent_delta(ds, used);
132	dsl_dataset_phys(ds)->ds_referenced_bytes += used;
133	dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
134	dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
135	dsl_dataset_phys(ds)->ds_unique_bytes += used;
136	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
137		ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
138		    B_TRUE;
139	}
140	mutex_exit(&ds->ds_lock);
141	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
142	    compressed, uncompressed, tx);
143	dsl_dir_transfer_space(ds->ds_dir, used - delta,
144	    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
145}
146
147int
148dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
149    boolean_t async)
150{
151	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
152	int compressed = BP_GET_PSIZE(bp);
153	int uncompressed = BP_GET_UCSIZE(bp);
154
155	if (BP_IS_HOLE(bp))
156		return (0);
157
158	ASSERT(dmu_tx_is_syncing(tx));
159	ASSERT(bp->blk_birth <= tx->tx_txg);
160
161	if (ds == NULL) {
162		dsl_free(tx->tx_pool, tx->tx_txg, bp);
163		dsl_pool_mos_diduse_space(tx->tx_pool,
164		    -used, -compressed, -uncompressed);
165		return (used);
166	}
167	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
168
169	ASSERT(!ds->ds_is_snapshot);
170	dmu_buf_will_dirty(ds->ds_dbuf, tx);
171
172	if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
173		int64_t delta;
174
175		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
176		dsl_free(tx->tx_pool, tx->tx_txg, bp);
177
178		mutex_enter(&ds->ds_lock);
179		ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
180		    !DS_UNIQUE_IS_ACCURATE(ds));
181		delta = parent_delta(ds, -used);
182		dsl_dataset_phys(ds)->ds_unique_bytes -= used;
183		mutex_exit(&ds->ds_lock);
184		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
185		    delta, -compressed, -uncompressed, tx);
186		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
187		    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
188	} else {
189		dprintf_bp(bp, "putting on dead list: %s", "");
190		if (async) {
191			/*
192			 * We are here as part of zio's write done callback,
193			 * which means we're a zio interrupt thread.  We can't
194			 * call dsl_deadlist_insert() now because it may block
195			 * waiting for I/O.  Instead, put bp on the deferred
196			 * queue and let dsl_pool_sync() finish the job.
197			 */
198			bplist_append(&ds->ds_pending_deadlist, bp);
199		} else {
200			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
201		}
202		ASSERT3U(ds->ds_prev->ds_object, ==,
203		    dsl_dataset_phys(ds)->ds_prev_snap_obj);
204		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
205		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
206		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
207		    ds->ds_object && bp->blk_birth >
208		    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
209			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
210			mutex_enter(&ds->ds_prev->ds_lock);
211			dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
212			mutex_exit(&ds->ds_prev->ds_lock);
213		}
214		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
215			dsl_dir_transfer_space(ds->ds_dir, used,
216			    DD_USED_HEAD, DD_USED_SNAP, tx);
217		}
218	}
219	mutex_enter(&ds->ds_lock);
220	ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
221	dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
222	ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
223	dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
224	ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
225	dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
226	mutex_exit(&ds->ds_lock);
227
228	return (used);
229}
230
231uint64_t
232dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
233{
234	uint64_t trysnap = 0;
235
236	if (ds == NULL)
237		return (0);
238	/*
239	 * The snapshot creation could fail, but that would cause an
240	 * incorrect FALSE return, which would only result in an
241	 * overestimation of the amount of space that an operation would
242	 * consume, which is OK.
243	 *
244	 * There's also a small window where we could miss a pending
245	 * snapshot, because we could set the sync task in the quiescing
246	 * phase.  So this should only be used as a guess.
247	 */
248	if (ds->ds_trysnap_txg >
249	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
250		trysnap = ds->ds_trysnap_txg;
251	return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap));
252}
253
254boolean_t
255dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
256    uint64_t blk_birth)
257{
258	if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
259	    (bp != NULL && BP_IS_HOLE(bp)))
260		return (B_FALSE);
261
262	ddt_prefetch(dsl_dataset_get_spa(ds), bp);
263
264	return (B_TRUE);
265}
266
267static void
268dsl_dataset_evict(void *dbu)
269{
270	dsl_dataset_t *ds = dbu;
271
272	ASSERT(ds->ds_owner == NULL);
273
274	ds->ds_dbuf = NULL;
275
276	unique_remove(ds->ds_fsid_guid);
277
278	if (ds->ds_objset != NULL)
279		dmu_objset_evict(ds->ds_objset);
280
281	if (ds->ds_prev) {
282		dsl_dataset_rele(ds->ds_prev, ds);
283		ds->ds_prev = NULL;
284	}
285
286	bplist_destroy(&ds->ds_pending_deadlist);
287	if (ds->ds_deadlist.dl_os != NULL)
288		dsl_deadlist_close(&ds->ds_deadlist);
289	if (ds->ds_dir)
290		dsl_dir_async_rele(ds->ds_dir, ds);
291
292	ASSERT(!list_link_active(&ds->ds_synced_link));
293
294	list_destroy(&ds->ds_prop_cbs);
295	if (mutex_owned(&ds->ds_lock))
296		mutex_exit(&ds->ds_lock);
297	mutex_destroy(&ds->ds_lock);
298	if (mutex_owned(&ds->ds_opening_lock))
299		mutex_exit(&ds->ds_opening_lock);
300	mutex_destroy(&ds->ds_opening_lock);
301	mutex_destroy(&ds->ds_sendstream_lock);
302	refcount_destroy(&ds->ds_longholds);
303
304	kmem_free(ds, sizeof (dsl_dataset_t));
305}
306
307int
308dsl_dataset_get_snapname(dsl_dataset_t *ds)
309{
310	dsl_dataset_phys_t *headphys;
311	int err;
312	dmu_buf_t *headdbuf;
313	dsl_pool_t *dp = ds->ds_dir->dd_pool;
314	objset_t *mos = dp->dp_meta_objset;
315
316	if (ds->ds_snapname[0])
317		return (0);
318	if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
319		return (0);
320
321	err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
322	    FTAG, &headdbuf);
323	if (err != 0)
324		return (err);
325	headphys = headdbuf->db_data;
326	err = zap_value_search(dp->dp_meta_objset,
327	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
328	dmu_buf_rele(headdbuf, FTAG);
329	return (err);
330}
331
332int
333dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
334{
335	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
336	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
337	matchtype_t mt;
338	int err;
339
340	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
341		mt = MT_FIRST;
342	else
343		mt = MT_EXACT;
344
345	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
346	    value, mt, NULL, 0, NULL);
347	if (err == ENOTSUP && mt == MT_FIRST)
348		err = zap_lookup(mos, snapobj, name, 8, 1, value);
349	return (err);
350}
351
352int
353dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
354    boolean_t adj_cnt)
355{
356	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
357	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
358	matchtype_t mt;
359	int err;
360
361	dsl_dir_snap_cmtime_update(ds->ds_dir);
362
363	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
364		mt = MT_FIRST;
365	else
366		mt = MT_EXACT;
367
368	err = zap_remove_norm(mos, snapobj, name, mt, tx);
369	if (err == ENOTSUP && mt == MT_FIRST)
370		err = zap_remove(mos, snapobj, name, tx);
371
372	if (err == 0 && adj_cnt)
373		dsl_fs_ss_count_adjust(ds->ds_dir, -1,
374		    DD_FIELD_SNAPSHOT_COUNT, tx);
375
376	return (err);
377}
378
379boolean_t
380dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
381{
382	dmu_buf_t *dbuf = ds->ds_dbuf;
383	boolean_t result = B_FALSE;
384
385	if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
386	    ds->ds_object, DMU_BONUS_BLKID, tag)) {
387
388		if (ds == dmu_buf_get_user(dbuf))
389			result = B_TRUE;
390		else
391			dmu_buf_rele(dbuf, tag);
392	}
393
394	return (result);
395}
396
397int
398dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
399    dsl_dataset_t **dsp)
400{
401	objset_t *mos = dp->dp_meta_objset;
402	dmu_buf_t *dbuf;
403	dsl_dataset_t *ds;
404	int err;
405	dmu_object_info_t doi;
406
407	ASSERT(dsl_pool_config_held(dp));
408
409	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
410	if (err != 0)
411		return (err);
412
413	/* Make sure dsobj has the correct object type. */
414	dmu_object_info_from_db(dbuf, &doi);
415	if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
416		dmu_buf_rele(dbuf, tag);
417		return (SET_ERROR(EINVAL));
418	}
419
420	ds = dmu_buf_get_user(dbuf);
421	if (ds == NULL) {
422		dsl_dataset_t *winner = NULL;
423
424		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
425		ds->ds_dbuf = dbuf;
426		ds->ds_object = dsobj;
427		ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
428
429		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
430		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
431		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
432		refcount_create(&ds->ds_longholds);
433
434		bplist_create(&ds->ds_pending_deadlist);
435		dsl_deadlist_open(&ds->ds_deadlist,
436		    mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
437
438		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
439		    offsetof(dmu_sendarg_t, dsa_link));
440
441		list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
442		    offsetof(dsl_prop_cb_record_t, cbr_ds_node));
443
444		if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
445			for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
446				if (!(spa_feature_table[f].fi_flags &
447				    ZFEATURE_FLAG_PER_DATASET))
448					continue;
449				err = zap_contains(mos, dsobj,
450				    spa_feature_table[f].fi_guid);
451				if (err == 0) {
452					ds->ds_feature_inuse[f] = B_TRUE;
453				} else {
454					ASSERT3U(err, ==, ENOENT);
455					err = 0;
456				}
457			}
458		}
459
460		err = dsl_dir_hold_obj(dp,
461		    dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir);
462		if (err != 0) {
463			mutex_destroy(&ds->ds_lock);
464			mutex_destroy(&ds->ds_opening_lock);
465			mutex_destroy(&ds->ds_sendstream_lock);
466			refcount_destroy(&ds->ds_longholds);
467			bplist_destroy(&ds->ds_pending_deadlist);
468			dsl_deadlist_close(&ds->ds_deadlist);
469			kmem_free(ds, sizeof (dsl_dataset_t));
470			dmu_buf_rele(dbuf, tag);
471			return (err);
472		}
473
474		if (!ds->ds_is_snapshot) {
475			ds->ds_snapname[0] = '\0';
476			if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
477				err = dsl_dataset_hold_obj(dp,
478				    dsl_dataset_phys(ds)->ds_prev_snap_obj,
479				    ds, &ds->ds_prev);
480			}
481			if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
482				int zaperr = zap_lookup(mos, ds->ds_object,
483				    DS_FIELD_BOOKMARK_NAMES,
484				    sizeof (ds->ds_bookmarks), 1,
485				    &ds->ds_bookmarks);
486				if (zaperr != ENOENT)
487					VERIFY0(zaperr);
488			}
489		} else {
490			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
491				err = dsl_dataset_get_snapname(ds);
492			if (err == 0 &&
493			    dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
494				err = zap_count(
495				    ds->ds_dir->dd_pool->dp_meta_objset,
496				    dsl_dataset_phys(ds)->ds_userrefs_obj,
497				    &ds->ds_userrefs);
498			}
499		}
500
501		if (err == 0 && !ds->ds_is_snapshot) {
502			err = dsl_prop_get_int_ds(ds,
503			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
504			    &ds->ds_reserved);
505			if (err == 0) {
506				err = dsl_prop_get_int_ds(ds,
507				    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
508				    &ds->ds_quota);
509			}
510		} else {
511			ds->ds_reserved = ds->ds_quota = 0;
512		}
513
514		dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict, &ds->ds_dbuf);
515		if (err == 0)
516			winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
517
518		if (err != 0 || winner != NULL) {
519			bplist_destroy(&ds->ds_pending_deadlist);
520			dsl_deadlist_close(&ds->ds_deadlist);
521			if (ds->ds_prev)
522				dsl_dataset_rele(ds->ds_prev, ds);
523			dsl_dir_rele(ds->ds_dir, ds);
524			mutex_destroy(&ds->ds_lock);
525			mutex_destroy(&ds->ds_opening_lock);
526			mutex_destroy(&ds->ds_sendstream_lock);
527			refcount_destroy(&ds->ds_longholds);
528			kmem_free(ds, sizeof (dsl_dataset_t));
529			if (err != 0) {
530				dmu_buf_rele(dbuf, tag);
531				return (err);
532			}
533			ds = winner;
534		} else {
535			ds->ds_fsid_guid =
536			    unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
537		}
538	}
539	ASSERT3P(ds->ds_dbuf, ==, dbuf);
540	ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
541	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
542	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
543	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
544	*dsp = ds;
545	return (0);
546}
547
548int
549dsl_dataset_hold(dsl_pool_t *dp, const char *name,
550    void *tag, dsl_dataset_t **dsp)
551{
552	dsl_dir_t *dd;
553	const char *snapname;
554	uint64_t obj;
555	int err = 0;
556	dsl_dataset_t *ds;
557
558	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
559	if (err != 0)
560		return (err);
561
562	ASSERT(dsl_pool_config_held(dp));
563	obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
564	if (obj != 0)
565		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
566	else
567		err = SET_ERROR(ENOENT);
568
569	/* we may be looking for a snapshot */
570	if (err == 0 && snapname != NULL) {
571		dsl_dataset_t *snap_ds;
572
573		if (*snapname++ != '@') {
574			dsl_dataset_rele(ds, tag);
575			dsl_dir_rele(dd, FTAG);
576			return (SET_ERROR(ENOENT));
577		}
578
579		dprintf("looking for snapshot '%s'\n", snapname);
580		err = dsl_dataset_snap_lookup(ds, snapname, &obj);
581		if (err == 0)
582			err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
583		dsl_dataset_rele(ds, tag);
584
585		if (err == 0) {
586			mutex_enter(&snap_ds->ds_lock);
587			if (snap_ds->ds_snapname[0] == 0)
588				(void) strlcpy(snap_ds->ds_snapname, snapname,
589				    sizeof (snap_ds->ds_snapname));
590			mutex_exit(&snap_ds->ds_lock);
591			ds = snap_ds;
592		}
593	}
594	if (err == 0)
595		*dsp = ds;
596	dsl_dir_rele(dd, FTAG);
597	return (err);
598}
599
600int
601dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
602    void *tag, dsl_dataset_t **dsp)
603{
604	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
605	if (err != 0)
606		return (err);
607	if (!dsl_dataset_tryown(*dsp, tag)) {
608		dsl_dataset_rele(*dsp, tag);
609		*dsp = NULL;
610		return (SET_ERROR(EBUSY));
611	}
612	return (0);
613}
614
615int
616dsl_dataset_own(dsl_pool_t *dp, const char *name,
617    void *tag, dsl_dataset_t **dsp)
618{
619	int err = dsl_dataset_hold(dp, name, tag, dsp);
620	if (err != 0)
621		return (err);
622	if (!dsl_dataset_tryown(*dsp, tag)) {
623		dsl_dataset_rele(*dsp, tag);
624		return (SET_ERROR(EBUSY));
625	}
626	return (0);
627}
628
629/*
630 * See the comment above dsl_pool_hold() for details.  In summary, a long
631 * hold is used to prevent destruction of a dataset while the pool hold
632 * is dropped, allowing other concurrent operations (e.g. spa_sync()).
633 *
634 * The dataset and pool must be held when this function is called.  After it
635 * is called, the pool hold may be released while the dataset is still held
636 * and accessed.
637 */
638void
639dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
640{
641	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
642	(void) refcount_add(&ds->ds_longholds, tag);
643}
644
645void
646dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
647{
648	(void) refcount_remove(&ds->ds_longholds, tag);
649}
650
651/* Return B_TRUE if there are any long holds on this dataset. */
652boolean_t
653dsl_dataset_long_held(dsl_dataset_t *ds)
654{
655	return (!refcount_is_zero(&ds->ds_longholds));
656}
657
658void
659dsl_dataset_name(dsl_dataset_t *ds, char *name)
660{
661	if (ds == NULL) {
662		(void) strcpy(name, "mos");
663	} else {
664		dsl_dir_name(ds->ds_dir, name);
665		VERIFY0(dsl_dataset_get_snapname(ds));
666		if (ds->ds_snapname[0]) {
667			(void) strcat(name, "@");
668			/*
669			 * We use a "recursive" mutex so that we
670			 * can call dprintf_ds() with ds_lock held.
671			 */
672			if (!MUTEX_HELD(&ds->ds_lock)) {
673				mutex_enter(&ds->ds_lock);
674				(void) strcat(name, ds->ds_snapname);
675				mutex_exit(&ds->ds_lock);
676			} else {
677				(void) strcat(name, ds->ds_snapname);
678			}
679		}
680	}
681}
682
683void
684dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
685{
686	dmu_buf_rele(ds->ds_dbuf, tag);
687}
688
689void
690dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
691{
692	ASSERT3P(ds->ds_owner, ==, tag);
693	ASSERT(ds->ds_dbuf != NULL);
694
695	mutex_enter(&ds->ds_lock);
696	ds->ds_owner = NULL;
697	mutex_exit(&ds->ds_lock);
698	dsl_dataset_long_rele(ds, tag);
699	dsl_dataset_rele(ds, tag);
700}
701
702boolean_t
703dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
704{
705	boolean_t gotit = FALSE;
706
707	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
708	mutex_enter(&ds->ds_lock);
709	if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
710		ds->ds_owner = tag;
711		dsl_dataset_long_hold(ds, tag);
712		gotit = TRUE;
713	}
714	mutex_exit(&ds->ds_lock);
715	return (gotit);
716}
717
718boolean_t
719dsl_dataset_has_owner(dsl_dataset_t *ds)
720{
721	boolean_t rv;
722	mutex_enter(&ds->ds_lock);
723	rv = (ds->ds_owner != NULL);
724	mutex_exit(&ds->ds_lock);
725	return (rv);
726}
727
728static void
729dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
730{
731	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
732	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
733	uint64_t zero = 0;
734
735	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
736
737	spa_feature_incr(spa, f, tx);
738	dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
739
740	VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
741	    sizeof (zero), 1, &zero, tx));
742}
743
744void
745dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
746{
747	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
748	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
749
750	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
751
752	VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
753	spa_feature_decr(spa, f, tx);
754}
755
756uint64_t
757dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
758    uint64_t flags, dmu_tx_t *tx)
759{
760	dsl_pool_t *dp = dd->dd_pool;
761	dmu_buf_t *dbuf;
762	dsl_dataset_phys_t *dsphys;
763	uint64_t dsobj;
764	objset_t *mos = dp->dp_meta_objset;
765
766	if (origin == NULL)
767		origin = dp->dp_origin_snap;
768
769	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
770	ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
771	ASSERT(dmu_tx_is_syncing(tx));
772	ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
773
774	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
775	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
776	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
777	dmu_buf_will_dirty(dbuf, tx);
778	dsphys = dbuf->db_data;
779	bzero(dsphys, sizeof (dsl_dataset_phys_t));
780	dsphys->ds_dir_obj = dd->dd_object;
781	dsphys->ds_flags = flags;
782	dsphys->ds_fsid_guid = unique_create();
783	do {
784		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
785		    sizeof (dsphys->ds_guid));
786	} while (dsphys->ds_guid == 0);
787	dsphys->ds_snapnames_zapobj =
788	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
789	    DMU_OT_NONE, 0, tx);
790	dsphys->ds_creation_time = gethrestime_sec();
791	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
792
793	if (origin == NULL) {
794		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
795	} else {
796		dsl_dataset_t *ohds; /* head of the origin snapshot */
797
798		dsphys->ds_prev_snap_obj = origin->ds_object;
799		dsphys->ds_prev_snap_txg =
800		    dsl_dataset_phys(origin)->ds_creation_txg;
801		dsphys->ds_referenced_bytes =
802		    dsl_dataset_phys(origin)->ds_referenced_bytes;
803		dsphys->ds_compressed_bytes =
804		    dsl_dataset_phys(origin)->ds_compressed_bytes;
805		dsphys->ds_uncompressed_bytes =
806		    dsl_dataset_phys(origin)->ds_uncompressed_bytes;
807		dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
808
809		/*
810		 * Inherit flags that describe the dataset's contents
811		 * (INCONSISTENT) or properties (Case Insensitive).
812		 */
813		dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
814		    (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
815
816		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
817			if (origin->ds_feature_inuse[f])
818				dsl_dataset_activate_feature(dsobj, f, tx);
819		}
820
821		dmu_buf_will_dirty(origin->ds_dbuf, tx);
822		dsl_dataset_phys(origin)->ds_num_children++;
823
824		VERIFY0(dsl_dataset_hold_obj(dp,
825		    dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
826		    FTAG, &ohds));
827		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
828		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
829		dsl_dataset_rele(ohds, FTAG);
830
831		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
832			if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
833				dsl_dataset_phys(origin)->ds_next_clones_obj =
834				    zap_create(mos,
835				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
836			}
837			VERIFY0(zap_add_int(mos,
838			    dsl_dataset_phys(origin)->ds_next_clones_obj,
839			    dsobj, tx));
840		}
841
842		dmu_buf_will_dirty(dd->dd_dbuf, tx);
843		dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
844		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
845			if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
846				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
847				dsl_dir_phys(origin->ds_dir)->dd_clones =
848				    zap_create(mos,
849				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
850			}
851			VERIFY0(zap_add_int(mos,
852			    dsl_dir_phys(origin->ds_dir)->dd_clones,
853			    dsobj, tx));
854		}
855	}
856
857	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
858		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
859
860	dmu_buf_rele(dbuf, FTAG);
861
862	dmu_buf_will_dirty(dd->dd_dbuf, tx);
863	dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
864
865	return (dsobj);
866}
867
868static void
869dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
870{
871	objset_t *os;
872
873	VERIFY0(dmu_objset_from_ds(ds, &os));
874	bzero(&os->os_zil_header, sizeof (os->os_zil_header));
875	dsl_dataset_dirty(ds, tx);
876}
877
878uint64_t
879dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
880    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
881{
882	dsl_pool_t *dp = pdd->dd_pool;
883	uint64_t dsobj, ddobj;
884	dsl_dir_t *dd;
885
886	ASSERT(dmu_tx_is_syncing(tx));
887	ASSERT(lastname[0] != '@');
888
889	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
890	VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
891
892	dsobj = dsl_dataset_create_sync_dd(dd, origin,
893	    flags & ~DS_CREATE_FLAG_NODIRTY, tx);
894
895	dsl_deleg_set_create_perms(dd, tx, cr);
896
897	/*
898	 * Since we're creating a new node we know it's a leaf, so we can
899	 * initialize the counts if the limit feature is active.
900	 */
901	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
902		uint64_t cnt = 0;
903		objset_t *os = dd->dd_pool->dp_meta_objset;
904
905		dsl_dir_zapify(dd, tx);
906		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
907		    sizeof (cnt), 1, &cnt, tx));
908		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
909		    sizeof (cnt), 1, &cnt, tx));
910	}
911
912	dsl_dir_rele(dd, FTAG);
913
914	/*
915	 * If we are creating a clone, make sure we zero out any stale
916	 * data from the origin snapshots zil header.
917	 */
918	if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
919		dsl_dataset_t *ds;
920
921		VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
922		dsl_dataset_zero_zil(ds, tx);
923		dsl_dataset_rele(ds, FTAG);
924	}
925
926	return (dsobj);
927}
928
929#ifdef __FreeBSD__
930/* FreeBSD ioctl compat begin */
/*
 * Argument bundle handed to dsl_check_snap_cb() through dmu_objset_find().
 */
struct destroyarg {
	nvlist_t *nvl;		/* receives one boolean entry per fs@snap */
	const char *snapname;	/* snapshot component appended after '@' */
};
935
936static int
937dsl_check_snap_cb(const char *name, void *arg)
938{
939	struct destroyarg *da = arg;
940	dsl_dataset_t *ds;
941	char *dsname;
942
943	dsname = kmem_asprintf("%s@%s", name, da->snapname);
944	fnvlist_add_boolean(da->nvl, dsname);
945	kmem_free(dsname, strlen(dsname) + 1);
946
947	return (0);
948}
949
950int
951dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
952    nvlist_t *snaps)
953{
954	struct destroyarg *da;
955	int err;
956
957	da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP);
958	da->nvl = snaps;
959	da->snapname = snapname;
960	err = dmu_objset_find(fsname, dsl_check_snap_cb, da,
961	    DS_FIND_CHILDREN);
962	kmem_free(da, sizeof (struct destroyarg));
963
964	return (err);
965}
966/* FreeBSD ioctl compat end */
967#endif /* __FreeBSD__ */
968
969/*
970 * The unique space in the head dataset can be calculated by subtracting
971 * the space used in the most recent snapshot, that is still being used
972 * in this file system, from the space currently in use.  To figure out
973 * the space in the most recent snapshot still in use, we need to take
974 * the total space used in the snapshot and subtract out the space that
975 * has been freed up since the snapshot was taken.
976 */
977void
978dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
979{
980	uint64_t mrs_used;
981	uint64_t dlused, dlcomp, dluncomp;
982
983	ASSERT(!ds->ds_is_snapshot);
984
985	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
986		mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
987	else
988		mrs_used = 0;
989
990	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
991
992	ASSERT3U(dlused, <=, mrs_used);
993	dsl_dataset_phys(ds)->ds_unique_bytes =
994	    dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
995
996	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
997	    SPA_VERSION_UNIQUE_ACCURATE)
998		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
999}
1000
1001void
1002dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
1003    dmu_tx_t *tx)
1004{
1005	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1006	uint64_t count;
1007	int err;
1008
1009	ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
1010	err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
1011	    obj, tx);
1012	/*
1013	 * The err should not be ENOENT, but a bug in a previous version
1014	 * of the code could cause upgrade_clones_cb() to not set
1015	 * ds_next_snap_obj when it should, leading to a missing entry.
1016	 * If we knew that the pool was created after
1017	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
1018	 * ENOENT.  However, at least we can check that we don't have
1019	 * too many entries in the next_clones_obj even after failing to
1020	 * remove this one.
1021	 */
1022	if (err != ENOENT)
1023		VERIFY0(err);
1024	ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
1025	    &count));
1026	ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
1027}
1028
1029
1030blkptr_t *
1031dsl_dataset_get_blkptr(dsl_dataset_t *ds)
1032{
1033	return (&dsl_dataset_phys(ds)->ds_bp);
1034}
1035
1036void
1037dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
1038{
1039	ASSERT(dmu_tx_is_syncing(tx));
1040	/* If it's the meta-objset, set dp_meta_rootbp */
1041	if (ds == NULL) {
1042		tx->tx_pool->dp_meta_rootbp = *bp;
1043	} else {
1044		dmu_buf_will_dirty(ds->ds_dbuf, tx);
1045		dsl_dataset_phys(ds)->ds_bp = *bp;
1046	}
1047}
1048
1049spa_t *
1050dsl_dataset_get_spa(dsl_dataset_t *ds)
1051{
1052	return (ds->ds_dir->dd_pool->dp_spa);
1053}
1054
1055void
1056dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
1057{
1058	dsl_pool_t *dp;
1059
1060	if (ds == NULL) /* this is the meta-objset */
1061		return;
1062
1063	ASSERT(ds->ds_objset != NULL);
1064
1065	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
1066		panic("dirtying snapshot!");
1067
1068	dp = ds->ds_dir->dd_pool;
1069
1070	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
1071		/* up the hold count until we can be written out */
1072		dmu_buf_add_ref(ds->ds_dbuf, ds);
1073	}
1074}
1075
1076boolean_t
1077dsl_dataset_is_dirty(dsl_dataset_t *ds)
1078{
1079	for (int t = 0; t < TXG_SIZE; t++) {
1080		if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
1081		    ds, t))
1082			return (B_TRUE);
1083	}
1084	return (B_FALSE);
1085}
1086
1087static int
1088dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1089{
1090	uint64_t asize;
1091
1092	if (!dmu_tx_is_syncing(tx))
1093		return (0);
1094
1095	/*
1096	 * If there's an fs-only reservation, any blocks that might become
1097	 * owned by the snapshot dataset must be accommodated by space
1098	 * outside of the reservation.
1099	 */
1100	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
1101	asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
1102	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
1103		return (SET_ERROR(ENOSPC));
1104
1105	/*
1106	 * Propagate any reserved space for this snapshot to other
1107	 * snapshot checks in this sync group.
1108	 */
1109	if (asize > 0)
1110		dsl_dir_willuse_space(ds->ds_dir, asize, tx);
1111
1112	return (0);
1113}
1114
1115typedef struct dsl_dataset_snapshot_arg {
1116	nvlist_t *ddsa_snaps;
1117	nvlist_t *ddsa_props;
1118	nvlist_t *ddsa_errors;
1119	cred_t *ddsa_cr;
1120} dsl_dataset_snapshot_arg_t;
1121
1122int
1123dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
1124    dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
1125{
1126	int error;
1127	uint64_t value;
1128
1129	ds->ds_trysnap_txg = tx->tx_txg;
1130
1131	if (!dmu_tx_is_syncing(tx))
1132		return (0);
1133
1134	/*
1135	 * We don't allow multiple snapshots of the same txg.  If there
1136	 * is already one, try again.
1137	 */
1138	if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
1139		return (SET_ERROR(EAGAIN));
1140
1141	/*
1142	 * Check for conflicting snapshot name.
1143	 */
1144	error = dsl_dataset_snap_lookup(ds, snapname, &value);
1145	if (error == 0)
1146		return (SET_ERROR(EEXIST));
1147	if (error != ENOENT)
1148		return (error);
1149
1150	/*
1151	 * We don't allow taking snapshots of inconsistent datasets, such as
1152	 * those into which we are currently receiving.  However, if we are
1153	 * creating this snapshot as part of a receive, this check will be
1154	 * executed atomically with respect to the completion of the receive
1155	 * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
1156	 * case we ignore this, knowing it will be fixed up for us shortly in
1157	 * dmu_recv_end_sync().
1158	 */
1159	if (!recv && DS_IS_INCONSISTENT(ds))
1160		return (SET_ERROR(EBUSY));
1161
1162	/*
1163	 * Skip the check for temporary snapshots or if we have already checked
1164	 * the counts in dsl_dataset_snapshot_check. This means we really only
1165	 * check the count here when we're receiving a stream.
1166	 */
1167	if (cnt != 0 && cr != NULL) {
1168		error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
1169		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
1170		if (error != 0)
1171			return (error);
1172	}
1173
1174	error = dsl_dataset_snapshot_reserve_space(ds, tx);
1175	if (error != 0)
1176		return (error);
1177
1178	return (0);
1179}
1180
1181static int
1182dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
1183{
1184	dsl_dataset_snapshot_arg_t *ddsa = arg;
1185	dsl_pool_t *dp = dmu_tx_pool(tx);
1186	nvpair_t *pair;
1187	int rv = 0;
1188
1189	/*
1190	 * Pre-compute how many total new snapshots will be created for each
1191	 * level in the tree and below. This is needed for validating the
1192	 * snapshot limit when either taking a recursive snapshot or when
1193	 * taking multiple snapshots.
1194	 *
1195	 * The problem is that the counts are not actually adjusted when
1196	 * we are checking, only when we finally sync. For a single snapshot,
1197	 * this is easy, the count will increase by 1 at each node up the tree,
1198	 * but its more complicated for the recursive/multiple snapshot case.
1199	 *
1200	 * The dsl_fs_ss_limit_check function does recursively check the count
1201	 * at each level up the tree but since it is validating each snapshot
1202	 * independently we need to be sure that we are validating the complete
1203	 * count for the entire set of snapshots. We do this by rolling up the
1204	 * counts for each component of the name into an nvlist and then
1205	 * checking each of those cases with the aggregated count.
1206	 *
1207	 * This approach properly handles not only the recursive snapshot
1208	 * case (where we get all of those on the ddsa_snaps list) but also
1209	 * the sibling case (e.g. snapshot a/b and a/c so that we will also
1210	 * validate the limit on 'a' using a count of 2).
1211	 *
1212	 * We validate the snapshot names in the third loop and only report
1213	 * name errors once.
1214	 */
1215	if (dmu_tx_is_syncing(tx)) {
1216		nvlist_t *cnt_track = NULL;
1217		cnt_track = fnvlist_alloc();
1218
1219		/* Rollup aggregated counts into the cnt_track list */
1220		for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1221		    pair != NULL;
1222		    pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1223			char *pdelim;
1224			uint64_t val;
1225			char nm[MAXPATHLEN];
1226
1227			(void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
1228			pdelim = strchr(nm, '@');
1229			if (pdelim == NULL)
1230				continue;
1231			*pdelim = '\0';
1232
1233			do {
1234				if (nvlist_lookup_uint64(cnt_track, nm,
1235				    &val) == 0) {
1236					/* update existing entry */
1237					fnvlist_add_uint64(cnt_track, nm,
1238					    val + 1);
1239				} else {
1240					/* add to list */
1241					fnvlist_add_uint64(cnt_track, nm, 1);
1242				}
1243
1244				pdelim = strrchr(nm, '/');
1245				if (pdelim != NULL)
1246					*pdelim = '\0';
1247			} while (pdelim != NULL);
1248		}
1249
1250		/* Check aggregated counts at each level */
1251		for (pair = nvlist_next_nvpair(cnt_track, NULL);
1252		    pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
1253			int error = 0;
1254			char *name;
1255			uint64_t cnt = 0;
1256			dsl_dataset_t *ds;
1257
1258			name = nvpair_name(pair);
1259			cnt = fnvpair_value_uint64(pair);
1260			ASSERT(cnt > 0);
1261
1262			error = dsl_dataset_hold(dp, name, FTAG, &ds);
1263			if (error == 0) {
1264				error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
1265				    ZFS_PROP_SNAPSHOT_LIMIT, NULL,
1266				    ddsa->ddsa_cr);
1267				dsl_dataset_rele(ds, FTAG);
1268			}
1269
1270			if (error != 0) {
1271				if (ddsa->ddsa_errors != NULL)
1272					fnvlist_add_int32(ddsa->ddsa_errors,
1273					    name, error);
1274				rv = error;
1275				/* only report one error for this check */
1276				break;
1277			}
1278		}
1279		nvlist_free(cnt_track);
1280	}
1281
1282	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1283	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1284		int error = 0;
1285		dsl_dataset_t *ds;
1286		char *name, *atp;
1287		char dsname[MAXNAMELEN];
1288
1289		name = nvpair_name(pair);
1290		if (strlen(name) >= MAXNAMELEN)
1291			error = SET_ERROR(ENAMETOOLONG);
1292		if (error == 0) {
1293			atp = strchr(name, '@');
1294			if (atp == NULL)
1295				error = SET_ERROR(EINVAL);
1296			if (error == 0)
1297				(void) strlcpy(dsname, name, atp - name + 1);
1298		}
1299		if (error == 0)
1300			error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
1301		if (error == 0) {
1302			/* passing 0/NULL skips dsl_fs_ss_limit_check */
1303			error = dsl_dataset_snapshot_check_impl(ds,
1304			    atp + 1, tx, B_FALSE, 0, NULL);
1305			dsl_dataset_rele(ds, FTAG);
1306		}
1307
1308		if (error != 0) {
1309			if (ddsa->ddsa_errors != NULL) {
1310				fnvlist_add_int32(ddsa->ddsa_errors,
1311				    name, error);
1312			}
1313			rv = error;
1314		}
1315	}
1316
1317	return (rv);
1318}
1319
1320void
1321dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
1322    dmu_tx_t *tx)
1323{
1324	static zil_header_t zero_zil;
1325
1326	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1327	dmu_buf_t *dbuf;
1328	dsl_dataset_phys_t *dsphys;
1329	uint64_t dsobj, crtxg;
1330	objset_t *mos = dp->dp_meta_objset;
1331	objset_t *os;
1332
1333	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
1334
1335	/*
1336	 * If we are on an old pool, the zil must not be active, in which
1337	 * case it will be zeroed.  Usually zil_suspend() accomplishes this.
1338	 */
1339	ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
1340	    dmu_objset_from_ds(ds, &os) != 0 ||
1341	    bcmp(&os->os_phys->os_zil_header, &zero_zil,
1342	    sizeof (zero_zil)) == 0);
1343
1344	dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
1345
1346	/*
1347	 * The origin's ds_creation_txg has to be < TXG_INITIAL
1348	 */
1349	if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
1350		crtxg = 1;
1351	else
1352		crtxg = tx->tx_txg;
1353
1354	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
1355	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
1356	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
1357	dmu_buf_will_dirty(dbuf, tx);
1358	dsphys = dbuf->db_data;
1359	bzero(dsphys, sizeof (dsl_dataset_phys_t));
1360	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
1361	dsphys->ds_fsid_guid = unique_create();
1362	do {
1363		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
1364		    sizeof (dsphys->ds_guid));
1365	} while (dsphys->ds_guid == 0);
1366	dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
1367	dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
1368	dsphys->ds_next_snap_obj = ds->ds_object;
1369	dsphys->ds_num_children = 1;
1370	dsphys->ds_creation_time = gethrestime_sec();
1371	dsphys->ds_creation_txg = crtxg;
1372	dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
1373	dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
1374	dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
1375	dsphys->ds_uncompressed_bytes =
1376	    dsl_dataset_phys(ds)->ds_uncompressed_bytes;
1377	dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
1378	dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
1379	dmu_buf_rele(dbuf, FTAG);
1380
1381	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
1382		if (ds->ds_feature_inuse[f])
1383			dsl_dataset_activate_feature(dsobj, f, tx);
1384	}
1385
1386	ASSERT3U(ds->ds_prev != 0, ==,
1387	    dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
1388	if (ds->ds_prev) {
1389		uint64_t next_clones_obj =
1390		    dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
1391		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
1392		    ds->ds_object ||
1393		    dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
1394		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
1395		    ds->ds_object) {
1396			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
1397			ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
1398			    dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
1399			dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
1400		} else if (next_clones_obj != 0) {
1401			dsl_dataset_remove_from_next_clones(ds->ds_prev,
1402			    dsphys->ds_next_snap_obj, tx);
1403			VERIFY0(zap_add_int(mos,
1404			    next_clones_obj, dsobj, tx));
1405		}
1406	}
1407
1408	/*
1409	 * If we have a reference-reservation on this dataset, we will
1410	 * need to increase the amount of refreservation being charged
1411	 * since our unique space is going to zero.
1412	 */
1413	if (ds->ds_reserved) {
1414		int64_t delta;
1415		ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
1416		delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
1417		    ds->ds_reserved);
1418		dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
1419		    delta, 0, 0, tx);
1420	}
1421
1422	dmu_buf_will_dirty(ds->ds_dbuf, tx);
1423	dsl_dataset_phys(ds)->ds_deadlist_obj =
1424	    dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
1425	    dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
1426	dsl_deadlist_close(&ds->ds_deadlist);
1427	dsl_deadlist_open(&ds->ds_deadlist, mos,
1428	    dsl_dataset_phys(ds)->ds_deadlist_obj);
1429	dsl_deadlist_add_key(&ds->ds_deadlist,
1430	    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
1431
1432	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
1433	dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
1434	dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
1435	dsl_dataset_phys(ds)->ds_unique_bytes = 0;
1436	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
1437		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1438
1439	VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
1440	    snapname, 8, 1, &dsobj, tx));
1441
1442	if (ds->ds_prev)
1443		dsl_dataset_rele(ds->ds_prev, ds);
1444	VERIFY0(dsl_dataset_hold_obj(dp,
1445	    dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
1446
1447	dsl_scan_ds_snapshotted(ds, tx);
1448
1449	dsl_dir_snap_cmtime_update(ds->ds_dir);
1450
1451	spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
1452}
1453
1454static void
1455dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
1456{
1457	dsl_dataset_snapshot_arg_t *ddsa = arg;
1458	dsl_pool_t *dp = dmu_tx_pool(tx);
1459	nvpair_t *pair;
1460
1461	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1462	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1463		dsl_dataset_t *ds;
1464		char *name, *atp;
1465		char dsname[MAXNAMELEN];
1466
1467		name = nvpair_name(pair);
1468		atp = strchr(name, '@');
1469		(void) strlcpy(dsname, name, atp - name + 1);
1470		VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
1471
1472		dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
1473		if (ddsa->ddsa_props != NULL) {
1474			dsl_props_set_sync_impl(ds->ds_prev,
1475			    ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
1476		}
1477		dsl_dataset_rele(ds, FTAG);
1478	}
1479}
1480
1481/*
1482 * The snapshots must all be in the same pool.
1483 * All-or-nothing: if there are any failures, nothing will be modified.
1484 */
1485int
1486dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
1487{
1488	dsl_dataset_snapshot_arg_t ddsa;
1489	nvpair_t *pair;
1490	boolean_t needsuspend;
1491	int error;
1492	spa_t *spa;
1493	char *firstname;
1494	nvlist_t *suspended = NULL;
1495
1496	pair = nvlist_next_nvpair(snaps, NULL);
1497	if (pair == NULL)
1498		return (0);
1499	firstname = nvpair_name(pair);
1500
1501	error = spa_open(firstname, &spa, FTAG);
1502	if (error != 0)
1503		return (error);
1504	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1505	spa_close(spa, FTAG);
1506
1507	if (needsuspend) {
1508		suspended = fnvlist_alloc();
1509		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
1510		    pair = nvlist_next_nvpair(snaps, pair)) {
1511			char fsname[MAXNAMELEN];
1512			char *snapname = nvpair_name(pair);
1513			char *atp;
1514			void *cookie;
1515
1516			atp = strchr(snapname, '@');
1517			if (atp == NULL) {
1518				error = SET_ERROR(EINVAL);
1519				break;
1520			}
1521			(void) strlcpy(fsname, snapname, atp - snapname + 1);
1522
1523			error = zil_suspend(fsname, &cookie);
1524			if (error != 0)
1525				break;
1526			fnvlist_add_uint64(suspended, fsname,
1527			    (uintptr_t)cookie);
1528		}
1529	}
1530
1531	ddsa.ddsa_snaps = snaps;
1532	ddsa.ddsa_props = props;
1533	ddsa.ddsa_errors = errors;
1534	ddsa.ddsa_cr = CRED();
1535
1536	if (error == 0) {
1537		error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
1538		    dsl_dataset_snapshot_sync, &ddsa,
1539		    fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
1540	}
1541
1542	if (suspended != NULL) {
1543		for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
1544		    pair = nvlist_next_nvpair(suspended, pair)) {
1545			zil_resume((void *)(uintptr_t)
1546			    fnvpair_value_uint64(pair));
1547		}
1548		fnvlist_free(suspended);
1549	}
1550
1551#ifdef __FreeBSD__
1552#ifdef _KERNEL
1553	if (error == 0) {
1554		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
1555		    pair = nvlist_next_nvpair(snaps, pair)) {
1556			char *snapname = nvpair_name(pair);
1557			zvol_create_minors(snapname);
1558		}
1559	}
1560#endif
1561#endif
1562	return (error);
1563}
1564
1565typedef struct dsl_dataset_snapshot_tmp_arg {
1566	const char *ddsta_fsname;
1567	const char *ddsta_snapname;
1568	minor_t ddsta_cleanup_minor;
1569	const char *ddsta_htag;
1570} dsl_dataset_snapshot_tmp_arg_t;
1571
1572static int
1573dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
1574{
1575	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1576	dsl_pool_t *dp = dmu_tx_pool(tx);
1577	dsl_dataset_t *ds;
1578	int error;
1579
1580	error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
1581	if (error != 0)
1582		return (error);
1583
1584	/* NULL cred means no limit check for tmp snapshot */
1585	error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
1586	    tx, B_FALSE, 0, NULL);
1587	if (error != 0) {
1588		dsl_dataset_rele(ds, FTAG);
1589		return (error);
1590	}
1591
1592	if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
1593		dsl_dataset_rele(ds, FTAG);
1594		return (SET_ERROR(ENOTSUP));
1595	}
1596	error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
1597	    B_TRUE, tx);
1598	if (error != 0) {
1599		dsl_dataset_rele(ds, FTAG);
1600		return (error);
1601	}
1602
1603	dsl_dataset_rele(ds, FTAG);
1604	return (0);
1605}
1606
1607static void
1608dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
1609{
1610	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1611	dsl_pool_t *dp = dmu_tx_pool(tx);
1612	dsl_dataset_t *ds;
1613
1614	VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
1615
1616	dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
1617	dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
1618	    ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
1619	dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
1620
1621	dsl_dataset_rele(ds, FTAG);
1622}
1623
1624int
1625dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
1626    minor_t cleanup_minor, const char *htag)
1627{
1628	dsl_dataset_snapshot_tmp_arg_t ddsta;
1629	int error;
1630	spa_t *spa;
1631	boolean_t needsuspend;
1632	void *cookie;
1633
1634	ddsta.ddsta_fsname = fsname;
1635	ddsta.ddsta_snapname = snapname;
1636	ddsta.ddsta_cleanup_minor = cleanup_minor;
1637	ddsta.ddsta_htag = htag;
1638
1639	error = spa_open(fsname, &spa, FTAG);
1640	if (error != 0)
1641		return (error);
1642	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1643	spa_close(spa, FTAG);
1644
1645	if (needsuspend) {
1646		error = zil_suspend(fsname, &cookie);
1647		if (error != 0)
1648			return (error);
1649	}
1650
1651	error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
1652	    dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
1653
1654	if (needsuspend)
1655		zil_resume(cookie);
1656	return (error);
1657}
1658
1659
1660void
1661dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
1662{
1663	ASSERT(dmu_tx_is_syncing(tx));
1664	ASSERT(ds->ds_objset != NULL);
1665	ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
1666
1667	/*
1668	 * in case we had to change ds_fsid_guid when we opened it,
1669	 * sync it out now.
1670	 */
1671	dmu_buf_will_dirty(ds->ds_dbuf, tx);
1672	dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
1673
1674	if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
1675		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
1676		    ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
1677		    &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
1678		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
1679		    ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
1680		    &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
1681		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
1682		    ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
1683		    &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
1684		ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
1685		ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
1686		ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
1687	}
1688
1689	dmu_objset_sync(ds->ds_objset, zio, tx);
1690
1691	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
1692		if (ds->ds_feature_activation_needed[f]) {
1693			if (ds->ds_feature_inuse[f])
1694				continue;
1695			dsl_dataset_activate_feature(ds->ds_object, f, tx);
1696			ds->ds_feature_inuse[f] = B_TRUE;
1697		}
1698	}
1699}
1700
1701static void
1702get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
1703{
1704	uint64_t count = 0;
1705	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1706	zap_cursor_t zc;
1707	zap_attribute_t za;
1708	nvlist_t *propval = fnvlist_alloc();
1709	nvlist_t *val = fnvlist_alloc();
1710
1711	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
1712
1713	/*
1714	 * There may be missing entries in ds_next_clones_obj
1715	 * due to a bug in a previous version of the code.
1716	 * Only trust it if it has the right number of entries.
1717	 */
1718	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
1719		VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
1720		    &count));
1721	}
1722	if (count != dsl_dataset_phys(ds)->ds_num_children - 1)
1723		goto fail;
1724	for (zap_cursor_init(&zc, mos,
1725	    dsl_dataset_phys(ds)->ds_next_clones_obj);
1726	    zap_cursor_retrieve(&zc, &za) == 0;
1727	    zap_cursor_advance(&zc)) {
1728		dsl_dataset_t *clone;
1729		char buf[ZFS_MAXNAMELEN];
1730		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
1731		    za.za_first_integer, FTAG, &clone));
1732		dsl_dir_name(clone->ds_dir, buf);
1733		fnvlist_add_boolean(val, buf);
1734		dsl_dataset_rele(clone, FTAG);
1735	}
1736	zap_cursor_fini(&zc);
1737	fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
1738	fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval);
1739fail:
1740	nvlist_free(val);
1741	nvlist_free(propval);
1742}
1743
1744static void
1745get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
1746{
1747	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1748
1749	if (dsl_dataset_has_resume_receive_state(ds)) {
1750		char *str;
1751		void *packed;
1752		uint8_t *compressed;
1753		uint64_t val;
1754		nvlist_t *token_nv = fnvlist_alloc();
1755		size_t packed_size, compressed_size;
1756
1757		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1758		    DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
1759			fnvlist_add_uint64(token_nv, "fromguid", val);
1760		}
1761		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1762		    DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
1763			fnvlist_add_uint64(token_nv, "object", val);
1764		}
1765		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1766		    DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
1767			fnvlist_add_uint64(token_nv, "offset", val);
1768		}
1769		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1770		    DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
1771			fnvlist_add_uint64(token_nv, "bytes", val);
1772		}
1773		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1774		    DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
1775			fnvlist_add_uint64(token_nv, "toguid", val);
1776		}
1777		char buf[256];
1778		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1779		    DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
1780			fnvlist_add_string(token_nv, "toname", buf);
1781		}
1782		if (zap_contains(dp->dp_meta_objset, ds->ds_object,
1783		    DS_FIELD_RESUME_EMBEDOK) == 0) {
1784			fnvlist_add_boolean(token_nv, "embedok");
1785		}
1786		packed = fnvlist_pack(token_nv, &packed_size);
1787		fnvlist_free(token_nv);
1788		compressed = kmem_alloc(packed_size, KM_SLEEP);
1789
1790		compressed_size = gzip_compress(packed, compressed,
1791		    packed_size, packed_size, 6);
1792
1793		zio_cksum_t cksum;
1794		fletcher_4_native(compressed, compressed_size, &cksum);
1795
1796		str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP);
1797		for (int i = 0; i < compressed_size; i++) {
1798			(void) sprintf(str + i * 2, "%02x", compressed[i]);
1799		}
1800		str[compressed_size * 2] = '\0';
1801		char *propval = kmem_asprintf("%u-%llx-%llx-%s",
1802		    ZFS_SEND_RESUME_TOKEN_VERSION,
1803		    (longlong_t)cksum.zc_word[0],
1804		    (longlong_t)packed_size, str);
1805		dsl_prop_nvlist_add_string(nv,
1806		    ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
1807		kmem_free(packed, packed_size);
1808		kmem_free(str, compressed_size * 2 + 1);
1809		kmem_free(compressed, packed_size);
1810		strfree(propval);
1811	}
1812}
1813
1814void
1815dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
1816{
1817	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1818	uint64_t refd, avail, uobjs, aobjs, ratio;
1819
1820	ASSERT(dsl_pool_config_held(dp));
1821
1822	ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
1823	    (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
1824	    dsl_dataset_phys(ds)->ds_compressed_bytes);
1825
1826	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
1827	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
1828	    dsl_dataset_phys(ds)->ds_uncompressed_bytes);
1829
1830	if (ds->ds_is_snapshot) {
1831		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
1832		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
1833		    dsl_dataset_phys(ds)->ds_unique_bytes);
1834		get_clones_stat(ds, nv);
1835	} else {
1836		if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
1837			char buf[MAXNAMELEN];
1838			dsl_dataset_name(ds->ds_prev, buf);
1839			dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf);
1840		}
1841
1842		dsl_dir_stats(ds->ds_dir, nv);
1843	}
1844
1845	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
1846	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
1847	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
1848
1849	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
1850	    dsl_dataset_phys(ds)->ds_creation_time);
1851	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
1852	    dsl_dataset_phys(ds)->ds_creation_txg);
1853	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
1854	    ds->ds_quota);
1855	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
1856	    ds->ds_reserved);
1857	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
1858	    dsl_dataset_phys(ds)->ds_guid);
1859	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
1860	    dsl_dataset_phys(ds)->ds_unique_bytes);
1861	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
1862	    ds->ds_object);
1863	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
1864	    ds->ds_userrefs);
1865	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
1866	    DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
1867
1868	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
1869		uint64_t written, comp, uncomp;
1870		dsl_pool_t *dp = ds->ds_dir->dd_pool;
1871		dsl_dataset_t *prev;
1872
1873		int err = dsl_dataset_hold_obj(dp,
1874		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
1875		if (err == 0) {
1876			err = dsl_dataset_space_written(prev, ds, &written,
1877			    &comp, &uncomp);
1878			dsl_dataset_rele(prev, FTAG);
1879			if (err == 0) {
1880				dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
1881				    written);
1882			}
1883		}
1884	}
1885
1886	if (!dsl_dataset_is_snapshot(ds)) {
1887		/*
1888		 * A failed "newfs" (e.g. full) resumable receive leaves
1889		 * the stats set on this dataset.  Check here for the prop.
1890		 */
1891		get_receive_resume_stats(ds, nv);
1892
1893		/*
1894		 * A failed incremental resumable receive leaves the
1895		 * stats set on our child named "%recv".  Check the child
1896		 * for the prop.
1897		 */
1898		char recvname[ZFS_MAXNAMELEN];
1899		dsl_dataset_t *recv_ds;
1900		dsl_dataset_name(ds, recvname);
1901		(void) strcat(recvname, "/");
1902		(void) strcat(recvname, recv_clone_name);
1903		if (dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
1904			get_receive_resume_stats(recv_ds, nv);
1905			dsl_dataset_rele(recv_ds, FTAG);
1906		}
1907	}
1908}
1909
1910void
1911dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
1912{
1913	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1914	ASSERT(dsl_pool_config_held(dp));
1915
1916	stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg;
1917	stat->dds_inconsistent =
1918	    dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT;
1919	stat->dds_guid = dsl_dataset_phys(ds)->ds_guid;
1920	stat->dds_origin[0] = '\0';
1921	if (ds->ds_is_snapshot) {
1922		stat->dds_is_snapshot = B_TRUE;
1923		stat->dds_num_clones =
1924		    dsl_dataset_phys(ds)->ds_num_children - 1;
1925	} else {
1926		stat->dds_is_snapshot = B_FALSE;
1927		stat->dds_num_clones = 0;
1928
1929		if (dsl_dir_is_clone(ds->ds_dir)) {
1930			dsl_dataset_t *ods;
1931
1932			VERIFY0(dsl_dataset_hold_obj(dp,
1933			    dsl_dir_phys(ds->ds_dir)->dd_origin_obj,
1934			    FTAG, &ods));
1935			dsl_dataset_name(ods, stat->dds_origin);
1936			dsl_dataset_rele(ods, FTAG);
1937		}
1938	}
1939}
1940
1941uint64_t
1942dsl_dataset_fsid_guid(dsl_dataset_t *ds)
1943{
1944	return (ds->ds_fsid_guid);
1945}
1946
1947void
1948dsl_dataset_space(dsl_dataset_t *ds,
1949    uint64_t *refdbytesp, uint64_t *availbytesp,
1950    uint64_t *usedobjsp, uint64_t *availobjsp)
1951{
1952	*refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
1953	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
1954	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
1955		*availbytesp +=
1956		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
1957	if (ds->ds_quota != 0) {
1958		/*
1959		 * Adjust available bytes according to refquota
1960		 */
1961		if (*refdbytesp < ds->ds_quota)
1962			*availbytesp = MIN(*availbytesp,
1963			    ds->ds_quota - *refdbytesp);
1964		else
1965			*availbytesp = 0;
1966	}
1967	*usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
1968	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
1969}
1970
1971boolean_t
1972dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
1973{
1974	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1975
1976	ASSERT(dsl_pool_config_held(dp));
1977	if (snap == NULL)
1978		return (B_FALSE);
1979	if (dsl_dataset_phys(ds)->ds_bp.blk_birth >
1980	    dsl_dataset_phys(snap)->ds_creation_txg) {
1981		objset_t *os, *os_snap;
1982		/*
1983		 * It may be that only the ZIL differs, because it was
1984		 * reset in the head.  Don't count that as being
1985		 * modified.
1986		 */
1987		if (dmu_objset_from_ds(ds, &os) != 0)
1988			return (B_TRUE);
1989		if (dmu_objset_from_ds(snap, &os_snap) != 0)
1990			return (B_TRUE);
1991		return (bcmp(&os->os_phys->os_meta_dnode,
1992		    &os_snap->os_phys->os_meta_dnode,
1993		    sizeof (os->os_phys->os_meta_dnode)) != 0);
1994	}
1995	return (B_FALSE);
1996}
1997
1998typedef struct dsl_dataset_rename_snapshot_arg {
1999	const char *ddrsa_fsname;
2000	const char *ddrsa_oldsnapname;
2001	const char *ddrsa_newsnapname;
2002	boolean_t ddrsa_recursive;
2003	dmu_tx_t *ddrsa_tx;
2004} dsl_dataset_rename_snapshot_arg_t;
2005
2006/* ARGSUSED */
2007static int
2008dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
2009    dsl_dataset_t *hds, void *arg)
2010{
2011	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2012	int error;
2013	uint64_t val;
2014
2015	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
2016	if (error != 0) {
2017		/* ignore nonexistent snapshots */
2018		return (error == ENOENT ? 0 : error);
2019	}
2020
2021	/* new name should not exist */
2022	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
2023	if (error == 0)
2024		error = SET_ERROR(EEXIST);
2025	else if (error == ENOENT)
2026		error = 0;
2027
2028	/* dataset name + 1 for the "@" + the new snapshot name must fit */
2029	if (dsl_dir_namelen(hds->ds_dir) + 1 +
2030	    strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN)
2031		error = SET_ERROR(ENAMETOOLONG);
2032
2033	return (error);
2034}
2035
2036static int
2037dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
2038{
2039	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2040	dsl_pool_t *dp = dmu_tx_pool(tx);
2041	dsl_dataset_t *hds;
2042	int error;
2043
2044	error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
2045	if (error != 0)
2046		return (error);
2047
2048	if (ddrsa->ddrsa_recursive) {
2049		error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
2050		    dsl_dataset_rename_snapshot_check_impl, ddrsa,
2051		    DS_FIND_CHILDREN);
2052	} else {
2053		error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
2054	}
2055	dsl_dataset_rele(hds, FTAG);
2056	return (error);
2057}
2058
2059static int
2060dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
2061    dsl_dataset_t *hds, void *arg)
2062{
2063#ifdef __FreeBSD__
2064#ifdef _KERNEL
2065	char *oldname, *newname;
2066#endif
2067#endif
2068	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2069	dsl_dataset_t *ds;
2070	uint64_t val;
2071	dmu_tx_t *tx = ddrsa->ddrsa_tx;
2072	int error;
2073
2074	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
2075	ASSERT(error == 0 || error == ENOENT);
2076	if (error == ENOENT) {
2077		/* ignore nonexistent snapshots */
2078		return (0);
2079	}
2080
2081	VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
2082
2083	/* log before we change the name */
2084	spa_history_log_internal_ds(ds, "rename", tx,
2085	    "-> @%s", ddrsa->ddrsa_newsnapname);
2086
2087	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
2088	    B_FALSE));
2089	mutex_enter(&ds->ds_lock);
2090	(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
2091	mutex_exit(&ds->ds_lock);
2092	VERIFY0(zap_add(dp->dp_meta_objset,
2093	    dsl_dataset_phys(hds)->ds_snapnames_zapobj,
2094	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));
2095
2096#ifdef __FreeBSD__
2097#ifdef _KERNEL
2098	oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2099	newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2100	snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
2101	    ddrsa->ddrsa_oldsnapname);
2102	snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
2103	    ddrsa->ddrsa_newsnapname);
2104	zfsvfs_update_fromname(oldname, newname);
2105	zvol_rename_minors(oldname, newname);
2106	kmem_free(newname, MAXPATHLEN);
2107	kmem_free(oldname, MAXPATHLEN);
2108#endif
2109#endif
2110	dsl_dataset_rele(ds, FTAG);
2111
2112	return (0);
2113}
2114
2115static void
2116dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
2117{
2118	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2119	dsl_pool_t *dp = dmu_tx_pool(tx);
2120	dsl_dataset_t *hds;
2121
2122	VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
2123	ddrsa->ddrsa_tx = tx;
2124	if (ddrsa->ddrsa_recursive) {
2125		VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
2126		    dsl_dataset_rename_snapshot_sync_impl, ddrsa,
2127		    DS_FIND_CHILDREN));
2128	} else {
2129		VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
2130	}
2131	dsl_dataset_rele(hds, FTAG);
2132}
2133
2134int
2135dsl_dataset_rename_snapshot(const char *fsname,
2136    const char *oldsnapname, const char *newsnapname, boolean_t recursive)
2137{
2138	dsl_dataset_rename_snapshot_arg_t ddrsa;
2139
2140	ddrsa.ddrsa_fsname = fsname;
2141	ddrsa.ddrsa_oldsnapname = oldsnapname;
2142	ddrsa.ddrsa_newsnapname = newsnapname;
2143	ddrsa.ddrsa_recursive = recursive;
2144
2145	return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
2146	    dsl_dataset_rename_snapshot_sync, &ddrsa,
2147	    1, ZFS_SPACE_CHECK_RESERVED));
2148}
2149
2150/*
2151 * If we're doing an ownership handoff, we need to make sure that there is
2152 * only one long hold on the dataset.  We're not allowed to change anything here
2153 * so we don't permanently release the long hold or regular hold here.  We want
2154 * to do this only when syncing to avoid the dataset unexpectedly going away
2155 * when we release the long hold.
2156 */
2157static int
2158dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
2159{
2160	boolean_t held;
2161
2162	if (!dmu_tx_is_syncing(tx))
2163		return (0);
2164
2165	if (owner != NULL) {
2166		VERIFY3P(ds->ds_owner, ==, owner);
2167		dsl_dataset_long_rele(ds, owner);
2168	}
2169
2170	held = dsl_dataset_long_held(ds);
2171
2172	if (owner != NULL)
2173		dsl_dataset_long_hold(ds, owner);
2174
2175	if (held)
2176		return (SET_ERROR(EBUSY));
2177
2178	return (0);
2179}
2180
2181typedef struct dsl_dataset_rollback_arg {
2182	const char *ddra_fsname;
2183	void *ddra_owner;
2184	nvlist_t *ddra_result;
2185} dsl_dataset_rollback_arg_t;
2186
2187static int
2188dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
2189{
2190	dsl_dataset_rollback_arg_t *ddra = arg;
2191	dsl_pool_t *dp = dmu_tx_pool(tx);
2192	dsl_dataset_t *ds;
2193	int64_t unused_refres_delta;
2194	int error;
2195
2196	error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
2197	if (error != 0)
2198		return (error);
2199
2200	/* must not be a snapshot */
2201	if (ds->ds_is_snapshot) {
2202		dsl_dataset_rele(ds, FTAG);
2203		return (SET_ERROR(EINVAL));
2204	}
2205
2206	/* must have a most recent snapshot */
2207	if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
2208		dsl_dataset_rele(ds, FTAG);
2209		return (SET_ERROR(EINVAL));
2210	}
2211
2212	/* must not have any bookmarks after the most recent snapshot */
2213	nvlist_t *proprequest = fnvlist_alloc();
2214	fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
2215	nvlist_t *bookmarks = fnvlist_alloc();
2216	error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
2217	fnvlist_free(proprequest);
2218	if (error != 0)
2219		return (error);
2220	for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
2221	    pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
2222		nvlist_t *valuenv =
2223		    fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
2224		    zfs_prop_to_name(ZFS_PROP_CREATETXG));
2225		uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
2226		if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
2227			fnvlist_free(bookmarks);
2228			dsl_dataset_rele(ds, FTAG);
2229			return (SET_ERROR(EEXIST));
2230		}
2231	}
2232	fnvlist_free(bookmarks);
2233
2234	error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
2235	if (error != 0) {
2236		dsl_dataset_rele(ds, FTAG);
2237		return (error);
2238	}
2239
2240	/*
2241	 * Check if the snap we are rolling back to uses more than
2242	 * the refquota.
2243	 */
2244	if (ds->ds_quota != 0 &&
2245	    dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
2246		dsl_dataset_rele(ds, FTAG);
2247		return (SET_ERROR(EDQUOT));
2248	}
2249
2250	/*
2251	 * When we do the clone swap, we will temporarily use more space
2252	 * due to the refreservation (the head will no longer have any
2253	 * unique space, so the entire amount of the refreservation will need
2254	 * to be free).  We will immediately destroy the clone, freeing
2255	 * this space, but the freeing happens over many txg's.
2256	 */
2257	unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
2258	    dsl_dataset_phys(ds)->ds_unique_bytes);
2259
2260	if (unused_refres_delta > 0 &&
2261	    unused_refres_delta >
2262	    dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
2263		dsl_dataset_rele(ds, FTAG);
2264		return (SET_ERROR(ENOSPC));
2265	}
2266
2267	dsl_dataset_rele(ds, FTAG);
2268	return (0);
2269}
2270
2271static void
2272dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
2273{
2274	dsl_dataset_rollback_arg_t *ddra = arg;
2275	dsl_pool_t *dp = dmu_tx_pool(tx);
2276	dsl_dataset_t *ds, *clone;
2277	uint64_t cloneobj;
2278	char namebuf[ZFS_MAXNAMELEN];
2279
2280	VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
2281
2282	dsl_dataset_name(ds->ds_prev, namebuf);
2283	fnvlist_add_string(ddra->ddra_result, "target", namebuf);
2284
2285	cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
2286	    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
2287
2288	VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
2289
2290	dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
2291	dsl_dataset_zero_zil(ds, tx);
2292
2293	dsl_destroy_head_sync_impl(clone, tx);
2294
2295	dsl_dataset_rele(clone, FTAG);
2296	dsl_dataset_rele(ds, FTAG);
2297}
2298
2299/*
2300 * Rolls back the given filesystem or volume to the most recent snapshot.
2301 * The name of the most recent snapshot will be returned under key "target"
2302 * in the result nvlist.
2303 *
2304 * If owner != NULL:
2305 * - The existing dataset MUST be owned by the specified owner at entry
2306 * - Upon return, dataset will still be held by the same owner, whether we
2307 *   succeed or not.
2308 *
2309 * This mode is required any time the existing filesystem is mounted.  See
2310 * notes above zfs_suspend_fs() for further details.
2311 */
2312int
2313dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result)
2314{
2315	dsl_dataset_rollback_arg_t ddra;
2316
2317	ddra.ddra_fsname = fsname;
2318	ddra.ddra_owner = owner;
2319	ddra.ddra_result = result;
2320
2321	return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
2322	    dsl_dataset_rollback_sync, &ddra,
2323	    1, ZFS_SPACE_CHECK_RESERVED));
2324}
2325
2326struct promotenode {
2327	list_node_t link;
2328	dsl_dataset_t *ds;
2329};
2330
2331typedef struct dsl_dataset_promote_arg {
2332	const char *ddpa_clonename;
2333	dsl_dataset_t *ddpa_clone;
2334	list_t shared_snaps, origin_snaps, clone_snaps;
2335	dsl_dataset_t *origin_origin; /* origin of the origin */
2336	uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
2337	char *err_ds;
2338	cred_t *cr;
2339} dsl_dataset_promote_arg_t;
2340
2341static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
2342static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
2343    void *tag);
2344static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
2345
2346static int
2347dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
2348{
2349	dsl_dataset_promote_arg_t *ddpa = arg;
2350	dsl_pool_t *dp = dmu_tx_pool(tx);
2351	dsl_dataset_t *hds;
2352	struct promotenode *snap;
2353	dsl_dataset_t *origin_ds;
2354	int err;
2355	uint64_t unused;
2356	uint64_t ss_mv_cnt;
2357	size_t max_snap_len;
2358
2359	err = promote_hold(ddpa, dp, FTAG);
2360	if (err != 0)
2361		return (err);
2362
2363	hds = ddpa->ddpa_clone;
2364	max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
2365
2366	if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
2367		promote_rele(ddpa, FTAG);
2368		return (SET_ERROR(EXDEV));
2369	}
2370
2371	/*
2372	 * Compute and check the amount of space to transfer.  Since this is
2373	 * so expensive, don't do the preliminary check.
2374	 */
2375	if (!dmu_tx_is_syncing(tx)) {
2376		promote_rele(ddpa, FTAG);
2377		return (0);
2378	}
2379
2380	snap = list_head(&ddpa->shared_snaps);
2381	origin_ds = snap->ds;
2382
2383	/* compute origin's new unique space */
2384	snap = list_tail(&ddpa->clone_snaps);
2385	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
2386	    origin_ds->ds_object);
2387	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2388	    dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
2389	    &ddpa->unique, &unused, &unused);
2390
2391	/*
2392	 * Walk the snapshots that we are moving
2393	 *
2394	 * Compute space to transfer.  Consider the incremental changes
2395	 * to used by each snapshot:
2396	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
2397	 * So each snapshot gave birth to:
2398	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
2399	 * So a sequence would look like:
2400	 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
2401	 * Which simplifies to:
2402	 * uN + kN + kN-1 + ... + k1 + k0
2403	 * Note however, if we stop before we reach the ORIGIN we get:
2404	 * uN + kN + kN-1 + ... + kM - uM-1
2405	 */
2406	ss_mv_cnt = 0;
2407	ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
2408	ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
2409	ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
2410	for (snap = list_head(&ddpa->shared_snaps); snap;
2411	    snap = list_next(&ddpa->shared_snaps, snap)) {
2412		uint64_t val, dlused, dlcomp, dluncomp;
2413		dsl_dataset_t *ds = snap->ds;
2414
2415		ss_mv_cnt++;
2416
2417		/*
2418		 * If there are long holds, we won't be able to evict
2419		 * the objset.
2420		 */
2421		if (dsl_dataset_long_held(ds)) {
2422			err = SET_ERROR(EBUSY);
2423			goto out;
2424		}
2425
2426		/* Check that the snapshot name does not conflict */
2427		VERIFY0(dsl_dataset_get_snapname(ds));
2428		if (strlen(ds->ds_snapname) >= max_snap_len) {
2429			err = SET_ERROR(ENAMETOOLONG);
2430			goto out;
2431		}
2432		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
2433		if (err == 0) {
2434			(void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
2435			err = SET_ERROR(EEXIST);
2436			goto out;
2437		}
2438		if (err != ENOENT)
2439			goto out;
2440
2441		/* The very first snapshot does not have a deadlist */
2442		if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
2443			continue;
2444
2445		dsl_deadlist_space(&ds->ds_deadlist,
2446		    &dlused, &dlcomp, &dluncomp);
2447		ddpa->used += dlused;
2448		ddpa->comp += dlcomp;
2449		ddpa->uncomp += dluncomp;
2450	}
2451
2452	/*
2453	 * If we are a clone of a clone then we never reached ORIGIN,
2454	 * so we need to subtract out the clone origin's used space.
2455	 */
2456	if (ddpa->origin_origin) {
2457		ddpa->used -=
2458		    dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
2459		ddpa->comp -=
2460		    dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
2461		ddpa->uncomp -=
2462		    dsl_dataset_phys(ddpa->origin_origin)->
2463		    ds_uncompressed_bytes;
2464	}
2465
2466	/* Check that there is enough space and limit headroom here */
2467	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
2468	    0, ss_mv_cnt, ddpa->used, ddpa->cr);
2469	if (err != 0)
2470		goto out;
2471
2472	/*
2473	 * Compute the amounts of space that will be used by snapshots
2474	 * after the promotion (for both origin and clone).  For each,
2475	 * it is the amount of space that will be on all of their
2476	 * deadlists (that was not born before their new origin).
2477	 */
2478	if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2479		uint64_t space;
2480
2481		/*
2482		 * Note, typically this will not be a clone of a clone,
2483		 * so dd_origin_txg will be < TXG_INITIAL, so
2484		 * these snaplist_space() -> dsl_deadlist_space_range()
2485		 * calls will be fast because they do not have to
2486		 * iterate over all bps.
2487		 */
2488		snap = list_head(&ddpa->origin_snaps);
2489		err = snaplist_space(&ddpa->shared_snaps,
2490		    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
2491		if (err != 0)
2492			goto out;
2493
2494		err = snaplist_space(&ddpa->clone_snaps,
2495		    snap->ds->ds_dir->dd_origin_txg, &space);
2496		if (err != 0)
2497			goto out;
2498		ddpa->cloneusedsnap += space;
2499	}
2500	if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
2501	    DD_FLAG_USED_BREAKDOWN) {
2502		err = snaplist_space(&ddpa->origin_snaps,
2503		    dsl_dataset_phys(origin_ds)->ds_creation_txg,
2504		    &ddpa->originusedsnap);
2505		if (err != 0)
2506			goto out;
2507	}
2508
2509out:
2510	promote_rele(ddpa, FTAG);
2511	return (err);
2512}
2513
2514static void
2515dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
2516{
2517	dsl_dataset_promote_arg_t *ddpa = arg;
2518	dsl_pool_t *dp = dmu_tx_pool(tx);
2519	dsl_dataset_t *hds;
2520	struct promotenode *snap;
2521	dsl_dataset_t *origin_ds;
2522	dsl_dataset_t *origin_head;
2523	dsl_dir_t *dd;
2524	dsl_dir_t *odd = NULL;
2525	uint64_t oldnext_obj;
2526	int64_t delta;
2527#if defined(__FreeBSD__) && defined(_KERNEL)
2528	char *oldname, *newname;
2529#endif
2530
2531	VERIFY0(promote_hold(ddpa, dp, FTAG));
2532	hds = ddpa->ddpa_clone;
2533
2534	ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
2535
2536	snap = list_head(&ddpa->shared_snaps);
2537	origin_ds = snap->ds;
2538	dd = hds->ds_dir;
2539
2540	snap = list_head(&ddpa->origin_snaps);
2541	origin_head = snap->ds;
2542
2543	/*
2544	 * We need to explicitly open odd, since origin_ds's dd will be
2545	 * changing.
2546	 */
2547	VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
2548	    NULL, FTAG, &odd));
2549
2550	/* change origin's next snap */
2551	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
2552	oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
2553	snap = list_tail(&ddpa->clone_snaps);
2554	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
2555	    origin_ds->ds_object);
2556	dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
2557
2558	/* change the origin's next clone */
2559	if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
2560		dsl_dataset_remove_from_next_clones(origin_ds,
2561		    snap->ds->ds_object, tx);
2562		VERIFY0(zap_add_int(dp->dp_meta_objset,
2563		    dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
2564		    oldnext_obj, tx));
2565	}
2566
2567	/* change origin */
2568	dmu_buf_will_dirty(dd->dd_dbuf, tx);
2569	ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
2570	dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
2571	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
2572	dmu_buf_will_dirty(odd->dd_dbuf, tx);
2573	dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
2574	origin_head->ds_dir->dd_origin_txg =
2575	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
2576
2577	/* change dd_clone entries */
2578	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2579		VERIFY0(zap_remove_int(dp->dp_meta_objset,
2580		    dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
2581		VERIFY0(zap_add_int(dp->dp_meta_objset,
2582		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
2583		    hds->ds_object, tx));
2584
2585		VERIFY0(zap_remove_int(dp->dp_meta_objset,
2586		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
2587		    origin_head->ds_object, tx));
2588		if (dsl_dir_phys(dd)->dd_clones == 0) {
2589			dsl_dir_phys(dd)->dd_clones =
2590			    zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
2591			    DMU_OT_NONE, 0, tx);
2592		}
2593		VERIFY0(zap_add_int(dp->dp_meta_objset,
2594		    dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
2595	}
2596
2597#if defined(__FreeBSD__) && defined(_KERNEL)
2598	/* Take the spa_namespace_lock early so zvol renames don't deadlock. */
2599	mutex_enter(&spa_namespace_lock);
2600
2601	oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2602	newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2603#endif
2604
2605	/* move snapshots to this dir */
2606	for (snap = list_head(&ddpa->shared_snaps); snap;
2607	    snap = list_next(&ddpa->shared_snaps, snap)) {
2608		dsl_dataset_t *ds = snap->ds;
2609
2610		/*
2611		 * Property callbacks are registered to a particular
2612		 * dsl_dir.  Since ours is changing, evict the objset
2613		 * so that they will be unregistered from the old dsl_dir.
2614		 */
2615		if (ds->ds_objset) {
2616			dmu_objset_evict(ds->ds_objset);
2617			ds->ds_objset = NULL;
2618		}
2619
2620		/* move snap name entry */
2621		VERIFY0(dsl_dataset_get_snapname(ds));
2622		VERIFY0(dsl_dataset_snap_remove(origin_head,
2623		    ds->ds_snapname, tx, B_TRUE));
2624		VERIFY0(zap_add(dp->dp_meta_objset,
2625		    dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
2626		    8, 1, &ds->ds_object, tx));
2627		dsl_fs_ss_count_adjust(hds->ds_dir, 1,
2628		    DD_FIELD_SNAPSHOT_COUNT, tx);
2629
2630		/* change containing dsl_dir */
2631		dmu_buf_will_dirty(ds->ds_dbuf, tx);
2632		ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
2633		dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
2634		ASSERT3P(ds->ds_dir, ==, odd);
2635		dsl_dir_rele(ds->ds_dir, ds);
2636		VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
2637		    NULL, ds, &ds->ds_dir));
2638
2639#if defined(__FreeBSD__) && defined(_KERNEL)
2640		dsl_dataset_name(ds, newname);
2641		zfsvfs_update_fromname(oldname, newname);
2642		zvol_rename_minors(oldname, newname);
2643#endif
2644
2645		/* move any clone references */
2646		if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
2647		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2648			zap_cursor_t zc;
2649			zap_attribute_t za;
2650
2651			for (zap_cursor_init(&zc, dp->dp_meta_objset,
2652			    dsl_dataset_phys(ds)->ds_next_clones_obj);
2653			    zap_cursor_retrieve(&zc, &za) == 0;
2654			    zap_cursor_advance(&zc)) {
2655				dsl_dataset_t *cnds;
2656				uint64_t o;
2657
2658				if (za.za_first_integer == oldnext_obj) {
2659					/*
2660					 * We've already moved the
2661					 * origin's reference.
2662					 */
2663					continue;
2664				}
2665
2666				VERIFY0(dsl_dataset_hold_obj(dp,
2667				    za.za_first_integer, FTAG, &cnds));
2668				o = dsl_dir_phys(cnds->ds_dir)->
2669				    dd_head_dataset_obj;
2670
2671				VERIFY0(zap_remove_int(dp->dp_meta_objset,
2672				    dsl_dir_phys(odd)->dd_clones, o, tx));
2673				VERIFY0(zap_add_int(dp->dp_meta_objset,
2674				    dsl_dir_phys(dd)->dd_clones, o, tx));
2675				dsl_dataset_rele(cnds, FTAG);
2676			}
2677			zap_cursor_fini(&zc);
2678		}
2679
2680		ASSERT(!dsl_prop_hascb(ds));
2681	}
2682
2683#if defined(__FreeBSD__) && defined(_KERNEL)
2684	mutex_exit(&spa_namespace_lock);
2685
2686	kmem_free(newname, MAXPATHLEN);
2687	kmem_free(oldname, MAXPATHLEN);
2688#endif
2689	/*
2690	 * Change space accounting.
2691	 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
2692	 * both be valid, or both be 0 (resulting in delta == 0).  This
2693	 * is true for each of {clone,origin} independently.
2694	 */
2695
2696	delta = ddpa->cloneusedsnap -
2697	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
2698	ASSERT3S(delta, >=, 0);
2699	ASSERT3U(ddpa->used, >=, delta);
2700	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
2701	dsl_dir_diduse_space(dd, DD_USED_HEAD,
2702	    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
2703
2704	delta = ddpa->originusedsnap -
2705	    dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
2706	ASSERT3S(delta, <=, 0);
2707	ASSERT3U(ddpa->used, >=, -delta);
2708	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
2709	dsl_dir_diduse_space(odd, DD_USED_HEAD,
2710	    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
2711
2712	dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
2713
2714	/* log history record */
2715	spa_history_log_internal_ds(hds, "promote", tx, "");
2716
2717	dsl_dir_rele(odd, FTAG);
2718	promote_rele(ddpa, FTAG);
2719}
2720
2721/*
2722 * Make a list of dsl_dataset_t's for the snapshots between first_obj
2723 * (exclusive) and last_obj (inclusive).  The list will be in reverse
2724 * order (last_obj will be the list_head()).  If first_obj == 0, do all
2725 * snapshots back to this dataset's origin.
2726 */
2727static int
2728snaplist_make(dsl_pool_t *dp,
2729    uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
2730{
2731	uint64_t obj = last_obj;
2732
2733	list_create(l, sizeof (struct promotenode),
2734	    offsetof(struct promotenode, link));
2735
2736	while (obj != first_obj) {
2737		dsl_dataset_t *ds;
2738		struct promotenode *snap;
2739		int err;
2740
2741		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
2742		ASSERT(err != ENOENT);
2743		if (err != 0)
2744			return (err);
2745
2746		if (first_obj == 0)
2747			first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
2748
2749		snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
2750		snap->ds = ds;
2751		list_insert_tail(l, snap);
2752		obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
2753	}
2754
2755	return (0);
2756}
2757
2758static int
2759snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2760{
2761	struct promotenode *snap;
2762
2763	*spacep = 0;
2764	for (snap = list_head(l); snap; snap = list_next(l, snap)) {
2765		uint64_t used, comp, uncomp;
2766		dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2767		    mintxg, UINT64_MAX, &used, &comp, &uncomp);
2768		*spacep += used;
2769	}
2770	return (0);
2771}
2772
2773static void
2774snaplist_destroy(list_t *l, void *tag)
2775{
2776	struct promotenode *snap;
2777
2778	if (l == NULL || !list_link_active(&l->list_head))
2779		return;
2780
2781	while ((snap = list_tail(l)) != NULL) {
2782		list_remove(l, snap);
2783		dsl_dataset_rele(snap->ds, tag);
2784		kmem_free(snap, sizeof (*snap));
2785	}
2786	list_destroy(l);
2787}
2788
2789static int
2790promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
2791{
2792	int error;
2793	dsl_dir_t *dd;
2794	struct promotenode *snap;
2795
2796	error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
2797	    &ddpa->ddpa_clone);
2798	if (error != 0)
2799		return (error);
2800	dd = ddpa->ddpa_clone->ds_dir;
2801
2802	if (ddpa->ddpa_clone->ds_is_snapshot ||
2803	    !dsl_dir_is_clone(dd)) {
2804		dsl_dataset_rele(ddpa->ddpa_clone, tag);
2805		return (SET_ERROR(EINVAL));
2806	}
2807
2808	error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
2809	    &ddpa->shared_snaps, tag);
2810	if (error != 0)
2811		goto out;
2812
2813	error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
2814	    &ddpa->clone_snaps, tag);
2815	if (error != 0)
2816		goto out;
2817
2818	snap = list_head(&ddpa->shared_snaps);
2819	ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
2820	error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
2821	    dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
2822	    &ddpa->origin_snaps, tag);
2823	if (error != 0)
2824		goto out;
2825
2826	if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
2827		error = dsl_dataset_hold_obj(dp,
2828		    dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
2829		    tag, &ddpa->origin_origin);
2830		if (error != 0)
2831			goto out;
2832	}
2833out:
2834	if (error != 0)
2835		promote_rele(ddpa, tag);
2836	return (error);
2837}
2838
2839static void
2840promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
2841{
2842	snaplist_destroy(&ddpa->shared_snaps, tag);
2843	snaplist_destroy(&ddpa->clone_snaps, tag);
2844	snaplist_destroy(&ddpa->origin_snaps, tag);
2845	if (ddpa->origin_origin != NULL)
2846		dsl_dataset_rele(ddpa->origin_origin, tag);
2847	dsl_dataset_rele(ddpa->ddpa_clone, tag);
2848}
2849
2850/*
2851 * Promote a clone.
2852 *
2853 * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
2854 * in with the name.  (It must be at least MAXNAMELEN bytes long.)
2855 */
2856int
2857dsl_dataset_promote(const char *name, char *conflsnap)
2858{
2859	dsl_dataset_promote_arg_t ddpa = { 0 };
2860	uint64_t numsnaps;
2861	int error;
2862	objset_t *os;
2863
2864	/*
2865	 * We will modify space proportional to the number of
2866	 * snapshots.  Compute numsnaps.
2867	 */
2868	error = dmu_objset_hold(name, FTAG, &os);
2869	if (error != 0)
2870		return (error);
2871	error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
2872	    dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
2873	    &numsnaps);
2874	dmu_objset_rele(os, FTAG);
2875	if (error != 0)
2876		return (error);
2877
2878	ddpa.ddpa_clonename = name;
2879	ddpa.err_ds = conflsnap;
2880	ddpa.cr = CRED();
2881
2882	return (dsl_sync_task(name, dsl_dataset_promote_check,
2883	    dsl_dataset_promote_sync, &ddpa,
2884	    2 + numsnaps, ZFS_SPACE_CHECK_RESERVED));
2885}
2886
2887int
2888dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
2889    dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
2890{
2891	int64_t unused_refres_delta;
2892
2893	/* they should both be heads */
2894	if (clone->ds_is_snapshot ||
2895	    origin_head->ds_is_snapshot)
2896		return (SET_ERROR(EINVAL));
2897
2898	/* if we are not forcing, the branch point should be just before them */
2899	if (!force && clone->ds_prev != origin_head->ds_prev)
2900		return (SET_ERROR(EINVAL));
2901
2902	/* clone should be the clone (unless they are unrelated) */
2903	if (clone->ds_prev != NULL &&
2904	    clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
2905	    origin_head->ds_dir != clone->ds_prev->ds_dir)
2906		return (SET_ERROR(EINVAL));
2907
2908	/* the clone should be a child of the origin */
2909	if (clone->ds_dir->dd_parent != origin_head->ds_dir)
2910		return (SET_ERROR(EINVAL));
2911
2912	/* origin_head shouldn't be modified unless 'force' */
2913	if (!force &&
2914	    dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
2915		return (SET_ERROR(ETXTBSY));
2916
2917	/* origin_head should have no long holds (e.g. is not mounted) */
2918	if (dsl_dataset_handoff_check(origin_head, owner, tx))
2919		return (SET_ERROR(EBUSY));
2920
2921	/* check amount of any unconsumed refreservation */
2922	unused_refres_delta =
2923	    (int64_t)MIN(origin_head->ds_reserved,
2924	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
2925	    (int64_t)MIN(origin_head->ds_reserved,
2926	    dsl_dataset_phys(clone)->ds_unique_bytes);
2927
2928	if (unused_refres_delta > 0 &&
2929	    unused_refres_delta >
2930	    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
2931		return (SET_ERROR(ENOSPC));
2932
2933	/* clone can't be over the head's refquota */
2934	if (origin_head->ds_quota != 0 &&
2935	    dsl_dataset_phys(clone)->ds_referenced_bytes >
2936	    origin_head->ds_quota)
2937		return (SET_ERROR(EDQUOT));
2938
2939	return (0);
2940}
2941
2942void
2943dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
2944    dsl_dataset_t *origin_head, dmu_tx_t *tx)
2945{
2946	dsl_pool_t *dp = dmu_tx_pool(tx);
2947	int64_t unused_refres_delta;
2948
2949	ASSERT(clone->ds_reserved == 0);
2950	ASSERT(origin_head->ds_quota == 0 ||
2951	    dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota);
2952	ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
2953
2954	/*
2955	 * Swap per-dataset feature flags.
2956	 */
2957	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
2958		if (!(spa_feature_table[f].fi_flags &
2959		    ZFEATURE_FLAG_PER_DATASET)) {
2960			ASSERT(!clone->ds_feature_inuse[f]);
2961			ASSERT(!origin_head->ds_feature_inuse[f]);
2962			continue;
2963		}
2964
2965		boolean_t clone_inuse = clone->ds_feature_inuse[f];
2966		boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f];
2967
2968		if (clone_inuse) {
2969			dsl_dataset_deactivate_feature(clone->ds_object, f, tx);
2970			clone->ds_feature_inuse[f] = B_FALSE;
2971		}
2972		if (origin_head_inuse) {
2973			dsl_dataset_deactivate_feature(origin_head->ds_object,
2974			    f, tx);
2975			origin_head->ds_feature_inuse[f] = B_FALSE;
2976		}
2977		if (clone_inuse) {
2978			dsl_dataset_activate_feature(origin_head->ds_object,
2979			    f, tx);
2980			origin_head->ds_feature_inuse[f] = B_TRUE;
2981		}
2982		if (origin_head_inuse) {
2983			dsl_dataset_activate_feature(clone->ds_object, f, tx);
2984			clone->ds_feature_inuse[f] = B_TRUE;
2985		}
2986	}
2987
2988	dmu_buf_will_dirty(clone->ds_dbuf, tx);
2989	dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
2990
2991	if (clone->ds_objset != NULL) {
2992		dmu_objset_evict(clone->ds_objset);
2993		clone->ds_objset = NULL;
2994	}
2995
2996	if (origin_head->ds_objset != NULL) {
2997		dmu_objset_evict(origin_head->ds_objset);
2998		origin_head->ds_objset = NULL;
2999	}
3000
3001	unused_refres_delta =
3002	    (int64_t)MIN(origin_head->ds_reserved,
3003	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
3004	    (int64_t)MIN(origin_head->ds_reserved,
3005	    dsl_dataset_phys(clone)->ds_unique_bytes);
3006
3007	/*
3008	 * Reset origin's unique bytes, if it exists.
3009	 */
3010	if (clone->ds_prev) {
3011		dsl_dataset_t *origin = clone->ds_prev;
3012		uint64_t comp, uncomp;
3013
3014		dmu_buf_will_dirty(origin->ds_dbuf, tx);
3015		dsl_deadlist_space_range(&clone->ds_deadlist,
3016		    dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
3017		    &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
3018	}
3019
3020	/* swap blkptrs */
3021	{
3022		blkptr_t tmp;
3023		tmp = dsl_dataset_phys(origin_head)->ds_bp;
3024		dsl_dataset_phys(origin_head)->ds_bp =
3025		    dsl_dataset_phys(clone)->ds_bp;
3026		dsl_dataset_phys(clone)->ds_bp = tmp;
3027	}
3028
3029	/* set dd_*_bytes */
3030	{
3031		int64_t dused, dcomp, duncomp;
3032		uint64_t cdl_used, cdl_comp, cdl_uncomp;
3033		uint64_t odl_used, odl_comp, odl_uncomp;
3034
3035		ASSERT3U(dsl_dir_phys(clone->ds_dir)->
3036		    dd_used_breakdown[DD_USED_SNAP], ==, 0);
3037
3038		dsl_deadlist_space(&clone->ds_deadlist,
3039		    &cdl_used, &cdl_comp, &cdl_uncomp);
3040		dsl_deadlist_space(&origin_head->ds_deadlist,
3041		    &odl_used, &odl_comp, &odl_uncomp);
3042
3043		dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
3044		    cdl_used -
3045		    (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
3046		    odl_used);
3047		dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
3048		    cdl_comp -
3049		    (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
3050		    odl_comp);
3051		duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
3052		    cdl_uncomp -
3053		    (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
3054		    odl_uncomp);
3055
3056		dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
3057		    dused, dcomp, duncomp, tx);
3058		dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
3059		    -dused, -dcomp, -duncomp, tx);
3060
3061		/*
3062		 * The difference in the space used by snapshots is the
3063		 * difference in snapshot space due to the head's
3064		 * deadlist (since that's the only thing that's
3065		 * changing that affects the snapused).
3066		 */
3067		dsl_deadlist_space_range(&clone->ds_deadlist,
3068		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
3069		    &cdl_used, &cdl_comp, &cdl_uncomp);
3070		dsl_deadlist_space_range(&origin_head->ds_deadlist,
3071		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
3072		    &odl_used, &odl_comp, &odl_uncomp);
3073		dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
3074		    DD_USED_HEAD, DD_USED_SNAP, NULL);
3075	}
3076
3077	/* swap ds_*_bytes */
3078	SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
3079	    dsl_dataset_phys(clone)->ds_referenced_bytes);
3080	SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
3081	    dsl_dataset_phys(clone)->ds_compressed_bytes);
3082	SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
3083	    dsl_dataset_phys(clone)->ds_uncompressed_bytes);
3084	SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
3085	    dsl_dataset_phys(clone)->ds_unique_bytes);
3086
3087	/* apply any parent delta for change in unconsumed refreservation */
3088	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
3089	    unused_refres_delta, 0, 0, tx);
3090
3091	/*
3092	 * Swap deadlists.
3093	 */
3094	dsl_deadlist_close(&clone->ds_deadlist);
3095	dsl_deadlist_close(&origin_head->ds_deadlist);
3096	SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
3097	    dsl_dataset_phys(clone)->ds_deadlist_obj);
3098	dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
3099	    dsl_dataset_phys(clone)->ds_deadlist_obj);
3100	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
3101	    dsl_dataset_phys(origin_head)->ds_deadlist_obj);
3102
3103	dsl_scan_ds_clone_swapped(origin_head, clone, tx);
3104
3105	spa_history_log_internal_ds(clone, "clone swap", tx,
3106	    "parent=%s", origin_head->ds_dir->dd_myname);
3107}
3108
3109/*
3110 * Given a pool name and a dataset object number in that pool,
3111 * return the name of that dataset.
3112 */
3113int
3114dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
3115{
3116	dsl_pool_t *dp;
3117	dsl_dataset_t *ds;
3118	int error;
3119
3120	error = dsl_pool_hold(pname, FTAG, &dp);
3121	if (error != 0)
3122		return (error);
3123
3124	error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
3125	if (error == 0) {
3126		dsl_dataset_name(ds, buf);
3127		dsl_dataset_rele(ds, FTAG);
3128	}
3129	dsl_pool_rele(dp, FTAG);
3130
3131	return (error);
3132}
3133
3134int
3135dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
3136    uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
3137{
3138	int error = 0;
3139
3140	ASSERT3S(asize, >, 0);
3141
3142	/*
3143	 * *ref_rsrv is the portion of asize that will come from any
3144	 * unconsumed refreservation space.
3145	 */
3146	*ref_rsrv = 0;
3147
3148	mutex_enter(&ds->ds_lock);
3149	/*
3150	 * Make a space adjustment for reserved bytes.
3151	 */
3152	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
3153		ASSERT3U(*used, >=,
3154		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
3155		*used -=
3156		    (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
3157		*ref_rsrv =
3158		    asize - MIN(asize, parent_delta(ds, asize + inflight));
3159	}
3160
3161	if (!check_quota || ds->ds_quota == 0) {
3162		mutex_exit(&ds->ds_lock);
3163		return (0);
3164	}
3165	/*
3166	 * If they are requesting more space, and our current estimate
3167	 * is over quota, they get to try again unless the actual
3168	 * on-disk is over quota and there are no pending changes (which
3169	 * may free up space for us).
3170	 */
3171	if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
3172	    ds->ds_quota) {
3173		if (inflight > 0 ||
3174		    dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
3175			error = SET_ERROR(ERESTART);
3176		else
3177			error = SET_ERROR(EDQUOT);
3178	}
3179	mutex_exit(&ds->ds_lock);
3180
3181	return (error);
3182}
3183
3184typedef struct dsl_dataset_set_qr_arg {
3185	const char *ddsqra_name;
3186	zprop_source_t ddsqra_source;
3187	uint64_t ddsqra_value;
3188} dsl_dataset_set_qr_arg_t;
3189
3190
3191/* ARGSUSED */
3192static int
3193dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
3194{
3195	dsl_dataset_set_qr_arg_t *ddsqra = arg;
3196	dsl_pool_t *dp = dmu_tx_pool(tx);
3197	dsl_dataset_t *ds;
3198	int error;
3199	uint64_t newval;
3200
3201	if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
3202		return (SET_ERROR(ENOTSUP));
3203
3204	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
3205	if (error != 0)
3206		return (error);
3207
3208	if (ds->ds_is_snapshot) {
3209		dsl_dataset_rele(ds, FTAG);
3210		return (SET_ERROR(EINVAL));
3211	}
3212
3213	error = dsl_prop_predict(ds->ds_dir,
3214	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
3215	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
3216	if (error != 0) {
3217		dsl_dataset_rele(ds, FTAG);
3218		return (error);
3219	}
3220
3221	if (newval == 0) {
3222		dsl_dataset_rele(ds, FTAG);
3223		return (0);
3224	}
3225
3226	if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
3227	    newval < ds->ds_reserved) {
3228		dsl_dataset_rele(ds, FTAG);
3229		return (SET_ERROR(ENOSPC));
3230	}
3231
3232	dsl_dataset_rele(ds, FTAG);
3233	return (0);
3234}
3235
3236static void
3237dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
3238{
3239	dsl_dataset_set_qr_arg_t *ddsqra = arg;
3240	dsl_pool_t *dp = dmu_tx_pool(tx);
3241	dsl_dataset_t *ds;
3242	uint64_t newval;
3243
3244	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
3245
3246	dsl_prop_set_sync_impl(ds,
3247	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
3248	    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
3249	    &ddsqra->ddsqra_value, tx);
3250
3251	VERIFY0(dsl_prop_get_int_ds(ds,
3252	    zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
3253
3254	if (ds->ds_quota != newval) {
3255		dmu_buf_will_dirty(ds->ds_dbuf, tx);
3256		ds->ds_quota = newval;
3257	}
3258	dsl_dataset_rele(ds, FTAG);
3259}
3260
3261int
3262dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
3263    uint64_t refquota)
3264{
3265	dsl_dataset_set_qr_arg_t ddsqra;
3266
3267	ddsqra.ddsqra_name = dsname;
3268	ddsqra.ddsqra_source = source;
3269	ddsqra.ddsqra_value = refquota;
3270
3271	return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
3272	    dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
3273}
3274
3275static int
3276dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
3277{
3278	dsl_dataset_set_qr_arg_t *ddsqra = arg;
3279	dsl_pool_t *dp = dmu_tx_pool(tx);
3280	dsl_dataset_t *ds;
3281	int error;
3282	uint64_t newval, unique;
3283
3284	if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
3285		return (SET_ERROR(ENOTSUP));
3286
3287	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
3288	if (error != 0)
3289		return (error);
3290
3291	if (ds->ds_is_snapshot) {
3292		dsl_dataset_rele(ds, FTAG);
3293		return (SET_ERROR(EINVAL));
3294	}
3295
3296	error = dsl_prop_predict(ds->ds_dir,
3297	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
3298	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
3299	if (error != 0) {
3300		dsl_dataset_rele(ds, FTAG);
3301		return (error);
3302	}
3303
3304	/*
3305	 * If we are doing the preliminary check in open context, the
3306	 * space estimates may be inaccurate.
3307	 */
3308	if (!dmu_tx_is_syncing(tx)) {
3309		dsl_dataset_rele(ds, FTAG);
3310		return (0);
3311	}
3312
3313	mutex_enter(&ds->ds_lock);
3314	if (!DS_UNIQUE_IS_ACCURATE(ds))
3315		dsl_dataset_recalc_head_uniq(ds);
3316	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
3317	mutex_exit(&ds->ds_lock);
3318
3319	if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
3320		uint64_t delta = MAX(unique, newval) -
3321		    MAX(unique, ds->ds_reserved);
3322
3323		if (delta >
3324		    dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
3325		    (ds->ds_quota > 0 && newval > ds->ds_quota)) {
3326			dsl_dataset_rele(ds, FTAG);
3327			return (SET_ERROR(ENOSPC));
3328		}
3329	}
3330
3331	dsl_dataset_rele(ds, FTAG);
3332	return (0);
3333}
3334
3335void
3336dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
3337    zprop_source_t source, uint64_t value, dmu_tx_t *tx)
3338{
3339	uint64_t newval;
3340	uint64_t unique;
3341	int64_t delta;
3342
3343	dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
3344	    source, sizeof (value), 1, &value, tx);
3345
3346	VERIFY0(dsl_prop_get_int_ds(ds,
3347	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
3348
3349	dmu_buf_will_dirty(ds->ds_dbuf, tx);
3350	mutex_enter(&ds->ds_dir->dd_lock);
3351	mutex_enter(&ds->ds_lock);
3352	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
3353	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
3354	delta = MAX(0, (int64_t)(newval - unique)) -
3355	    MAX(0, (int64_t)(ds->ds_reserved - unique));
3356	ds->ds_reserved = newval;
3357	mutex_exit(&ds->ds_lock);
3358
3359	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
3360	mutex_exit(&ds->ds_dir->dd_lock);
3361}
3362
3363static void
3364dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
3365{
3366	dsl_dataset_set_qr_arg_t *ddsqra = arg;
3367	dsl_pool_t *dp = dmu_tx_pool(tx);
3368	dsl_dataset_t *ds;
3369
3370	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
3371	dsl_dataset_set_refreservation_sync_impl(ds,
3372	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
3373	dsl_dataset_rele(ds, FTAG);
3374}
3375
3376int
3377dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
3378    uint64_t refreservation)
3379{
3380	dsl_dataset_set_qr_arg_t ddsqra;
3381
3382	ddsqra.ddsqra_name = dsname;
3383	ddsqra.ddsqra_source = source;
3384	ddsqra.ddsqra_value = refreservation;
3385
3386	return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
3387	    dsl_dataset_set_refreservation_sync, &ddsqra,
3388	    0, ZFS_SPACE_CHECK_NONE));
3389}
3390
3391/*
3392 * Return (in *usedp) the amount of space written in new that is not
3393 * present in oldsnap.  New may be a snapshot or the head.  Old must be
3394 * a snapshot before new, in new's filesystem (or its origin).  If not then
3395 * fail and return EINVAL.
3396 *
3397 * The written space is calculated by considering two components:  First, we
3398 * ignore any freed space, and calculate the written as new's used space
3399 * minus old's used space.  Next, we add in the amount of space that was freed
3400 * between the two snapshots, thus reducing new's used space relative to old's.
3401 * Specifically, this is the space that was born before old->ds_creation_txg,
3402 * and freed before new (ie. on new's deadlist or a previous deadlist).
3403 *
3404 * space freed                         [---------------------]
3405 * snapshots                       ---O-------O--------O-------O------
3406 *                                         oldsnap            new
3407 */
3408int
3409dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
3410    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
3411{
3412	int err = 0;
3413	uint64_t snapobj;
3414	dsl_pool_t *dp = new->ds_dir->dd_pool;
3415
3416	ASSERT(dsl_pool_config_held(dp));
3417
3418	*usedp = 0;
3419	*usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
3420	*usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
3421
3422	*compp = 0;
3423	*compp += dsl_dataset_phys(new)->ds_compressed_bytes;
3424	*compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
3425
3426	*uncompp = 0;
3427	*uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
3428	*uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
3429
3430	snapobj = new->ds_object;
3431	while (snapobj != oldsnap->ds_object) {
3432		dsl_dataset_t *snap;
3433		uint64_t used, comp, uncomp;
3434
3435		if (snapobj == new->ds_object) {
3436			snap = new;
3437		} else {
3438			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
3439			if (err != 0)
3440				break;
3441		}
3442
3443		if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
3444		    dsl_dataset_phys(oldsnap)->ds_creation_txg) {
3445			/*
3446			 * The blocks in the deadlist can not be born after
3447			 * ds_prev_snap_txg, so get the whole deadlist space,
3448			 * which is more efficient (especially for old-format
3449			 * deadlists).  Unfortunately the deadlist code
3450			 * doesn't have enough information to make this
3451			 * optimization itself.
3452			 */
3453			dsl_deadlist_space(&snap->ds_deadlist,
3454			    &used, &comp, &uncomp);
3455		} else {
3456			dsl_deadlist_space_range(&snap->ds_deadlist,
3457			    0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
3458			    &used, &comp, &uncomp);
3459		}
3460		*usedp += used;
3461		*compp += comp;
3462		*uncompp += uncomp;
3463
3464		/*
3465		 * If we get to the beginning of the chain of snapshots
3466		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
3467		 * was not a snapshot of/before new.
3468		 */
3469		snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
3470		if (snap != new)
3471			dsl_dataset_rele(snap, FTAG);
3472		if (snapobj == 0) {
3473			err = SET_ERROR(EINVAL);
3474			break;
3475		}
3476
3477	}
3478	return (err);
3479}
3480
3481/*
3482 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
3483 * lastsnap, and all snapshots in between are deleted.
3484 *
3485 * blocks that would be freed            [---------------------------]
3486 * snapshots                       ---O-------O--------O-------O--------O
3487 *                                        firstsnap        lastsnap
3488 *
3489 * This is the set of blocks that were born after the snap before firstsnap,
3490 * (birth > firstsnap->prev_snap_txg) and died before the snap after the
3491 * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
3492 * We calculate this by iterating over the relevant deadlists (from the snap
3493 * after lastsnap, backward to the snap after firstsnap), summing up the
3494 * space on the deadlist that was born after the snap before firstsnap.
3495 */
3496int
3497dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
3498    dsl_dataset_t *lastsnap,
3499    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
3500{
3501	int err = 0;
3502	uint64_t snapobj;
3503	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
3504
3505	ASSERT(firstsnap->ds_is_snapshot);
3506	ASSERT(lastsnap->ds_is_snapshot);
3507
3508	/*
3509	 * Check that the snapshots are in the same dsl_dir, and firstsnap
3510	 * is before lastsnap.
3511	 */
3512	if (firstsnap->ds_dir != lastsnap->ds_dir ||
3513	    dsl_dataset_phys(firstsnap)->ds_creation_txg >
3514	    dsl_dataset_phys(lastsnap)->ds_creation_txg)
3515		return (SET_ERROR(EINVAL));
3516
3517	*usedp = *compp = *uncompp = 0;
3518
3519	snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
3520	while (snapobj != firstsnap->ds_object) {
3521		dsl_dataset_t *ds;
3522		uint64_t used, comp, uncomp;
3523
3524		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
3525		if (err != 0)
3526			break;
3527
3528		dsl_deadlist_space_range(&ds->ds_deadlist,
3529		    dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
3530		    &used, &comp, &uncomp);
3531		*usedp += used;
3532		*compp += comp;
3533		*uncompp += uncomp;
3534
3535		snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
3536		ASSERT3U(snapobj, !=, 0);
3537		dsl_dataset_rele(ds, FTAG);
3538	}
3539	return (err);
3540}
3541
3542/*
3543 * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
3544 * For example, they could both be snapshots of the same filesystem, and
3545 * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
3546 * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
3547 * filesystem.  Or 'earlier' could be the origin's origin.
3548 *
3549 * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
3550 */
3551boolean_t
3552dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
3553	uint64_t earlier_txg)
3554{
3555	dsl_pool_t *dp = later->ds_dir->dd_pool;
3556	int error;
3557	boolean_t ret;
3558
3559	ASSERT(dsl_pool_config_held(dp));
3560	ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
3561
3562	if (earlier_txg == 0)
3563		earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
3564
3565	if (later->ds_is_snapshot &&
3566	    earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
3567		return (B_FALSE);
3568
3569	if (later->ds_dir == earlier->ds_dir)
3570		return (B_TRUE);
3571	if (!dsl_dir_is_clone(later->ds_dir))
3572		return (B_FALSE);
3573
3574	if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
3575		return (B_TRUE);
3576	dsl_dataset_t *origin;
3577	error = dsl_dataset_hold_obj(dp,
3578	    dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
3579	if (error != 0)
3580		return (B_FALSE);
3581	ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
3582	dsl_dataset_rele(origin, FTAG);
3583	return (ret);
3584}
3585
3586void
3587dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
3588{
3589	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3590	dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
3591}
3592
3593boolean_t
3594dsl_dataset_is_zapified(dsl_dataset_t *ds)
3595{
3596	dmu_object_info_t doi;
3597
3598	dmu_object_info_from_db(ds->ds_dbuf, &doi);
3599	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
3600}
3601
3602boolean_t
3603dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
3604{
3605	return (dsl_dataset_is_zapified(ds) &&
3606	    zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
3607	    ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
3608}
3609