dsl_dataset.c revision 297112
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
24 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
25 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 RackTop Systems.
27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28 * Copyright (c) 2014 Integros [integros.com]
29 */
30
31#include <sys/dmu_objset.h>
32#include <sys/dsl_dataset.h>
33#include <sys/dsl_dir.h>
34#include <sys/dsl_prop.h>
35#include <sys/dsl_synctask.h>
36#include <sys/dmu_traverse.h>
37#include <sys/dmu_impl.h>
38#include <sys/dmu_send.h>
39#include <sys/dmu_tx.h>
40#include <sys/arc.h>
41#include <sys/zio.h>
42#include <sys/zap.h>
43#include <sys/zfeature.h>
44#include <sys/unique.h>
45#include <sys/zfs_context.h>
46#include <sys/zfs_ioctl.h>
47#include <sys/spa.h>
48#include <sys/zfs_znode.h>
49#include <sys/zfs_onexit.h>
50#include <sys/zvol.h>
51#include <sys/dsl_scan.h>
52#include <sys/dsl_deadlist.h>
53#include <sys/dsl_destroy.h>
54#include <sys/dsl_userhold.h>
55#include <sys/dsl_bookmark.h>
56#include <sys/dmu_send.h>
57#include <sys/zio_checksum.h>
58#include <sys/zio_compress.h>
59#include <zfs_fletcher.h>
60
61SYSCTL_DECL(_vfs_zfs);
62
63/*
64 * The SPA supports block sizes up to 16MB.  However, very large blocks
65 * can have an impact on i/o latency (e.g. tying up a spinning disk for
66 * ~300ms), and also potentially on the memory allocator.  Therefore,
67 * we do not allow the recordsize to be set larger than zfs_max_recordsize
68 * (default 1MB).  Larger blocks can be created by changing this tunable,
69 * and pools with larger blocks can always be imported and used, regardless
70 * of this setting.
71 */
72int zfs_max_recordsize = 1 * 1024 * 1024;
73SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
74    &zfs_max_recordsize, 0,
75    "Maximum block size.  Expect dragons when tuning this.");
76
/*
 * Exchange the values of two uint64_t lvalues.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement: "if (c) SWITCH64(a, b); else ..." then parses as
 * intended, and the trailing semicolon at the call site is consumed.
 * Each argument is evaluated twice, so it must not have side effects.
 */
#define	SWITCH64(x, y) \
	do { \
		uint64_t __tmp = (x); \
		(x) = (y); \
		(y) = __tmp; \
	} while (0)
83
84#define	DS_REF_MAX	(1ULL << 62)
85
86extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
87
/*
 * Figure out how much of this delta should be propagated to the dsl_dir
 * layer.  If there's a refreservation, that space has already been
 * partially accounted for in our ancestors.
 */
93static int64_t
94parent_delta(dsl_dataset_t *ds, int64_t delta)
95{
96	dsl_dataset_phys_t *ds_phys;
97	uint64_t old_bytes, new_bytes;
98
99	if (ds->ds_reserved == 0)
100		return (delta);
101
102	ds_phys = dsl_dataset_phys(ds);
103	old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
104	new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
105
106	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
107	return (new_bytes - old_bytes);
108}
109
110void
111dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
112{
113	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
114	int compressed = BP_GET_PSIZE(bp);
115	int uncompressed = BP_GET_UCSIZE(bp);
116	int64_t delta;
117
118	dprintf_bp(bp, "ds=%p", ds);
119
120	ASSERT(dmu_tx_is_syncing(tx));
121	/* It could have been compressed away to nothing */
122	if (BP_IS_HOLE(bp))
123		return;
124	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
125	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
126	if (ds == NULL) {
127		dsl_pool_mos_diduse_space(tx->tx_pool,
128		    used, compressed, uncompressed);
129		return;
130	}
131
132	dmu_buf_will_dirty(ds->ds_dbuf, tx);
133	mutex_enter(&ds->ds_lock);
134	delta = parent_delta(ds, used);
135	dsl_dataset_phys(ds)->ds_referenced_bytes += used;
136	dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
137	dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
138	dsl_dataset_phys(ds)->ds_unique_bytes += used;
139
140	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
141		ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
142		    B_TRUE;
143	}
144
145	spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
146	if (f != SPA_FEATURE_NONE)
147		ds->ds_feature_activation_needed[f] = B_TRUE;
148
149	mutex_exit(&ds->ds_lock);
150	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
151	    compressed, uncompressed, tx);
152	dsl_dir_transfer_space(ds->ds_dir, used - delta,
153	    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
154}
155
156int
157dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
158    boolean_t async)
159{
160	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
161	int compressed = BP_GET_PSIZE(bp);
162	int uncompressed = BP_GET_UCSIZE(bp);
163
164	if (BP_IS_HOLE(bp))
165		return (0);
166
167	ASSERT(dmu_tx_is_syncing(tx));
168	ASSERT(bp->blk_birth <= tx->tx_txg);
169
170	if (ds == NULL) {
171		dsl_free(tx->tx_pool, tx->tx_txg, bp);
172		dsl_pool_mos_diduse_space(tx->tx_pool,
173		    -used, -compressed, -uncompressed);
174		return (used);
175	}
176	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
177
178	ASSERT(!ds->ds_is_snapshot);
179	dmu_buf_will_dirty(ds->ds_dbuf, tx);
180
181	if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
182		int64_t delta;
183
184		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
185		dsl_free(tx->tx_pool, tx->tx_txg, bp);
186
187		mutex_enter(&ds->ds_lock);
188		ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
189		    !DS_UNIQUE_IS_ACCURATE(ds));
190		delta = parent_delta(ds, -used);
191		dsl_dataset_phys(ds)->ds_unique_bytes -= used;
192		mutex_exit(&ds->ds_lock);
193		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
194		    delta, -compressed, -uncompressed, tx);
195		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
196		    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
197	} else {
198		dprintf_bp(bp, "putting on dead list: %s", "");
199		if (async) {
200			/*
201			 * We are here as part of zio's write done callback,
202			 * which means we're a zio interrupt thread.  We can't
203			 * call dsl_deadlist_insert() now because it may block
204			 * waiting for I/O.  Instead, put bp on the deferred
205			 * queue and let dsl_pool_sync() finish the job.
206			 */
207			bplist_append(&ds->ds_pending_deadlist, bp);
208		} else {
209			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
210		}
211		ASSERT3U(ds->ds_prev->ds_object, ==,
212		    dsl_dataset_phys(ds)->ds_prev_snap_obj);
213		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
214		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
215		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
216		    ds->ds_object && bp->blk_birth >
217		    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
218			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
219			mutex_enter(&ds->ds_prev->ds_lock);
220			dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
221			mutex_exit(&ds->ds_prev->ds_lock);
222		}
223		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
224			dsl_dir_transfer_space(ds->ds_dir, used,
225			    DD_USED_HEAD, DD_USED_SNAP, tx);
226		}
227	}
228	mutex_enter(&ds->ds_lock);
229	ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
230	dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
231	ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
232	dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
233	ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
234	dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
235	mutex_exit(&ds->ds_lock);
236
237	return (used);
238}
239
240uint64_t
241dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
242{
243	uint64_t trysnap = 0;
244
245	if (ds == NULL)
246		return (0);
247	/*
248	 * The snapshot creation could fail, but that would cause an
249	 * incorrect FALSE return, which would only result in an
250	 * overestimation of the amount of space that an operation would
251	 * consume, which is OK.
252	 *
253	 * There's also a small window where we could miss a pending
254	 * snapshot, because we could set the sync task in the quiescing
255	 * phase.  So this should only be used as a guess.
256	 */
257	if (ds->ds_trysnap_txg >
258	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
259		trysnap = ds->ds_trysnap_txg;
260	return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap));
261}
262
263boolean_t
264dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
265    uint64_t blk_birth)
266{
267	if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
268	    (bp != NULL && BP_IS_HOLE(bp)))
269		return (B_FALSE);
270
271	ddt_prefetch(dsl_dataset_get_spa(ds), bp);
272
273	return (B_TRUE);
274}
275
276static void
277dsl_dataset_evict(void *dbu)
278{
279	dsl_dataset_t *ds = dbu;
280
281	ASSERT(ds->ds_owner == NULL);
282
283	ds->ds_dbuf = NULL;
284
285	unique_remove(ds->ds_fsid_guid);
286
287	if (ds->ds_objset != NULL)
288		dmu_objset_evict(ds->ds_objset);
289
290	if (ds->ds_prev) {
291		dsl_dataset_rele(ds->ds_prev, ds);
292		ds->ds_prev = NULL;
293	}
294
295	bplist_destroy(&ds->ds_pending_deadlist);
296	if (ds->ds_deadlist.dl_os != NULL)
297		dsl_deadlist_close(&ds->ds_deadlist);
298	if (ds->ds_dir)
299		dsl_dir_async_rele(ds->ds_dir, ds);
300
301	ASSERT(!list_link_active(&ds->ds_synced_link));
302
303	list_destroy(&ds->ds_prop_cbs);
304	if (mutex_owned(&ds->ds_lock))
305		mutex_exit(&ds->ds_lock);
306	mutex_destroy(&ds->ds_lock);
307	if (mutex_owned(&ds->ds_opening_lock))
308		mutex_exit(&ds->ds_opening_lock);
309	mutex_destroy(&ds->ds_opening_lock);
310	mutex_destroy(&ds->ds_sendstream_lock);
311	refcount_destroy(&ds->ds_longholds);
312
313	kmem_free(ds, sizeof (dsl_dataset_t));
314}
315
316int
317dsl_dataset_get_snapname(dsl_dataset_t *ds)
318{
319	dsl_dataset_phys_t *headphys;
320	int err;
321	dmu_buf_t *headdbuf;
322	dsl_pool_t *dp = ds->ds_dir->dd_pool;
323	objset_t *mos = dp->dp_meta_objset;
324
325	if (ds->ds_snapname[0])
326		return (0);
327	if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
328		return (0);
329
330	err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
331	    FTAG, &headdbuf);
332	if (err != 0)
333		return (err);
334	headphys = headdbuf->db_data;
335	err = zap_value_search(dp->dp_meta_objset,
336	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
337	dmu_buf_rele(headdbuf, FTAG);
338	return (err);
339}
340
341int
342dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
343{
344	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
345	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
346	matchtype_t mt;
347	int err;
348
349	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
350		mt = MT_FIRST;
351	else
352		mt = MT_EXACT;
353
354	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
355	    value, mt, NULL, 0, NULL);
356	if (err == ENOTSUP && mt == MT_FIRST)
357		err = zap_lookup(mos, snapobj, name, 8, 1, value);
358	return (err);
359}
360
361int
362dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
363    boolean_t adj_cnt)
364{
365	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
366	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
367	matchtype_t mt;
368	int err;
369
370	dsl_dir_snap_cmtime_update(ds->ds_dir);
371
372	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
373		mt = MT_FIRST;
374	else
375		mt = MT_EXACT;
376
377	err = zap_remove_norm(mos, snapobj, name, mt, tx);
378	if (err == ENOTSUP && mt == MT_FIRST)
379		err = zap_remove(mos, snapobj, name, tx);
380
381	if (err == 0 && adj_cnt)
382		dsl_fs_ss_count_adjust(ds->ds_dir, -1,
383		    DD_FIELD_SNAPSHOT_COUNT, tx);
384
385	return (err);
386}
387
388boolean_t
389dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
390{
391	dmu_buf_t *dbuf = ds->ds_dbuf;
392	boolean_t result = B_FALSE;
393
394	if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
395	    ds->ds_object, DMU_BONUS_BLKID, tag)) {
396
397		if (ds == dmu_buf_get_user(dbuf))
398			result = B_TRUE;
399		else
400			dmu_buf_rele(dbuf, tag);
401	}
402
403	return (result);
404}
405
406int
407dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
408    dsl_dataset_t **dsp)
409{
410	objset_t *mos = dp->dp_meta_objset;
411	dmu_buf_t *dbuf;
412	dsl_dataset_t *ds;
413	int err;
414	dmu_object_info_t doi;
415
416	ASSERT(dsl_pool_config_held(dp));
417
418	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
419	if (err != 0)
420		return (err);
421
422	/* Make sure dsobj has the correct object type. */
423	dmu_object_info_from_db(dbuf, &doi);
424	if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
425		dmu_buf_rele(dbuf, tag);
426		return (SET_ERROR(EINVAL));
427	}
428
429	ds = dmu_buf_get_user(dbuf);
430	if (ds == NULL) {
431		dsl_dataset_t *winner = NULL;
432
433		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
434		ds->ds_dbuf = dbuf;
435		ds->ds_object = dsobj;
436		ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
437
438		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
439		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
440		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
441		refcount_create(&ds->ds_longholds);
442
443		bplist_create(&ds->ds_pending_deadlist);
444		dsl_deadlist_open(&ds->ds_deadlist,
445		    mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
446
447		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
448		    offsetof(dmu_sendarg_t, dsa_link));
449
450		list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
451		    offsetof(dsl_prop_cb_record_t, cbr_ds_node));
452
453		if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
454			for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
455				if (!(spa_feature_table[f].fi_flags &
456				    ZFEATURE_FLAG_PER_DATASET))
457					continue;
458				err = zap_contains(mos, dsobj,
459				    spa_feature_table[f].fi_guid);
460				if (err == 0) {
461					ds->ds_feature_inuse[f] = B_TRUE;
462				} else {
463					ASSERT3U(err, ==, ENOENT);
464					err = 0;
465				}
466			}
467		}
468
469		err = dsl_dir_hold_obj(dp,
470		    dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir);
471		if (err != 0) {
472			mutex_destroy(&ds->ds_lock);
473			mutex_destroy(&ds->ds_opening_lock);
474			mutex_destroy(&ds->ds_sendstream_lock);
475			refcount_destroy(&ds->ds_longholds);
476			bplist_destroy(&ds->ds_pending_deadlist);
477			dsl_deadlist_close(&ds->ds_deadlist);
478			kmem_free(ds, sizeof (dsl_dataset_t));
479			dmu_buf_rele(dbuf, tag);
480			return (err);
481		}
482
483		if (!ds->ds_is_snapshot) {
484			ds->ds_snapname[0] = '\0';
485			if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
486				err = dsl_dataset_hold_obj(dp,
487				    dsl_dataset_phys(ds)->ds_prev_snap_obj,
488				    ds, &ds->ds_prev);
489			}
490			if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
491				int zaperr = zap_lookup(mos, ds->ds_object,
492				    DS_FIELD_BOOKMARK_NAMES,
493				    sizeof (ds->ds_bookmarks), 1,
494				    &ds->ds_bookmarks);
495				if (zaperr != ENOENT)
496					VERIFY0(zaperr);
497			}
498		} else {
499			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
500				err = dsl_dataset_get_snapname(ds);
501			if (err == 0 &&
502			    dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
503				err = zap_count(
504				    ds->ds_dir->dd_pool->dp_meta_objset,
505				    dsl_dataset_phys(ds)->ds_userrefs_obj,
506				    &ds->ds_userrefs);
507			}
508		}
509
510		if (err == 0 && !ds->ds_is_snapshot) {
511			err = dsl_prop_get_int_ds(ds,
512			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
513			    &ds->ds_reserved);
514			if (err == 0) {
515				err = dsl_prop_get_int_ds(ds,
516				    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
517				    &ds->ds_quota);
518			}
519		} else {
520			ds->ds_reserved = ds->ds_quota = 0;
521		}
522
523		dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict, &ds->ds_dbuf);
524		if (err == 0)
525			winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
526
527		if (err != 0 || winner != NULL) {
528			bplist_destroy(&ds->ds_pending_deadlist);
529			dsl_deadlist_close(&ds->ds_deadlist);
530			if (ds->ds_prev)
531				dsl_dataset_rele(ds->ds_prev, ds);
532			dsl_dir_rele(ds->ds_dir, ds);
533			mutex_destroy(&ds->ds_lock);
534			mutex_destroy(&ds->ds_opening_lock);
535			mutex_destroy(&ds->ds_sendstream_lock);
536			refcount_destroy(&ds->ds_longholds);
537			kmem_free(ds, sizeof (dsl_dataset_t));
538			if (err != 0) {
539				dmu_buf_rele(dbuf, tag);
540				return (err);
541			}
542			ds = winner;
543		} else {
544			ds->ds_fsid_guid =
545			    unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
546		}
547	}
548	ASSERT3P(ds->ds_dbuf, ==, dbuf);
549	ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
550	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
551	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
552	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
553	*dsp = ds;
554	return (0);
555}
556
557int
558dsl_dataset_hold(dsl_pool_t *dp, const char *name,
559    void *tag, dsl_dataset_t **dsp)
560{
561	dsl_dir_t *dd;
562	const char *snapname;
563	uint64_t obj;
564	int err = 0;
565	dsl_dataset_t *ds;
566
567	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
568	if (err != 0)
569		return (err);
570
571	ASSERT(dsl_pool_config_held(dp));
572	obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
573	if (obj != 0)
574		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
575	else
576		err = SET_ERROR(ENOENT);
577
578	/* we may be looking for a snapshot */
579	if (err == 0 && snapname != NULL) {
580		dsl_dataset_t *snap_ds;
581
582		if (*snapname++ != '@') {
583			dsl_dataset_rele(ds, tag);
584			dsl_dir_rele(dd, FTAG);
585			return (SET_ERROR(ENOENT));
586		}
587
588		dprintf("looking for snapshot '%s'\n", snapname);
589		err = dsl_dataset_snap_lookup(ds, snapname, &obj);
590		if (err == 0)
591			err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
592		dsl_dataset_rele(ds, tag);
593
594		if (err == 0) {
595			mutex_enter(&snap_ds->ds_lock);
596			if (snap_ds->ds_snapname[0] == 0)
597				(void) strlcpy(snap_ds->ds_snapname, snapname,
598				    sizeof (snap_ds->ds_snapname));
599			mutex_exit(&snap_ds->ds_lock);
600			ds = snap_ds;
601		}
602	}
603	if (err == 0)
604		*dsp = ds;
605	dsl_dir_rele(dd, FTAG);
606	return (err);
607}
608
609int
610dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
611    void *tag, dsl_dataset_t **dsp)
612{
613	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
614	if (err != 0)
615		return (err);
616	if (!dsl_dataset_tryown(*dsp, tag)) {
617		dsl_dataset_rele(*dsp, tag);
618		*dsp = NULL;
619		return (SET_ERROR(EBUSY));
620	}
621	return (0);
622}
623
624int
625dsl_dataset_own(dsl_pool_t *dp, const char *name,
626    void *tag, dsl_dataset_t **dsp)
627{
628	int err = dsl_dataset_hold(dp, name, tag, dsp);
629	if (err != 0)
630		return (err);
631	if (!dsl_dataset_tryown(*dsp, tag)) {
632		dsl_dataset_rele(*dsp, tag);
633		return (SET_ERROR(EBUSY));
634	}
635	return (0);
636}
637
638/*
639 * See the comment above dsl_pool_hold() for details.  In summary, a long
640 * hold is used to prevent destruction of a dataset while the pool hold
641 * is dropped, allowing other concurrent operations (e.g. spa_sync()).
642 *
643 * The dataset and pool must be held when this function is called.  After it
644 * is called, the pool hold may be released while the dataset is still held
645 * and accessed.
646 */
647void
648dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
649{
650	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
651	(void) refcount_add(&ds->ds_longholds, tag);
652}
653
654void
655dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
656{
657	(void) refcount_remove(&ds->ds_longholds, tag);
658}
659
660/* Return B_TRUE if there are any long holds on this dataset. */
661boolean_t
662dsl_dataset_long_held(dsl_dataset_t *ds)
663{
664	return (!refcount_is_zero(&ds->ds_longholds));
665}
666
667void
668dsl_dataset_name(dsl_dataset_t *ds, char *name)
669{
670	if (ds == NULL) {
671		(void) strcpy(name, "mos");
672	} else {
673		dsl_dir_name(ds->ds_dir, name);
674		VERIFY0(dsl_dataset_get_snapname(ds));
675		if (ds->ds_snapname[0]) {
676			(void) strcat(name, "@");
677			/*
678			 * We use a "recursive" mutex so that we
679			 * can call dprintf_ds() with ds_lock held.
680			 */
681			if (!MUTEX_HELD(&ds->ds_lock)) {
682				mutex_enter(&ds->ds_lock);
683				(void) strcat(name, ds->ds_snapname);
684				mutex_exit(&ds->ds_lock);
685			} else {
686				(void) strcat(name, ds->ds_snapname);
687			}
688		}
689	}
690}
691
692void
693dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
694{
695	dmu_buf_rele(ds->ds_dbuf, tag);
696}
697
698void
699dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
700{
701	ASSERT3P(ds->ds_owner, ==, tag);
702	ASSERT(ds->ds_dbuf != NULL);
703
704	mutex_enter(&ds->ds_lock);
705	ds->ds_owner = NULL;
706	mutex_exit(&ds->ds_lock);
707	dsl_dataset_long_rele(ds, tag);
708	dsl_dataset_rele(ds, tag);
709}
710
711boolean_t
712dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
713{
714	boolean_t gotit = FALSE;
715
716	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
717	mutex_enter(&ds->ds_lock);
718	if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
719		ds->ds_owner = tag;
720		dsl_dataset_long_hold(ds, tag);
721		gotit = TRUE;
722	}
723	mutex_exit(&ds->ds_lock);
724	return (gotit);
725}
726
727boolean_t
728dsl_dataset_has_owner(dsl_dataset_t *ds)
729{
730	boolean_t rv;
731	mutex_enter(&ds->ds_lock);
732	rv = (ds->ds_owner != NULL);
733	mutex_exit(&ds->ds_lock);
734	return (rv);
735}
736
737static void
738dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
739{
740	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
741	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
742	uint64_t zero = 0;
743
744	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
745
746	spa_feature_incr(spa, f, tx);
747	dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
748
749	VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
750	    sizeof (zero), 1, &zero, tx));
751}
752
753void
754dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
755{
756	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
757	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
758
759	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
760
761	VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
762	spa_feature_decr(spa, f, tx);
763}
764
765uint64_t
766dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
767    uint64_t flags, dmu_tx_t *tx)
768{
769	dsl_pool_t *dp = dd->dd_pool;
770	dmu_buf_t *dbuf;
771	dsl_dataset_phys_t *dsphys;
772	uint64_t dsobj;
773	objset_t *mos = dp->dp_meta_objset;
774
775	if (origin == NULL)
776		origin = dp->dp_origin_snap;
777
778	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
779	ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
780	ASSERT(dmu_tx_is_syncing(tx));
781	ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
782
783	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
784	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
785	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
786	dmu_buf_will_dirty(dbuf, tx);
787	dsphys = dbuf->db_data;
788	bzero(dsphys, sizeof (dsl_dataset_phys_t));
789	dsphys->ds_dir_obj = dd->dd_object;
790	dsphys->ds_flags = flags;
791	dsphys->ds_fsid_guid = unique_create();
792	do {
793		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
794		    sizeof (dsphys->ds_guid));
795	} while (dsphys->ds_guid == 0);
796	dsphys->ds_snapnames_zapobj =
797	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
798	    DMU_OT_NONE, 0, tx);
799	dsphys->ds_creation_time = gethrestime_sec();
800	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
801
802	if (origin == NULL) {
803		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
804	} else {
805		dsl_dataset_t *ohds; /* head of the origin snapshot */
806
807		dsphys->ds_prev_snap_obj = origin->ds_object;
808		dsphys->ds_prev_snap_txg =
809		    dsl_dataset_phys(origin)->ds_creation_txg;
810		dsphys->ds_referenced_bytes =
811		    dsl_dataset_phys(origin)->ds_referenced_bytes;
812		dsphys->ds_compressed_bytes =
813		    dsl_dataset_phys(origin)->ds_compressed_bytes;
814		dsphys->ds_uncompressed_bytes =
815		    dsl_dataset_phys(origin)->ds_uncompressed_bytes;
816		dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
817
818		/*
819		 * Inherit flags that describe the dataset's contents
820		 * (INCONSISTENT) or properties (Case Insensitive).
821		 */
822		dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
823		    (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
824
825		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
826			if (origin->ds_feature_inuse[f])
827				dsl_dataset_activate_feature(dsobj, f, tx);
828		}
829
830		dmu_buf_will_dirty(origin->ds_dbuf, tx);
831		dsl_dataset_phys(origin)->ds_num_children++;
832
833		VERIFY0(dsl_dataset_hold_obj(dp,
834		    dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
835		    FTAG, &ohds));
836		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
837		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
838		dsl_dataset_rele(ohds, FTAG);
839
840		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
841			if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
842				dsl_dataset_phys(origin)->ds_next_clones_obj =
843				    zap_create(mos,
844				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
845			}
846			VERIFY0(zap_add_int(mos,
847			    dsl_dataset_phys(origin)->ds_next_clones_obj,
848			    dsobj, tx));
849		}
850
851		dmu_buf_will_dirty(dd->dd_dbuf, tx);
852		dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
853		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
854			if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
855				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
856				dsl_dir_phys(origin->ds_dir)->dd_clones =
857				    zap_create(mos,
858				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
859			}
860			VERIFY0(zap_add_int(mos,
861			    dsl_dir_phys(origin->ds_dir)->dd_clones,
862			    dsobj, tx));
863		}
864	}
865
866	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
867		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
868
869	dmu_buf_rele(dbuf, FTAG);
870
871	dmu_buf_will_dirty(dd->dd_dbuf, tx);
872	dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
873
874	return (dsobj);
875}
876
877static void
878dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
879{
880	objset_t *os;
881
882	VERIFY0(dmu_objset_from_ds(ds, &os));
883	bzero(&os->os_zil_header, sizeof (os->os_zil_header));
884	dsl_dataset_dirty(ds, tx);
885}
886
887uint64_t
888dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
889    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
890{
891	dsl_pool_t *dp = pdd->dd_pool;
892	uint64_t dsobj, ddobj;
893	dsl_dir_t *dd;
894
895	ASSERT(dmu_tx_is_syncing(tx));
896	ASSERT(lastname[0] != '@');
897
898	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
899	VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
900
901	dsobj = dsl_dataset_create_sync_dd(dd, origin,
902	    flags & ~DS_CREATE_FLAG_NODIRTY, tx);
903
904	dsl_deleg_set_create_perms(dd, tx, cr);
905
906	/*
907	 * Since we're creating a new node we know it's a leaf, so we can
908	 * initialize the counts if the limit feature is active.
909	 */
910	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
911		uint64_t cnt = 0;
912		objset_t *os = dd->dd_pool->dp_meta_objset;
913
914		dsl_dir_zapify(dd, tx);
915		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
916		    sizeof (cnt), 1, &cnt, tx));
917		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
918		    sizeof (cnt), 1, &cnt, tx));
919	}
920
921	dsl_dir_rele(dd, FTAG);
922
923	/*
924	 * If we are creating a clone, make sure we zero out any stale
925	 * data from the origin snapshots zil header.
926	 */
927	if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
928		dsl_dataset_t *ds;
929
930		VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
931		dsl_dataset_zero_zil(ds, tx);
932		dsl_dataset_rele(ds, FTAG);
933	}
934
935	return (dsobj);
936}
937
938#ifdef __FreeBSD__
939/* FreeBSD ioctl compat begin */
940struct destroyarg {
941	nvlist_t *nvl;
942	const char *snapname;
943};
944
945static int
946dsl_check_snap_cb(const char *name, void *arg)
947{
948	struct destroyarg *da = arg;
949	dsl_dataset_t *ds;
950	char *dsname;
951
952	dsname = kmem_asprintf("%s@%s", name, da->snapname);
953	fnvlist_add_boolean(da->nvl, dsname);
954	kmem_free(dsname, strlen(dsname) + 1);
955
956	return (0);
957}
958
959int
960dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
961    nvlist_t *snaps)
962{
963	struct destroyarg *da;
964	int err;
965
966	da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP);
967	da->nvl = snaps;
968	da->snapname = snapname;
969	err = dmu_objset_find(fsname, dsl_check_snap_cb, da,
970	    DS_FIND_CHILDREN);
971	kmem_free(da, sizeof (struct destroyarg));
972
973	return (err);
974}
975/* FreeBSD ioctl compat end */
976#endif /* __FreeBSD__ */
977
978/*
979 * The unique space in the head dataset can be calculated by subtracting
980 * the space used in the most recent snapshot, that is still being used
981 * in this file system, from the space currently in use.  To figure out
982 * the space in the most recent snapshot still in use, we need to take
983 * the total space used in the snapshot and subtract out the space that
984 * has been freed up since the snapshot was taken.
985 */
986void
987dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
988{
989	uint64_t mrs_used;
990	uint64_t dlused, dlcomp, dluncomp;
991
992	ASSERT(!ds->ds_is_snapshot);
993
994	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
995		mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
996	else
997		mrs_used = 0;
998
999	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
1000
1001	ASSERT3U(dlused, <=, mrs_used);
1002	dsl_dataset_phys(ds)->ds_unique_bytes =
1003	    dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
1004
1005	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
1006	    SPA_VERSION_UNIQUE_ACCURATE)
1007		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1008}
1009
1010void
1011dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
1012    dmu_tx_t *tx)
1013{
1014	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1015	uint64_t count;
1016	int err;
1017
1018	ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
1019	err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
1020	    obj, tx);
1021	/*
1022	 * The err should not be ENOENT, but a bug in a previous version
1023	 * of the code could cause upgrade_clones_cb() to not set
1024	 * ds_next_snap_obj when it should, leading to a missing entry.
1025	 * If we knew that the pool was created after
1026	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
1027	 * ENOENT.  However, at least we can check that we don't have
1028	 * too many entries in the next_clones_obj even after failing to
1029	 * remove this one.
1030	 */
1031	if (err != ENOENT)
1032		VERIFY0(err);
1033	ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
1034	    &count));
1035	ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
1036}
1037
1038
1039blkptr_t *
1040dsl_dataset_get_blkptr(dsl_dataset_t *ds)
1041{
1042	return (&dsl_dataset_phys(ds)->ds_bp);
1043}
1044
1045void
1046dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
1047{
1048	ASSERT(dmu_tx_is_syncing(tx));
1049	/* If it's the meta-objset, set dp_meta_rootbp */
1050	if (ds == NULL) {
1051		tx->tx_pool->dp_meta_rootbp = *bp;
1052	} else {
1053		dmu_buf_will_dirty(ds->ds_dbuf, tx);
1054		dsl_dataset_phys(ds)->ds_bp = *bp;
1055	}
1056}
1057
1058spa_t *
1059dsl_dataset_get_spa(dsl_dataset_t *ds)
1060{
1061	return (ds->ds_dir->dd_pool->dp_spa);
1062}
1063
1064void
1065dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
1066{
1067	dsl_pool_t *dp;
1068
1069	if (ds == NULL) /* this is the meta-objset */
1070		return;
1071
1072	ASSERT(ds->ds_objset != NULL);
1073
1074	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
1075		panic("dirtying snapshot!");
1076
1077	dp = ds->ds_dir->dd_pool;
1078
1079	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
1080		/* up the hold count until we can be written out */
1081		dmu_buf_add_ref(ds->ds_dbuf, ds);
1082	}
1083}
1084
1085boolean_t
1086dsl_dataset_is_dirty(dsl_dataset_t *ds)
1087{
1088	for (int t = 0; t < TXG_SIZE; t++) {
1089		if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
1090		    ds, t))
1091			return (B_TRUE);
1092	}
1093	return (B_FALSE);
1094}
1095
1096static int
1097dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1098{
1099	uint64_t asize;
1100
1101	if (!dmu_tx_is_syncing(tx))
1102		return (0);
1103
1104	/*
1105	 * If there's an fs-only reservation, any blocks that might become
1106	 * owned by the snapshot dataset must be accommodated by space
1107	 * outside of the reservation.
1108	 */
1109	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
1110	asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
1111	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
1112		return (SET_ERROR(ENOSPC));
1113
1114	/*
1115	 * Propagate any reserved space for this snapshot to other
1116	 * snapshot checks in this sync group.
1117	 */
1118	if (asize > 0)
1119		dsl_dir_willuse_space(ds->ds_dir, asize, tx);
1120
1121	return (0);
1122}
1123
/*
 * Argument block shared by dsl_dataset_snapshot_check() and
 * dsl_dataset_snapshot_sync().
 */
typedef struct dsl_dataset_snapshot_arg {
	/* one pair per snapshot; pair name is "<dataset>@<snapname>" */
	nvlist_t *ddsa_snaps;
	/* if non-NULL, applied to each new snapshot (ZPROP_SRC_LOCAL) */
	nvlist_t *ddsa_props;
	/* if non-NULL, receives an int32 error code keyed by name */
	nvlist_t *ddsa_errors;
	/* credentials handed to dsl_fs_ss_limit_check() */
	cred_t *ddsa_cr;
} dsl_dataset_snapshot_arg_t;
1130
1131int
1132dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
1133    dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
1134{
1135	int error;
1136	uint64_t value;
1137
1138	ds->ds_trysnap_txg = tx->tx_txg;
1139
1140	if (!dmu_tx_is_syncing(tx))
1141		return (0);
1142
1143	/*
1144	 * We don't allow multiple snapshots of the same txg.  If there
1145	 * is already one, try again.
1146	 */
1147	if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
1148		return (SET_ERROR(EAGAIN));
1149
1150	/*
1151	 * Check for conflicting snapshot name.
1152	 */
1153	error = dsl_dataset_snap_lookup(ds, snapname, &value);
1154	if (error == 0)
1155		return (SET_ERROR(EEXIST));
1156	if (error != ENOENT)
1157		return (error);
1158
1159	/*
1160	 * We don't allow taking snapshots of inconsistent datasets, such as
1161	 * those into which we are currently receiving.  However, if we are
1162	 * creating this snapshot as part of a receive, this check will be
1163	 * executed atomically with respect to the completion of the receive
1164	 * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
1165	 * case we ignore this, knowing it will be fixed up for us shortly in
1166	 * dmu_recv_end_sync().
1167	 */
1168	if (!recv && DS_IS_INCONSISTENT(ds))
1169		return (SET_ERROR(EBUSY));
1170
1171	/*
1172	 * Skip the check for temporary snapshots or if we have already checked
1173	 * the counts in dsl_dataset_snapshot_check. This means we really only
1174	 * check the count here when we're receiving a stream.
1175	 */
1176	if (cnt != 0 && cr != NULL) {
1177		error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
1178		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
1179		if (error != 0)
1180			return (error);
1181	}
1182
1183	error = dsl_dataset_snapshot_reserve_space(ds, tx);
1184	if (error != 0)
1185		return (error);
1186
1187	return (0);
1188}
1189
1190static int
1191dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
1192{
1193	dsl_dataset_snapshot_arg_t *ddsa = arg;
1194	dsl_pool_t *dp = dmu_tx_pool(tx);
1195	nvpair_t *pair;
1196	int rv = 0;
1197
1198	/*
1199	 * Pre-compute how many total new snapshots will be created for each
1200	 * level in the tree and below. This is needed for validating the
1201	 * snapshot limit when either taking a recursive snapshot or when
1202	 * taking multiple snapshots.
1203	 *
1204	 * The problem is that the counts are not actually adjusted when
1205	 * we are checking, only when we finally sync. For a single snapshot,
1206	 * this is easy, the count will increase by 1 at each node up the tree,
1207	 * but its more complicated for the recursive/multiple snapshot case.
1208	 *
1209	 * The dsl_fs_ss_limit_check function does recursively check the count
1210	 * at each level up the tree but since it is validating each snapshot
1211	 * independently we need to be sure that we are validating the complete
1212	 * count for the entire set of snapshots. We do this by rolling up the
1213	 * counts for each component of the name into an nvlist and then
1214	 * checking each of those cases with the aggregated count.
1215	 *
1216	 * This approach properly handles not only the recursive snapshot
1217	 * case (where we get all of those on the ddsa_snaps list) but also
1218	 * the sibling case (e.g. snapshot a/b and a/c so that we will also
1219	 * validate the limit on 'a' using a count of 2).
1220	 *
1221	 * We validate the snapshot names in the third loop and only report
1222	 * name errors once.
1223	 */
1224	if (dmu_tx_is_syncing(tx)) {
1225		nvlist_t *cnt_track = NULL;
1226		cnt_track = fnvlist_alloc();
1227
1228		/* Rollup aggregated counts into the cnt_track list */
1229		for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1230		    pair != NULL;
1231		    pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1232			char *pdelim;
1233			uint64_t val;
1234			char nm[MAXPATHLEN];
1235
1236			(void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
1237			pdelim = strchr(nm, '@');
1238			if (pdelim == NULL)
1239				continue;
1240			*pdelim = '\0';
1241
1242			do {
1243				if (nvlist_lookup_uint64(cnt_track, nm,
1244				    &val) == 0) {
1245					/* update existing entry */
1246					fnvlist_add_uint64(cnt_track, nm,
1247					    val + 1);
1248				} else {
1249					/* add to list */
1250					fnvlist_add_uint64(cnt_track, nm, 1);
1251				}
1252
1253				pdelim = strrchr(nm, '/');
1254				if (pdelim != NULL)
1255					*pdelim = '\0';
1256			} while (pdelim != NULL);
1257		}
1258
1259		/* Check aggregated counts at each level */
1260		for (pair = nvlist_next_nvpair(cnt_track, NULL);
1261		    pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
1262			int error = 0;
1263			char *name;
1264			uint64_t cnt = 0;
1265			dsl_dataset_t *ds;
1266
1267			name = nvpair_name(pair);
1268			cnt = fnvpair_value_uint64(pair);
1269			ASSERT(cnt > 0);
1270
1271			error = dsl_dataset_hold(dp, name, FTAG, &ds);
1272			if (error == 0) {
1273				error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
1274				    ZFS_PROP_SNAPSHOT_LIMIT, NULL,
1275				    ddsa->ddsa_cr);
1276				dsl_dataset_rele(ds, FTAG);
1277			}
1278
1279			if (error != 0) {
1280				if (ddsa->ddsa_errors != NULL)
1281					fnvlist_add_int32(ddsa->ddsa_errors,
1282					    name, error);
1283				rv = error;
1284				/* only report one error for this check */
1285				break;
1286			}
1287		}
1288		nvlist_free(cnt_track);
1289	}
1290
1291	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1292	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1293		int error = 0;
1294		dsl_dataset_t *ds;
1295		char *name, *atp;
1296		char dsname[MAXNAMELEN];
1297
1298		name = nvpair_name(pair);
1299		if (strlen(name) >= MAXNAMELEN)
1300			error = SET_ERROR(ENAMETOOLONG);
1301		if (error == 0) {
1302			atp = strchr(name, '@');
1303			if (atp == NULL)
1304				error = SET_ERROR(EINVAL);
1305			if (error == 0)
1306				(void) strlcpy(dsname, name, atp - name + 1);
1307		}
1308		if (error == 0)
1309			error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
1310		if (error == 0) {
1311			/* passing 0/NULL skips dsl_fs_ss_limit_check */
1312			error = dsl_dataset_snapshot_check_impl(ds,
1313			    atp + 1, tx, B_FALSE, 0, NULL);
1314			dsl_dataset_rele(ds, FTAG);
1315		}
1316
1317		if (error != 0) {
1318			if (ddsa->ddsa_errors != NULL) {
1319				fnvlist_add_int32(ddsa->ddsa_errors,
1320				    name, error);
1321			}
1322			rv = error;
1323		}
1324	}
1325
1326	return (rv);
1327}
1328
1329void
1330dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
1331    dmu_tx_t *tx)
1332{
1333	static zil_header_t zero_zil;
1334
1335	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1336	dmu_buf_t *dbuf;
1337	dsl_dataset_phys_t *dsphys;
1338	uint64_t dsobj, crtxg;
1339	objset_t *mos = dp->dp_meta_objset;
1340	objset_t *os;
1341
1342	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
1343
1344	/*
1345	 * If we are on an old pool, the zil must not be active, in which
1346	 * case it will be zeroed.  Usually zil_suspend() accomplishes this.
1347	 */
1348	ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
1349	    dmu_objset_from_ds(ds, &os) != 0 ||
1350	    bcmp(&os->os_phys->os_zil_header, &zero_zil,
1351	    sizeof (zero_zil)) == 0);
1352
1353	dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
1354
1355	/*
1356	 * The origin's ds_creation_txg has to be < TXG_INITIAL
1357	 */
1358	if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
1359		crtxg = 1;
1360	else
1361		crtxg = tx->tx_txg;
1362
1363	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
1364	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
1365	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
1366	dmu_buf_will_dirty(dbuf, tx);
1367	dsphys = dbuf->db_data;
1368	bzero(dsphys, sizeof (dsl_dataset_phys_t));
1369	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
1370	dsphys->ds_fsid_guid = unique_create();
1371	do {
1372		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
1373		    sizeof (dsphys->ds_guid));
1374	} while (dsphys->ds_guid == 0);
1375	dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
1376	dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
1377	dsphys->ds_next_snap_obj = ds->ds_object;
1378	dsphys->ds_num_children = 1;
1379	dsphys->ds_creation_time = gethrestime_sec();
1380	dsphys->ds_creation_txg = crtxg;
1381	dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
1382	dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
1383	dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
1384	dsphys->ds_uncompressed_bytes =
1385	    dsl_dataset_phys(ds)->ds_uncompressed_bytes;
1386	dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
1387	dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
1388	dmu_buf_rele(dbuf, FTAG);
1389
1390	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
1391		if (ds->ds_feature_inuse[f])
1392			dsl_dataset_activate_feature(dsobj, f, tx);
1393	}
1394
1395	ASSERT3U(ds->ds_prev != 0, ==,
1396	    dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
1397	if (ds->ds_prev) {
1398		uint64_t next_clones_obj =
1399		    dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
1400		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
1401		    ds->ds_object ||
1402		    dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
1403		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
1404		    ds->ds_object) {
1405			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
1406			ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
1407			    dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
1408			dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
1409		} else if (next_clones_obj != 0) {
1410			dsl_dataset_remove_from_next_clones(ds->ds_prev,
1411			    dsphys->ds_next_snap_obj, tx);
1412			VERIFY0(zap_add_int(mos,
1413			    next_clones_obj, dsobj, tx));
1414		}
1415	}
1416
1417	/*
1418	 * If we have a reference-reservation on this dataset, we will
1419	 * need to increase the amount of refreservation being charged
1420	 * since our unique space is going to zero.
1421	 */
1422	if (ds->ds_reserved) {
1423		int64_t delta;
1424		ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
1425		delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
1426		    ds->ds_reserved);
1427		dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
1428		    delta, 0, 0, tx);
1429	}
1430
1431	dmu_buf_will_dirty(ds->ds_dbuf, tx);
1432	dsl_dataset_phys(ds)->ds_deadlist_obj =
1433	    dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
1434	    dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
1435	dsl_deadlist_close(&ds->ds_deadlist);
1436	dsl_deadlist_open(&ds->ds_deadlist, mos,
1437	    dsl_dataset_phys(ds)->ds_deadlist_obj);
1438	dsl_deadlist_add_key(&ds->ds_deadlist,
1439	    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
1440
1441	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
1442	dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
1443	dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
1444	dsl_dataset_phys(ds)->ds_unique_bytes = 0;
1445	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
1446		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1447
1448	VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
1449	    snapname, 8, 1, &dsobj, tx));
1450
1451	if (ds->ds_prev)
1452		dsl_dataset_rele(ds->ds_prev, ds);
1453	VERIFY0(dsl_dataset_hold_obj(dp,
1454	    dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
1455
1456	dsl_scan_ds_snapshotted(ds, tx);
1457
1458	dsl_dir_snap_cmtime_update(ds->ds_dir);
1459
1460	spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
1461}
1462
1463static void
1464dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
1465{
1466	dsl_dataset_snapshot_arg_t *ddsa = arg;
1467	dsl_pool_t *dp = dmu_tx_pool(tx);
1468	nvpair_t *pair;
1469
1470	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1471	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1472		dsl_dataset_t *ds;
1473		char *name, *atp;
1474		char dsname[MAXNAMELEN];
1475
1476		name = nvpair_name(pair);
1477		atp = strchr(name, '@');
1478		(void) strlcpy(dsname, name, atp - name + 1);
1479		VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
1480
1481		dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
1482		if (ddsa->ddsa_props != NULL) {
1483			dsl_props_set_sync_impl(ds->ds_prev,
1484			    ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
1485		}
1486		dsl_dataset_rele(ds, FTAG);
1487	}
1488}
1489
1490/*
1491 * The snapshots must all be in the same pool.
1492 * All-or-nothing: if there are any failures, nothing will be modified.
1493 */
1494int
1495dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
1496{
1497	dsl_dataset_snapshot_arg_t ddsa;
1498	nvpair_t *pair;
1499	boolean_t needsuspend;
1500	int error;
1501	spa_t *spa;
1502	char *firstname;
1503	nvlist_t *suspended = NULL;
1504
1505	pair = nvlist_next_nvpair(snaps, NULL);
1506	if (pair == NULL)
1507		return (0);
1508	firstname = nvpair_name(pair);
1509
1510	error = spa_open(firstname, &spa, FTAG);
1511	if (error != 0)
1512		return (error);
1513	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1514	spa_close(spa, FTAG);
1515
1516	if (needsuspend) {
1517		suspended = fnvlist_alloc();
1518		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
1519		    pair = nvlist_next_nvpair(snaps, pair)) {
1520			char fsname[MAXNAMELEN];
1521			char *snapname = nvpair_name(pair);
1522			char *atp;
1523			void *cookie;
1524
1525			atp = strchr(snapname, '@');
1526			if (atp == NULL) {
1527				error = SET_ERROR(EINVAL);
1528				break;
1529			}
1530			(void) strlcpy(fsname, snapname, atp - snapname + 1);
1531
1532			error = zil_suspend(fsname, &cookie);
1533			if (error != 0)
1534				break;
1535			fnvlist_add_uint64(suspended, fsname,
1536			    (uintptr_t)cookie);
1537		}
1538	}
1539
1540	ddsa.ddsa_snaps = snaps;
1541	ddsa.ddsa_props = props;
1542	ddsa.ddsa_errors = errors;
1543	ddsa.ddsa_cr = CRED();
1544
1545	if (error == 0) {
1546		error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
1547		    dsl_dataset_snapshot_sync, &ddsa,
1548		    fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
1549	}
1550
1551	if (suspended != NULL) {
1552		for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
1553		    pair = nvlist_next_nvpair(suspended, pair)) {
1554			zil_resume((void *)(uintptr_t)
1555			    fnvpair_value_uint64(pair));
1556		}
1557		fnvlist_free(suspended);
1558	}
1559
1560#ifdef __FreeBSD__
1561#ifdef _KERNEL
1562	if (error == 0) {
1563		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
1564		    pair = nvlist_next_nvpair(snaps, pair)) {
1565			char *snapname = nvpair_name(pair);
1566			zvol_create_minors(snapname);
1567		}
1568	}
1569#endif
1570#endif
1571	return (error);
1572}
1573
/*
 * Argument block shared by dsl_dataset_snapshot_tmp_check() and
 * dsl_dataset_snapshot_tmp_sync().
 */
typedef struct dsl_dataset_snapshot_tmp_arg {
	/* name of the dataset to snapshot */
	const char *ddsta_fsname;
	/* name of the snapshot to create */
	const char *ddsta_snapname;
	/* passed through to dsl_dataset_user_hold_sync_one() */
	minor_t ddsta_cleanup_minor;
	/* tag of the user hold placed on the new snapshot */
	const char *ddsta_htag;
} dsl_dataset_snapshot_tmp_arg_t;
1580
1581static int
1582dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
1583{
1584	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1585	dsl_pool_t *dp = dmu_tx_pool(tx);
1586	dsl_dataset_t *ds;
1587	int error;
1588
1589	error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
1590	if (error != 0)
1591		return (error);
1592
1593	/* NULL cred means no limit check for tmp snapshot */
1594	error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
1595	    tx, B_FALSE, 0, NULL);
1596	if (error != 0) {
1597		dsl_dataset_rele(ds, FTAG);
1598		return (error);
1599	}
1600
1601	if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
1602		dsl_dataset_rele(ds, FTAG);
1603		return (SET_ERROR(ENOTSUP));
1604	}
1605	error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
1606	    B_TRUE, tx);
1607	if (error != 0) {
1608		dsl_dataset_rele(ds, FTAG);
1609		return (error);
1610	}
1611
1612	dsl_dataset_rele(ds, FTAG);
1613	return (0);
1614}
1615
1616static void
1617dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
1618{
1619	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1620	dsl_pool_t *dp = dmu_tx_pool(tx);
1621	dsl_dataset_t *ds;
1622
1623	VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
1624
1625	dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
1626	dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
1627	    ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
1628	dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
1629
1630	dsl_dataset_rele(ds, FTAG);
1631}
1632
1633int
1634dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
1635    minor_t cleanup_minor, const char *htag)
1636{
1637	dsl_dataset_snapshot_tmp_arg_t ddsta;
1638	int error;
1639	spa_t *spa;
1640	boolean_t needsuspend;
1641	void *cookie;
1642
1643	ddsta.ddsta_fsname = fsname;
1644	ddsta.ddsta_snapname = snapname;
1645	ddsta.ddsta_cleanup_minor = cleanup_minor;
1646	ddsta.ddsta_htag = htag;
1647
1648	error = spa_open(fsname, &spa, FTAG);
1649	if (error != 0)
1650		return (error);
1651	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1652	spa_close(spa, FTAG);
1653
1654	if (needsuspend) {
1655		error = zil_suspend(fsname, &cookie);
1656		if (error != 0)
1657			return (error);
1658	}
1659
1660	error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
1661	    dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
1662
1663	if (needsuspend)
1664		zil_resume(cookie);
1665	return (error);
1666}
1667
1668
1669void
1670dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
1671{
1672	ASSERT(dmu_tx_is_syncing(tx));
1673	ASSERT(ds->ds_objset != NULL);
1674	ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
1675
1676	/*
1677	 * in case we had to change ds_fsid_guid when we opened it,
1678	 * sync it out now.
1679	 */
1680	dmu_buf_will_dirty(ds->ds_dbuf, tx);
1681	dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
1682
1683	if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
1684		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
1685		    ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
1686		    &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
1687		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
1688		    ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
1689		    &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
1690		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
1691		    ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
1692		    &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
1693		ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
1694		ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
1695		ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
1696	}
1697
1698	dmu_objset_sync(ds->ds_objset, zio, tx);
1699
1700	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
1701		if (ds->ds_feature_activation_needed[f]) {
1702			if (ds->ds_feature_inuse[f])
1703				continue;
1704			dsl_dataset_activate_feature(ds->ds_object, f, tx);
1705			ds->ds_feature_inuse[f] = B_TRUE;
1706		}
1707	}
1708}
1709
1710static void
1711get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
1712{
1713	uint64_t count = 0;
1714	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1715	zap_cursor_t zc;
1716	zap_attribute_t za;
1717	nvlist_t *propval = fnvlist_alloc();
1718	nvlist_t *val = fnvlist_alloc();
1719
1720	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
1721
1722	/*
1723	 * There may be missing entries in ds_next_clones_obj
1724	 * due to a bug in a previous version of the code.
1725	 * Only trust it if it has the right number of entries.
1726	 */
1727	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
1728		VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
1729		    &count));
1730	}
1731	if (count != dsl_dataset_phys(ds)->ds_num_children - 1)
1732		goto fail;
1733	for (zap_cursor_init(&zc, mos,
1734	    dsl_dataset_phys(ds)->ds_next_clones_obj);
1735	    zap_cursor_retrieve(&zc, &za) == 0;
1736	    zap_cursor_advance(&zc)) {
1737		dsl_dataset_t *clone;
1738		char buf[ZFS_MAXNAMELEN];
1739		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
1740		    za.za_first_integer, FTAG, &clone));
1741		dsl_dir_name(clone->ds_dir, buf);
1742		fnvlist_add_boolean(val, buf);
1743		dsl_dataset_rele(clone, FTAG);
1744	}
1745	zap_cursor_fini(&zc);
1746	fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
1747	fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval);
1748fail:
1749	nvlist_free(val);
1750	nvlist_free(propval);
1751}
1752
1753static void
1754get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
1755{
1756	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1757
1758	if (dsl_dataset_has_resume_receive_state(ds)) {
1759		char *str;
1760		void *packed;
1761		uint8_t *compressed;
1762		uint64_t val;
1763		nvlist_t *token_nv = fnvlist_alloc();
1764		size_t packed_size, compressed_size;
1765
1766		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1767		    DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
1768			fnvlist_add_uint64(token_nv, "fromguid", val);
1769		}
1770		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1771		    DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
1772			fnvlist_add_uint64(token_nv, "object", val);
1773		}
1774		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1775		    DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
1776			fnvlist_add_uint64(token_nv, "offset", val);
1777		}
1778		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1779		    DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
1780			fnvlist_add_uint64(token_nv, "bytes", val);
1781		}
1782		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1783		    DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
1784			fnvlist_add_uint64(token_nv, "toguid", val);
1785		}
1786		char buf[256];
1787		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
1788		    DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
1789			fnvlist_add_string(token_nv, "toname", buf);
1790		}
1791		if (zap_contains(dp->dp_meta_objset, ds->ds_object,
1792		    DS_FIELD_RESUME_EMBEDOK) == 0) {
1793			fnvlist_add_boolean(token_nv, "embedok");
1794		}
1795		packed = fnvlist_pack(token_nv, &packed_size);
1796		fnvlist_free(token_nv);
1797		compressed = kmem_alloc(packed_size, KM_SLEEP);
1798
1799		compressed_size = gzip_compress(packed, compressed,
1800		    packed_size, packed_size, 6);
1801
1802		zio_cksum_t cksum;
1803		fletcher_4_native(compressed, compressed_size, NULL, &cksum);
1804
1805		str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP);
1806		for (int i = 0; i < compressed_size; i++) {
1807			(void) sprintf(str + i * 2, "%02x", compressed[i]);
1808		}
1809		str[compressed_size * 2] = '\0';
1810		char *propval = kmem_asprintf("%u-%llx-%llx-%s",
1811		    ZFS_SEND_RESUME_TOKEN_VERSION,
1812		    (longlong_t)cksum.zc_word[0],
1813		    (longlong_t)packed_size, str);
1814		dsl_prop_nvlist_add_string(nv,
1815		    ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
1816		kmem_free(packed, packed_size);
1817		kmem_free(str, compressed_size * 2 + 1);
1818		kmem_free(compressed, packed_size);
1819		strfree(propval);
1820	}
1821}
1822
1823void
1824dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
1825{
1826	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1827	uint64_t refd, avail, uobjs, aobjs, ratio;
1828
1829	ASSERT(dsl_pool_config_held(dp));
1830
1831	ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
1832	    (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
1833	    dsl_dataset_phys(ds)->ds_compressed_bytes);
1834
1835	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
1836	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
1837	    dsl_dataset_phys(ds)->ds_uncompressed_bytes);
1838
1839	if (ds->ds_is_snapshot) {
1840		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
1841		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
1842		    dsl_dataset_phys(ds)->ds_unique_bytes);
1843		get_clones_stat(ds, nv);
1844	} else {
1845		if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
1846			char buf[MAXNAMELEN];
1847			dsl_dataset_name(ds->ds_prev, buf);
1848			dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf);
1849		}
1850
1851		dsl_dir_stats(ds->ds_dir, nv);
1852	}
1853
1854	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
1855	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
1856	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
1857
1858	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
1859	    dsl_dataset_phys(ds)->ds_creation_time);
1860	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
1861	    dsl_dataset_phys(ds)->ds_creation_txg);
1862	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
1863	    ds->ds_quota);
1864	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
1865	    ds->ds_reserved);
1866	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
1867	    dsl_dataset_phys(ds)->ds_guid);
1868	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
1869	    dsl_dataset_phys(ds)->ds_unique_bytes);
1870	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
1871	    ds->ds_object);
1872	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
1873	    ds->ds_userrefs);
1874	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
1875	    DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
1876
1877	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
1878		uint64_t written, comp, uncomp;
1879		dsl_pool_t *dp = ds->ds_dir->dd_pool;
1880		dsl_dataset_t *prev;
1881
1882		int err = dsl_dataset_hold_obj(dp,
1883		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
1884		if (err == 0) {
1885			err = dsl_dataset_space_written(prev, ds, &written,
1886			    &comp, &uncomp);
1887			dsl_dataset_rele(prev, FTAG);
1888			if (err == 0) {
1889				dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
1890				    written);
1891			}
1892		}
1893	}
1894
1895	if (!dsl_dataset_is_snapshot(ds)) {
1896		/*
1897		 * A failed "newfs" (e.g. full) resumable receive leaves
1898		 * the stats set on this dataset.  Check here for the prop.
1899		 */
1900		get_receive_resume_stats(ds, nv);
1901
1902		/*
1903		 * A failed incremental resumable receive leaves the
1904		 * stats set on our child named "%recv".  Check the child
1905		 * for the prop.
1906		 */
1907		char recvname[ZFS_MAXNAMELEN];
1908		dsl_dataset_t *recv_ds;
1909		dsl_dataset_name(ds, recvname);
1910		(void) strcat(recvname, "/");
1911		(void) strcat(recvname, recv_clone_name);
1912		if (dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
1913			get_receive_resume_stats(recv_ds, nv);
1914			dsl_dataset_rele(recv_ds, FTAG);
1915		}
1916	}
1917}
1918
1919void
1920dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
1921{
1922	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1923	ASSERT(dsl_pool_config_held(dp));
1924
1925	stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg;
1926	stat->dds_inconsistent =
1927	    dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT;
1928	stat->dds_guid = dsl_dataset_phys(ds)->ds_guid;
1929	stat->dds_origin[0] = '\0';
1930	if (ds->ds_is_snapshot) {
1931		stat->dds_is_snapshot = B_TRUE;
1932		stat->dds_num_clones =
1933		    dsl_dataset_phys(ds)->ds_num_children - 1;
1934	} else {
1935		stat->dds_is_snapshot = B_FALSE;
1936		stat->dds_num_clones = 0;
1937
1938		if (dsl_dir_is_clone(ds->ds_dir)) {
1939			dsl_dataset_t *ods;
1940
1941			VERIFY0(dsl_dataset_hold_obj(dp,
1942			    dsl_dir_phys(ds->ds_dir)->dd_origin_obj,
1943			    FTAG, &ods));
1944			dsl_dataset_name(ods, stat->dds_origin);
1945			dsl_dataset_rele(ods, FTAG);
1946		}
1947	}
1948}
1949
1950uint64_t
1951dsl_dataset_fsid_guid(dsl_dataset_t *ds)
1952{
1953	return (ds->ds_fsid_guid);
1954}
1955
1956void
1957dsl_dataset_space(dsl_dataset_t *ds,
1958    uint64_t *refdbytesp, uint64_t *availbytesp,
1959    uint64_t *usedobjsp, uint64_t *availobjsp)
1960{
1961	*refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
1962	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
1963	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
1964		*availbytesp +=
1965		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
1966	if (ds->ds_quota != 0) {
1967		/*
1968		 * Adjust available bytes according to refquota
1969		 */
1970		if (*refdbytesp < ds->ds_quota)
1971			*availbytesp = MIN(*availbytesp,
1972			    ds->ds_quota - *refdbytesp);
1973		else
1974			*availbytesp = 0;
1975	}
1976	*usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
1977	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
1978}
1979
1980boolean_t
1981dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
1982{
1983	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1984
1985	ASSERT(dsl_pool_config_held(dp));
1986	if (snap == NULL)
1987		return (B_FALSE);
1988	if (dsl_dataset_phys(ds)->ds_bp.blk_birth >
1989	    dsl_dataset_phys(snap)->ds_creation_txg) {
1990		objset_t *os, *os_snap;
1991		/*
1992		 * It may be that only the ZIL differs, because it was
1993		 * reset in the head.  Don't count that as being
1994		 * modified.
1995		 */
1996		if (dmu_objset_from_ds(ds, &os) != 0)
1997			return (B_TRUE);
1998		if (dmu_objset_from_ds(snap, &os_snap) != 0)
1999			return (B_TRUE);
2000		return (bcmp(&os->os_phys->os_meta_dnode,
2001		    &os_snap->os_phys->os_meta_dnode,
2002		    sizeof (os->os_phys->os_meta_dnode)) != 0);
2003	}
2004	return (B_FALSE);
2005}
2006
/*
 * Argument block for the snapshot rename check and sync functions.
 */
typedef struct dsl_dataset_rename_snapshot_arg {
	/* dataset whose snapshot is to be renamed */
	const char *ddrsa_fsname;
	/* current snapshot name; datasets lacking it are skipped */
	const char *ddrsa_oldsnapname;
	/* desired snapshot name; must not exist yet */
	const char *ddrsa_newsnapname;
	/* if set, also visit children (DS_FIND_CHILDREN) */
	boolean_t ddrsa_recursive;
	/* tx read back by the sync implementation */
	dmu_tx_t *ddrsa_tx;
} dsl_dataset_rename_snapshot_arg_t;
2014
2015/* ARGSUSED */
2016static int
2017dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
2018    dsl_dataset_t *hds, void *arg)
2019{
2020	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2021	int error;
2022	uint64_t val;
2023
2024	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
2025	if (error != 0) {
2026		/* ignore nonexistent snapshots */
2027		return (error == ENOENT ? 0 : error);
2028	}
2029
2030	/* new name should not exist */
2031	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
2032	if (error == 0)
2033		error = SET_ERROR(EEXIST);
2034	else if (error == ENOENT)
2035		error = 0;
2036
2037	/* dataset name + 1 for the "@" + the new snapshot name must fit */
2038	if (dsl_dir_namelen(hds->ds_dir) + 1 +
2039	    strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN)
2040		error = SET_ERROR(ENAMETOOLONG);
2041
2042	return (error);
2043}
2044
2045static int
2046dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
2047{
2048	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2049	dsl_pool_t *dp = dmu_tx_pool(tx);
2050	dsl_dataset_t *hds;
2051	int error;
2052
2053	error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
2054	if (error != 0)
2055		return (error);
2056
2057	if (ddrsa->ddrsa_recursive) {
2058		error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
2059		    dsl_dataset_rename_snapshot_check_impl, ddrsa,
2060		    DS_FIND_CHILDREN);
2061	} else {
2062		error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
2063	}
2064	dsl_dataset_rele(hds, FTAG);
2065	return (error);
2066}
2067
2068static int
2069dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
2070    dsl_dataset_t *hds, void *arg)
2071{
2072#ifdef __FreeBSD__
2073#ifdef _KERNEL
2074	char *oldname, *newname;
2075#endif
2076#endif
2077	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2078	dsl_dataset_t *ds;
2079	uint64_t val;
2080	dmu_tx_t *tx = ddrsa->ddrsa_tx;
2081	int error;
2082
2083	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
2084	ASSERT(error == 0 || error == ENOENT);
2085	if (error == ENOENT) {
2086		/* ignore nonexistent snapshots */
2087		return (0);
2088	}
2089
2090	VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
2091
2092	/* log before we change the name */
2093	spa_history_log_internal_ds(ds, "rename", tx,
2094	    "-> @%s", ddrsa->ddrsa_newsnapname);
2095
2096	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
2097	    B_FALSE));
2098	mutex_enter(&ds->ds_lock);
2099	(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
2100	mutex_exit(&ds->ds_lock);
2101	VERIFY0(zap_add(dp->dp_meta_objset,
2102	    dsl_dataset_phys(hds)->ds_snapnames_zapobj,
2103	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));
2104
2105#ifdef __FreeBSD__
2106#ifdef _KERNEL
2107	oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2108	newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2109	snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
2110	    ddrsa->ddrsa_oldsnapname);
2111	snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
2112	    ddrsa->ddrsa_newsnapname);
2113	zfsvfs_update_fromname(oldname, newname);
2114	zvol_rename_minors(oldname, newname);
2115	kmem_free(newname, MAXPATHLEN);
2116	kmem_free(oldname, MAXPATHLEN);
2117#endif
2118#endif
2119	dsl_dataset_rele(ds, FTAG);
2120
2121	return (0);
2122}
2123
2124static void
2125dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
2126{
2127	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
2128	dsl_pool_t *dp = dmu_tx_pool(tx);
2129	dsl_dataset_t *hds;
2130
2131	VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
2132	ddrsa->ddrsa_tx = tx;
2133	if (ddrsa->ddrsa_recursive) {
2134		VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
2135		    dsl_dataset_rename_snapshot_sync_impl, ddrsa,
2136		    DS_FIND_CHILDREN));
2137	} else {
2138		VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
2139	}
2140	dsl_dataset_rele(hds, FTAG);
2141}
2142
2143int
2144dsl_dataset_rename_snapshot(const char *fsname,
2145    const char *oldsnapname, const char *newsnapname, boolean_t recursive)
2146{
2147	dsl_dataset_rename_snapshot_arg_t ddrsa;
2148
2149	ddrsa.ddrsa_fsname = fsname;
2150	ddrsa.ddrsa_oldsnapname = oldsnapname;
2151	ddrsa.ddrsa_newsnapname = newsnapname;
2152	ddrsa.ddrsa_recursive = recursive;
2153
2154	return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
2155	    dsl_dataset_rename_snapshot_sync, &ddrsa,
2156	    1, ZFS_SPACE_CHECK_RESERVED));
2157}
2158
2159/*
2160 * If we're doing an ownership handoff, we need to make sure that there is
2161 * only one long hold on the dataset.  We're not allowed to change anything here
2162 * so we don't permanently release the long hold or regular hold here.  We want
2163 * to do this only when syncing to avoid the dataset unexpectedly going away
2164 * when we release the long hold.
2165 */
2166static int
2167dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
2168{
2169	boolean_t held;
2170
2171	if (!dmu_tx_is_syncing(tx))
2172		return (0);
2173
2174	if (owner != NULL) {
2175		VERIFY3P(ds->ds_owner, ==, owner);
2176		dsl_dataset_long_rele(ds, owner);
2177	}
2178
2179	held = dsl_dataset_long_held(ds);
2180
2181	if (owner != NULL)
2182		dsl_dataset_long_hold(ds, owner);
2183
2184	if (held)
2185		return (SET_ERROR(EBUSY));
2186
2187	return (0);
2188}
2189
/*
 * Argument bundle shared by the check and sync halves of the rollback
 * sync task.
 */
typedef struct dsl_dataset_rollback_arg {
	const char *ddra_fsname;	/* dataset to roll back */
	void *ddra_owner;		/* owner for the handoff check, or NULL */
	nvlist_t *ddra_result;		/* receives "target" = snapshot name */
} dsl_dataset_rollback_arg_t;
2195
2196static int
2197dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
2198{
2199	dsl_dataset_rollback_arg_t *ddra = arg;
2200	dsl_pool_t *dp = dmu_tx_pool(tx);
2201	dsl_dataset_t *ds;
2202	int64_t unused_refres_delta;
2203	int error;
2204
2205	error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
2206	if (error != 0)
2207		return (error);
2208
2209	/* must not be a snapshot */
2210	if (ds->ds_is_snapshot) {
2211		dsl_dataset_rele(ds, FTAG);
2212		return (SET_ERROR(EINVAL));
2213	}
2214
2215	/* must have a most recent snapshot */
2216	if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
2217		dsl_dataset_rele(ds, FTAG);
2218		return (SET_ERROR(EINVAL));
2219	}
2220
2221	/* must not have any bookmarks after the most recent snapshot */
2222	nvlist_t *proprequest = fnvlist_alloc();
2223	fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
2224	nvlist_t *bookmarks = fnvlist_alloc();
2225	error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
2226	fnvlist_free(proprequest);
2227	if (error != 0)
2228		return (error);
2229	for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
2230	    pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
2231		nvlist_t *valuenv =
2232		    fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
2233		    zfs_prop_to_name(ZFS_PROP_CREATETXG));
2234		uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
2235		if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
2236			fnvlist_free(bookmarks);
2237			dsl_dataset_rele(ds, FTAG);
2238			return (SET_ERROR(EEXIST));
2239		}
2240	}
2241	fnvlist_free(bookmarks);
2242
2243	error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
2244	if (error != 0) {
2245		dsl_dataset_rele(ds, FTAG);
2246		return (error);
2247	}
2248
2249	/*
2250	 * Check if the snap we are rolling back to uses more than
2251	 * the refquota.
2252	 */
2253	if (ds->ds_quota != 0 &&
2254	    dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
2255		dsl_dataset_rele(ds, FTAG);
2256		return (SET_ERROR(EDQUOT));
2257	}
2258
2259	/*
2260	 * When we do the clone swap, we will temporarily use more space
2261	 * due to the refreservation (the head will no longer have any
2262	 * unique space, so the entire amount of the refreservation will need
2263	 * to be free).  We will immediately destroy the clone, freeing
2264	 * this space, but the freeing happens over many txg's.
2265	 */
2266	unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
2267	    dsl_dataset_phys(ds)->ds_unique_bytes);
2268
2269	if (unused_refres_delta > 0 &&
2270	    unused_refres_delta >
2271	    dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
2272		dsl_dataset_rele(ds, FTAG);
2273		return (SET_ERROR(ENOSPC));
2274	}
2275
2276	dsl_dataset_rele(ds, FTAG);
2277	return (0);
2278}
2279
/*
 * Sync function of the rollback sync task.  Records the name of the most
 * recent snapshot in the result nvlist, creates a temporary "%rollback"
 * clone of that snapshot, swaps it with the head, and destroys the clone.
 */
static void
dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_rollback_arg_t *ddra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds, *clone;
	uint64_t cloneobj;
	char namebuf[ZFS_MAXNAMELEN];

	VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));

	/* report which snapshot we are rolling back to */
	dsl_dataset_name(ds->ds_prev, namebuf);
	fnvlist_add_string(ddra->ddra_result, "target", namebuf);

	/* temporary clone of the most recent snapshot */
	cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
	    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);

	VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));

	/* exchange the clone with the head, then zero the head's ZIL */
	dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
	dsl_dataset_zero_zil(ds, tx);

	/* the temporary clone is no longer needed */
	dsl_destroy_head_sync_impl(clone, tx);

	dsl_dataset_rele(clone, FTAG);
	dsl_dataset_rele(ds, FTAG);
}
2307
2308/*
2309 * Rolls back the given filesystem or volume to the most recent snapshot.
2310 * The name of the most recent snapshot will be returned under key "target"
2311 * in the result nvlist.
2312 *
2313 * If owner != NULL:
2314 * - The existing dataset MUST be owned by the specified owner at entry
2315 * - Upon return, dataset will still be held by the same owner, whether we
2316 *   succeed or not.
2317 *
2318 * This mode is required any time the existing filesystem is mounted.  See
2319 * notes above zfs_suspend_fs() for further details.
2320 */
2321int
2322dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result)
2323{
2324	dsl_dataset_rollback_arg_t ddra;
2325
2326	ddra.ddra_fsname = fsname;
2327	ddra.ddra_owner = owner;
2328	ddra.ddra_result = result;
2329
2330	return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
2331	    dsl_dataset_rollback_sync, &ddra,
2332	    1, ZFS_SPACE_CHECK_RESERVED));
2333}
2334
/*
 * One element of a snapshot list built by snaplist_make(): a held dataset
 * plus the linkage that places it on the list.
 */
struct promotenode {
	list_node_t link;	/* list linkage (see snaplist_make()) */
	dsl_dataset_t *ds;	/* held dataset; released by snaplist_destroy() */
};
2339
/*
 * Argument bundle shared by the check and sync halves of the promote sync
 * task.  The dataset pointers and snapshot lists are (re)populated by
 * promote_hold() and torn down by promote_rele(); the space totals are
 * computed by the check function and consumed by the sync function.
 */
typedef struct dsl_dataset_promote_arg {
	const char *ddpa_clonename;	/* name of the clone being promoted */
	dsl_dataset_t *ddpa_clone;	/* held clone head dataset */
	/*
	 * shared_snaps: the clone's origin snapshot and those before it
	 * clone_snaps:  the clone head and its snapshots back to the origin
	 * origin_snaps: the origin's head and its snapshots after the origin
	 */
	list_t shared_snaps, origin_snaps, clone_snaps;
	dsl_dataset_t *origin_origin; /* origin of the origin */
	/*
	 * used/comp/uncomp: space moving from the origin's dir to the clone's
	 * unique: the origin snapshot's new unique bytes
	 * cloneusedsnap/originusedsnap: snapshot-used space after promotion
	 */
	uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
	char *err_ds;	/* receives the name of a conflicting snapshot */
	cred_t *cr;	/* passed to dsl_dir_transfer_possible() */
} dsl_dataset_promote_arg_t;
2349
/* Forward declarations of promote helpers defined further down. */
static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
    void *tag);
static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
2354
/*
 * Check function of the promote sync task.
 *
 * Takes the holds via promote_hold() and rejects clones flagged
 * DS_FLAG_NOPROMOTE (EXDEV).  The remaining, expensive checks run only in
 * syncing context: every shared snapshot must be free of long holds
 * (EBUSY), its name must fit under the clone (ENAMETOOLONG) and must not
 * already exist there (EEXIST, with the name copied to ddpa->err_ds), and
 * the space/limit transfer must be possible.  While doing so the space
 * totals in ddpa are filled in.  All holds are dropped before returning.
 */
static int
dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_promote_arg_t *ddpa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *hds;
	struct promotenode *snap;
	dsl_dataset_t *origin_ds;
	int err;
	uint64_t unused;
	uint64_t ss_mv_cnt;
	size_t max_snap_len;

	err = promote_hold(ddpa, dp, FTAG);
	if (err != 0)
		return (err);

	hds = ddpa->ddpa_clone;
	/* room left for a snapshot name after "<clonename>@" */
	max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;

	if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
		promote_rele(ddpa, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * Compute and check the amount of space to transfer.  Since this is
	 * so expensive, don't do the preliminary check.
	 */
	if (!dmu_tx_is_syncing(tx)) {
		promote_rele(ddpa, FTAG);
		return (0);
	}

	/* the head of shared_snaps is the clone's origin snapshot */
	snap = list_head(&ddpa->shared_snaps);
	origin_ds = snap->ds;

	/* compute origin's new unique space */
	snap = list_tail(&ddpa->clone_snaps);
	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
	    origin_ds->ds_object);
	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
	    dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
	    &ddpa->unique, &unused, &unused);

	/*
	 * Walk the snapshots that we are moving
	 *
	 * Compute space to transfer.  Consider the incremental changes
	 * to used by each snapshot:
	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
	 * So each snapshot gave birth to:
	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
	 * So a sequence would look like:
	 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
	 * Which simplifies to:
	 * uN + kN + kN-1 + ... + k1 + k0
	 * Note however, if we stop before we reach the ORIGIN we get:
	 * uN + kN + kN-1 + ... + kM - uM-1
	 */
	ss_mv_cnt = 0;
	ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
	ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
	ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
	for (snap = list_head(&ddpa->shared_snaps); snap;
	    snap = list_next(&ddpa->shared_snaps, snap)) {
		uint64_t val, dlused, dlcomp, dluncomp;
		dsl_dataset_t *ds = snap->ds;

		/* count of snapshots that will move to the clone's dir */
		ss_mv_cnt++;

		/*
		 * If there are long holds, we won't be able to evict
		 * the objset.
		 */
		if (dsl_dataset_long_held(ds)) {
			err = SET_ERROR(EBUSY);
			goto out;
		}

		/* Check that the snapshot name does not conflict */
		VERIFY0(dsl_dataset_get_snapname(ds));
		if (strlen(ds->ds_snapname) >= max_snap_len) {
			err = SET_ERROR(ENAMETOOLONG);
			goto out;
		}
		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
		if (err == 0) {
			/*
			 * NOTE(review): assumes ddpa->err_ds is a non-NULL
			 * buffer large enough for a snapshot name -- confirm
			 * against all callers of dsl_dataset_promote().
			 */
			(void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
			err = SET_ERROR(EEXIST);
			goto out;
		}
		/* ENOENT is the expected outcome; anything else is fatal */
		if (err != ENOENT)
			goto out;

		/* The very first snapshot does not have a deadlist */
		if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
			continue;

		dsl_deadlist_space(&ds->ds_deadlist,
		    &dlused, &dlcomp, &dluncomp);
		ddpa->used += dlused;
		ddpa->comp += dlcomp;
		ddpa->uncomp += dluncomp;
	}

	/*
	 * If we are a clone of a clone then we never reached ORIGIN,
	 * so we need to subtract out the clone origin's used space.
	 */
	if (ddpa->origin_origin) {
		ddpa->used -=
		    dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
		ddpa->comp -=
		    dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
		ddpa->uncomp -=
		    dsl_dataset_phys(ddpa->origin_origin)->
		    ds_uncompressed_bytes;
	}

	/* Check that there is enough space and limit headroom here */
	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
	    0, ss_mv_cnt, ddpa->used, ddpa->cr);
	if (err != 0)
		goto out;

	/*
	 * Compute the amounts of space that will be used by snapshots
	 * after the promotion (for both origin and clone).  For each,
	 * it is the amount of space that will be on all of their
	 * deadlists (that was not born before their new origin).
	 */
	if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		uint64_t space;

		/*
		 * Note, typically this will not be a clone of a clone,
		 * so dd_origin_txg will be < TXG_INITIAL, so
		 * these snaplist_space() -> dsl_deadlist_space_range()
		 * calls will be fast because they do not have to
		 * iterate over all bps.
		 */
		snap = list_head(&ddpa->origin_snaps);
		err = snaplist_space(&ddpa->shared_snaps,
		    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
		if (err != 0)
			goto out;

		err = snaplist_space(&ddpa->clone_snaps,
		    snap->ds->ds_dir->dd_origin_txg, &space);
		if (err != 0)
			goto out;
		ddpa->cloneusedsnap += space;
	}
	if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
	    DD_FLAG_USED_BREAKDOWN) {
		err = snaplist_space(&ddpa->origin_snaps,
		    dsl_dataset_phys(origin_ds)->ds_creation_txg,
		    &ddpa->originusedsnap);
		if (err != 0)
			goto out;
	}

out:
	/* common exit: drop every hold taken by promote_hold() */
	promote_rele(ddpa, FTAG);
	return (err);
}
2522
2523static void
2524dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
2525{
2526	dsl_dataset_promote_arg_t *ddpa = arg;
2527	dsl_pool_t *dp = dmu_tx_pool(tx);
2528	dsl_dataset_t *hds;
2529	struct promotenode *snap;
2530	dsl_dataset_t *origin_ds;
2531	dsl_dataset_t *origin_head;
2532	dsl_dir_t *dd;
2533	dsl_dir_t *odd = NULL;
2534	uint64_t oldnext_obj;
2535	int64_t delta;
2536#if defined(__FreeBSD__) && defined(_KERNEL)
2537	char *oldname, *newname;
2538#endif
2539
2540	VERIFY0(promote_hold(ddpa, dp, FTAG));
2541	hds = ddpa->ddpa_clone;
2542
2543	ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
2544
2545	snap = list_head(&ddpa->shared_snaps);
2546	origin_ds = snap->ds;
2547	dd = hds->ds_dir;
2548
2549	snap = list_head(&ddpa->origin_snaps);
2550	origin_head = snap->ds;
2551
2552	/*
2553	 * We need to explicitly open odd, since origin_ds's dd will be
2554	 * changing.
2555	 */
2556	VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
2557	    NULL, FTAG, &odd));
2558
2559	/* change origin's next snap */
2560	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
2561	oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
2562	snap = list_tail(&ddpa->clone_snaps);
2563	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
2564	    origin_ds->ds_object);
2565	dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
2566
2567	/* change the origin's next clone */
2568	if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
2569		dsl_dataset_remove_from_next_clones(origin_ds,
2570		    snap->ds->ds_object, tx);
2571		VERIFY0(zap_add_int(dp->dp_meta_objset,
2572		    dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
2573		    oldnext_obj, tx));
2574	}
2575
2576	/* change origin */
2577	dmu_buf_will_dirty(dd->dd_dbuf, tx);
2578	ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
2579	dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
2580	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
2581	dmu_buf_will_dirty(odd->dd_dbuf, tx);
2582	dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
2583	origin_head->ds_dir->dd_origin_txg =
2584	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
2585
2586	/* change dd_clone entries */
2587	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2588		VERIFY0(zap_remove_int(dp->dp_meta_objset,
2589		    dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
2590		VERIFY0(zap_add_int(dp->dp_meta_objset,
2591		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
2592		    hds->ds_object, tx));
2593
2594		VERIFY0(zap_remove_int(dp->dp_meta_objset,
2595		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
2596		    origin_head->ds_object, tx));
2597		if (dsl_dir_phys(dd)->dd_clones == 0) {
2598			dsl_dir_phys(dd)->dd_clones =
2599			    zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
2600			    DMU_OT_NONE, 0, tx);
2601		}
2602		VERIFY0(zap_add_int(dp->dp_meta_objset,
2603		    dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
2604	}
2605
2606#if defined(__FreeBSD__) && defined(_KERNEL)
2607	/* Take the spa_namespace_lock early so zvol renames don't deadlock. */
2608	mutex_enter(&spa_namespace_lock);
2609
2610	oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2611	newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2612#endif
2613
2614	/* move snapshots to this dir */
2615	for (snap = list_head(&ddpa->shared_snaps); snap;
2616	    snap = list_next(&ddpa->shared_snaps, snap)) {
2617		dsl_dataset_t *ds = snap->ds;
2618
2619		/*
2620		 * Property callbacks are registered to a particular
2621		 * dsl_dir.  Since ours is changing, evict the objset
2622		 * so that they will be unregistered from the old dsl_dir.
2623		 */
2624		if (ds->ds_objset) {
2625			dmu_objset_evict(ds->ds_objset);
2626			ds->ds_objset = NULL;
2627		}
2628
2629		/* move snap name entry */
2630		VERIFY0(dsl_dataset_get_snapname(ds));
2631		VERIFY0(dsl_dataset_snap_remove(origin_head,
2632		    ds->ds_snapname, tx, B_TRUE));
2633		VERIFY0(zap_add(dp->dp_meta_objset,
2634		    dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
2635		    8, 1, &ds->ds_object, tx));
2636		dsl_fs_ss_count_adjust(hds->ds_dir, 1,
2637		    DD_FIELD_SNAPSHOT_COUNT, tx);
2638
2639		/* change containing dsl_dir */
2640		dmu_buf_will_dirty(ds->ds_dbuf, tx);
2641		ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
2642		dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
2643		ASSERT3P(ds->ds_dir, ==, odd);
2644		dsl_dir_rele(ds->ds_dir, ds);
2645		VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
2646		    NULL, ds, &ds->ds_dir));
2647
2648#if defined(__FreeBSD__) && defined(_KERNEL)
2649		dsl_dataset_name(ds, newname);
2650		zfsvfs_update_fromname(oldname, newname);
2651		zvol_rename_minors(oldname, newname);
2652#endif
2653
2654		/* move any clone references */
2655		if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
2656		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2657			zap_cursor_t zc;
2658			zap_attribute_t za;
2659
2660			for (zap_cursor_init(&zc, dp->dp_meta_objset,
2661			    dsl_dataset_phys(ds)->ds_next_clones_obj);
2662			    zap_cursor_retrieve(&zc, &za) == 0;
2663			    zap_cursor_advance(&zc)) {
2664				dsl_dataset_t *cnds;
2665				uint64_t o;
2666
2667				if (za.za_first_integer == oldnext_obj) {
2668					/*
2669					 * We've already moved the
2670					 * origin's reference.
2671					 */
2672					continue;
2673				}
2674
2675				VERIFY0(dsl_dataset_hold_obj(dp,
2676				    za.za_first_integer, FTAG, &cnds));
2677				o = dsl_dir_phys(cnds->ds_dir)->
2678				    dd_head_dataset_obj;
2679
2680				VERIFY0(zap_remove_int(dp->dp_meta_objset,
2681				    dsl_dir_phys(odd)->dd_clones, o, tx));
2682				VERIFY0(zap_add_int(dp->dp_meta_objset,
2683				    dsl_dir_phys(dd)->dd_clones, o, tx));
2684				dsl_dataset_rele(cnds, FTAG);
2685			}
2686			zap_cursor_fini(&zc);
2687		}
2688
2689		ASSERT(!dsl_prop_hascb(ds));
2690	}
2691
2692#if defined(__FreeBSD__) && defined(_KERNEL)
2693	mutex_exit(&spa_namespace_lock);
2694
2695	kmem_free(newname, MAXPATHLEN);
2696	kmem_free(oldname, MAXPATHLEN);
2697#endif
2698	/*
2699	 * Change space accounting.
2700	 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
2701	 * both be valid, or both be 0 (resulting in delta == 0).  This
2702	 * is true for each of {clone,origin} independently.
2703	 */
2704
2705	delta = ddpa->cloneusedsnap -
2706	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
2707	ASSERT3S(delta, >=, 0);
2708	ASSERT3U(ddpa->used, >=, delta);
2709	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
2710	dsl_dir_diduse_space(dd, DD_USED_HEAD,
2711	    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
2712
2713	delta = ddpa->originusedsnap -
2714	    dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
2715	ASSERT3S(delta, <=, 0);
2716	ASSERT3U(ddpa->used, >=, -delta);
2717	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
2718	dsl_dir_diduse_space(odd, DD_USED_HEAD,
2719	    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
2720
2721	dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
2722
2723	/* log history record */
2724	spa_history_log_internal_ds(hds, "promote", tx, "");
2725
2726	dsl_dir_rele(odd, FTAG);
2727	promote_rele(ddpa, FTAG);
2728}
2729
2730/*
2731 * Make a list of dsl_dataset_t's for the snapshots between first_obj
2732 * (exclusive) and last_obj (inclusive).  The list will be in reverse
2733 * order (last_obj will be the list_head()).  If first_obj == 0, do all
2734 * snapshots back to this dataset's origin.
2735 */
2736static int
2737snaplist_make(dsl_pool_t *dp,
2738    uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
2739{
2740	uint64_t obj = last_obj;
2741
2742	list_create(l, sizeof (struct promotenode),
2743	    offsetof(struct promotenode, link));
2744
2745	while (obj != first_obj) {
2746		dsl_dataset_t *ds;
2747		struct promotenode *snap;
2748		int err;
2749
2750		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
2751		ASSERT(err != ENOENT);
2752		if (err != 0)
2753			return (err);
2754
2755		if (first_obj == 0)
2756			first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
2757
2758		snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
2759		snap->ds = ds;
2760		list_insert_tail(l, snap);
2761		obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
2762	}
2763
2764	return (0);
2765}
2766
2767static int
2768snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2769{
2770	struct promotenode *snap;
2771
2772	*spacep = 0;
2773	for (snap = list_head(l); snap; snap = list_next(l, snap)) {
2774		uint64_t used, comp, uncomp;
2775		dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2776		    mintxg, UINT64_MAX, &used, &comp, &uncomp);
2777		*spacep += used;
2778	}
2779	return (0);
2780}
2781
2782static void
2783snaplist_destroy(list_t *l, void *tag)
2784{
2785	struct promotenode *snap;
2786
2787	if (l == NULL || !list_link_active(&l->list_head))
2788		return;
2789
2790	while ((snap = list_tail(l)) != NULL) {
2791		list_remove(l, snap);
2792		dsl_dataset_rele(snap->ds, tag);
2793		kmem_free(snap, sizeof (*snap));
2794	}
2795	list_destroy(l);
2796}
2797
2798static int
2799promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
2800{
2801	int error;
2802	dsl_dir_t *dd;
2803	struct promotenode *snap;
2804
2805	error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
2806	    &ddpa->ddpa_clone);
2807	if (error != 0)
2808		return (error);
2809	dd = ddpa->ddpa_clone->ds_dir;
2810
2811	if (ddpa->ddpa_clone->ds_is_snapshot ||
2812	    !dsl_dir_is_clone(dd)) {
2813		dsl_dataset_rele(ddpa->ddpa_clone, tag);
2814		return (SET_ERROR(EINVAL));
2815	}
2816
2817	error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
2818	    &ddpa->shared_snaps, tag);
2819	if (error != 0)
2820		goto out;
2821
2822	error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
2823	    &ddpa->clone_snaps, tag);
2824	if (error != 0)
2825		goto out;
2826
2827	snap = list_head(&ddpa->shared_snaps);
2828	ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
2829	error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
2830	    dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
2831	    &ddpa->origin_snaps, tag);
2832	if (error != 0)
2833		goto out;
2834
2835	if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
2836		error = dsl_dataset_hold_obj(dp,
2837		    dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
2838		    tag, &ddpa->origin_origin);
2839		if (error != 0)
2840			goto out;
2841	}
2842out:
2843	if (error != 0)
2844		promote_rele(ddpa, tag);
2845	return (error);
2846}
2847
2848static void
2849promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
2850{
2851	snaplist_destroy(&ddpa->shared_snaps, tag);
2852	snaplist_destroy(&ddpa->clone_snaps, tag);
2853	snaplist_destroy(&ddpa->origin_snaps, tag);
2854	if (ddpa->origin_origin != NULL)
2855		dsl_dataset_rele(ddpa->origin_origin, tag);
2856	dsl_dataset_rele(ddpa->ddpa_clone, tag);
2857}
2858
2859/*
2860 * Promote a clone.
2861 *
2862 * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
2863 * in with the name.  (It must be at least MAXNAMELEN bytes long.)
2864 */
2865int
2866dsl_dataset_promote(const char *name, char *conflsnap)
2867{
2868	dsl_dataset_promote_arg_t ddpa = { 0 };
2869	uint64_t numsnaps;
2870	int error;
2871	objset_t *os;
2872
2873	/*
2874	 * We will modify space proportional to the number of
2875	 * snapshots.  Compute numsnaps.
2876	 */
2877	error = dmu_objset_hold(name, FTAG, &os);
2878	if (error != 0)
2879		return (error);
2880	error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
2881	    dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
2882	    &numsnaps);
2883	dmu_objset_rele(os, FTAG);
2884	if (error != 0)
2885		return (error);
2886
2887	ddpa.ddpa_clonename = name;
2888	ddpa.err_ds = conflsnap;
2889	ddpa.cr = CRED();
2890
2891	return (dsl_sync_task(name, dsl_dataset_promote_check,
2892	    dsl_dataset_promote_sync, &ddpa,
2893	    2 + numsnaps, ZFS_SPACE_CHECK_RESERVED));
2894}
2895
2896int
2897dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
2898    dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
2899{
2900	int64_t unused_refres_delta;
2901
2902	/* they should both be heads */
2903	if (clone->ds_is_snapshot ||
2904	    origin_head->ds_is_snapshot)
2905		return (SET_ERROR(EINVAL));
2906
2907	/* if we are not forcing, the branch point should be just before them */
2908	if (!force && clone->ds_prev != origin_head->ds_prev)
2909		return (SET_ERROR(EINVAL));
2910
2911	/* clone should be the clone (unless they are unrelated) */
2912	if (clone->ds_prev != NULL &&
2913	    clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
2914	    origin_head->ds_dir != clone->ds_prev->ds_dir)
2915		return (SET_ERROR(EINVAL));
2916
2917	/* the clone should be a child of the origin */
2918	if (clone->ds_dir->dd_parent != origin_head->ds_dir)
2919		return (SET_ERROR(EINVAL));
2920
2921	/* origin_head shouldn't be modified unless 'force' */
2922	if (!force &&
2923	    dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
2924		return (SET_ERROR(ETXTBSY));
2925
2926	/* origin_head should have no long holds (e.g. is not mounted) */
2927	if (dsl_dataset_handoff_check(origin_head, owner, tx))
2928		return (SET_ERROR(EBUSY));
2929
2930	/* check amount of any unconsumed refreservation */
2931	unused_refres_delta =
2932	    (int64_t)MIN(origin_head->ds_reserved,
2933	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
2934	    (int64_t)MIN(origin_head->ds_reserved,
2935	    dsl_dataset_phys(clone)->ds_unique_bytes);
2936
2937	if (unused_refres_delta > 0 &&
2938	    unused_refres_delta >
2939	    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
2940		return (SET_ERROR(ENOSPC));
2941
2942	/* clone can't be over the head's refquota */
2943	if (origin_head->ds_quota != 0 &&
2944	    dsl_dataset_phys(clone)->ds_referenced_bytes >
2945	    origin_head->ds_quota)
2946		return (SET_ERROR(EDQUOT));
2947
2948	return (0);
2949}
2950
2951void
2952dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
2953    dsl_dataset_t *origin_head, dmu_tx_t *tx)
2954{
2955	dsl_pool_t *dp = dmu_tx_pool(tx);
2956	int64_t unused_refres_delta;
2957
2958	ASSERT(clone->ds_reserved == 0);
2959	ASSERT(origin_head->ds_quota == 0 ||
2960	    dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota);
2961	ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
2962
2963	/*
2964	 * Swap per-dataset feature flags.
2965	 */
2966	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
2967		if (!(spa_feature_table[f].fi_flags &
2968		    ZFEATURE_FLAG_PER_DATASET)) {
2969			ASSERT(!clone->ds_feature_inuse[f]);
2970			ASSERT(!origin_head->ds_feature_inuse[f]);
2971			continue;
2972		}
2973
2974		boolean_t clone_inuse = clone->ds_feature_inuse[f];
2975		boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f];
2976
2977		if (clone_inuse) {
2978			dsl_dataset_deactivate_feature(clone->ds_object, f, tx);
2979			clone->ds_feature_inuse[f] = B_FALSE;
2980		}
2981		if (origin_head_inuse) {
2982			dsl_dataset_deactivate_feature(origin_head->ds_object,
2983			    f, tx);
2984			origin_head->ds_feature_inuse[f] = B_FALSE;
2985		}
2986		if (clone_inuse) {
2987			dsl_dataset_activate_feature(origin_head->ds_object,
2988			    f, tx);
2989			origin_head->ds_feature_inuse[f] = B_TRUE;
2990		}
2991		if (origin_head_inuse) {
2992			dsl_dataset_activate_feature(clone->ds_object, f, tx);
2993			clone->ds_feature_inuse[f] = B_TRUE;
2994		}
2995	}
2996
2997	dmu_buf_will_dirty(clone->ds_dbuf, tx);
2998	dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
2999
3000	if (clone->ds_objset != NULL) {
3001		dmu_objset_evict(clone->ds_objset);
3002		clone->ds_objset = NULL;
3003	}
3004
3005	if (origin_head->ds_objset != NULL) {
3006		dmu_objset_evict(origin_head->ds_objset);
3007		origin_head->ds_objset = NULL;
3008	}
3009
3010	unused_refres_delta =
3011	    (int64_t)MIN(origin_head->ds_reserved,
3012	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
3013	    (int64_t)MIN(origin_head->ds_reserved,
3014	    dsl_dataset_phys(clone)->ds_unique_bytes);
3015
3016	/*
3017	 * Reset origin's unique bytes, if it exists.
3018	 */
3019	if (clone->ds_prev) {
3020		dsl_dataset_t *origin = clone->ds_prev;
3021		uint64_t comp, uncomp;
3022
3023		dmu_buf_will_dirty(origin->ds_dbuf, tx);
3024		dsl_deadlist_space_range(&clone->ds_deadlist,
3025		    dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
3026		    &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
3027	}
3028
3029	/* swap blkptrs */
3030	{
3031		blkptr_t tmp;
3032		tmp = dsl_dataset_phys(origin_head)->ds_bp;
3033		dsl_dataset_phys(origin_head)->ds_bp =
3034		    dsl_dataset_phys(clone)->ds_bp;
3035		dsl_dataset_phys(clone)->ds_bp = tmp;
3036	}
3037
3038	/* set dd_*_bytes */
3039	{
3040		int64_t dused, dcomp, duncomp;
3041		uint64_t cdl_used, cdl_comp, cdl_uncomp;
3042		uint64_t odl_used, odl_comp, odl_uncomp;
3043
3044		ASSERT3U(dsl_dir_phys(clone->ds_dir)->
3045		    dd_used_breakdown[DD_USED_SNAP], ==, 0);
3046
3047		dsl_deadlist_space(&clone->ds_deadlist,
3048		    &cdl_used, &cdl_comp, &cdl_uncomp);
3049		dsl_deadlist_space(&origin_head->ds_deadlist,
3050		    &odl_used, &odl_comp, &odl_uncomp);
3051
3052		dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
3053		    cdl_used -
3054		    (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
3055		    odl_used);
3056		dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
3057		    cdl_comp -
3058		    (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
3059		    odl_comp);
3060		duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
3061		    cdl_uncomp -
3062		    (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
3063		    odl_uncomp);
3064
3065		dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
3066		    dused, dcomp, duncomp, tx);
3067		dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
3068		    -dused, -dcomp, -duncomp, tx);
3069
3070		/*
3071		 * The difference in the space used by snapshots is the
3072		 * difference in snapshot space due to the head's
3073		 * deadlist (since that's the only thing that's
3074		 * changing that affects the snapused).
3075		 */
3076		dsl_deadlist_space_range(&clone->ds_deadlist,
3077		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
3078		    &cdl_used, &cdl_comp, &cdl_uncomp);
3079		dsl_deadlist_space_range(&origin_head->ds_deadlist,
3080		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
3081		    &odl_used, &odl_comp, &odl_uncomp);
3082		dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
3083		    DD_USED_HEAD, DD_USED_SNAP, NULL);
3084	}
3085
3086	/* swap ds_*_bytes */
3087	SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
3088	    dsl_dataset_phys(clone)->ds_referenced_bytes);
3089	SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
3090	    dsl_dataset_phys(clone)->ds_compressed_bytes);
3091	SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
3092	    dsl_dataset_phys(clone)->ds_uncompressed_bytes);
3093	SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
3094	    dsl_dataset_phys(clone)->ds_unique_bytes);
3095
3096	/* apply any parent delta for change in unconsumed refreservation */
3097	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
3098	    unused_refres_delta, 0, 0, tx);
3099
3100	/*
3101	 * Swap deadlists.
3102	 */
3103	dsl_deadlist_close(&clone->ds_deadlist);
3104	dsl_deadlist_close(&origin_head->ds_deadlist);
3105	SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
3106	    dsl_dataset_phys(clone)->ds_deadlist_obj);
3107	dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
3108	    dsl_dataset_phys(clone)->ds_deadlist_obj);
3109	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
3110	    dsl_dataset_phys(origin_head)->ds_deadlist_obj);
3111
3112	dsl_scan_ds_clone_swapped(origin_head, clone, tx);
3113
3114	spa_history_log_internal_ds(clone, "clone swap", tx,
3115	    "parent=%s", origin_head->ds_dir->dd_myname);
3116}
3117
3118/*
3119 * Given a pool name and a dataset object number in that pool,
3120 * return the name of that dataset.
3121 */
3122int
3123dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
3124{
3125	dsl_pool_t *dp;
3126	dsl_dataset_t *ds;
3127	int error;
3128
3129	error = dsl_pool_hold(pname, FTAG, &dp);
3130	if (error != 0)
3131		return (error);
3132
3133	error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
3134	if (error == 0) {
3135		dsl_dataset_name(ds, buf);
3136		dsl_dataset_rele(ds, FTAG);
3137	}
3138	dsl_pool_rele(dp, FTAG);
3139
3140	return (error);
3141}
3142
3143int
3144dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
3145    uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
3146{
3147	int error = 0;
3148
3149	ASSERT3S(asize, >, 0);
3150
3151	/*
3152	 * *ref_rsrv is the portion of asize that will come from any
3153	 * unconsumed refreservation space.
3154	 */
3155	*ref_rsrv = 0;
3156
3157	mutex_enter(&ds->ds_lock);
3158	/*
3159	 * Make a space adjustment for reserved bytes.
3160	 */
3161	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
3162		ASSERT3U(*used, >=,
3163		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
3164		*used -=
3165		    (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
3166		*ref_rsrv =
3167		    asize - MIN(asize, parent_delta(ds, asize + inflight));
3168	}
3169
3170	if (!check_quota || ds->ds_quota == 0) {
3171		mutex_exit(&ds->ds_lock);
3172		return (0);
3173	}
3174	/*
3175	 * If they are requesting more space, and our current estimate
3176	 * is over quota, they get to try again unless the actual
3177	 * on-disk is over quota and there are no pending changes (which
3178	 * may free up space for us).
3179	 */
3180	if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
3181	    ds->ds_quota) {
3182		if (inflight > 0 ||
3183		    dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
3184			error = SET_ERROR(ERESTART);
3185		else
3186			error = SET_ERROR(EDQUOT);
3187	}
3188	mutex_exit(&ds->ds_lock);
3189
3190	return (error);
3191}
3192
3193typedef struct dsl_dataset_set_qr_arg {
3194	const char *ddsqra_name;
3195	zprop_source_t ddsqra_source;
3196	uint64_t ddsqra_value;
3197} dsl_dataset_set_qr_arg_t;
3198
3199
3200/* ARGSUSED */
3201static int
3202dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
3203{
3204	dsl_dataset_set_qr_arg_t *ddsqra = arg;
3205	dsl_pool_t *dp = dmu_tx_pool(tx);
3206	dsl_dataset_t *ds;
3207	int error;
3208	uint64_t newval;
3209
3210	if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
3211		return (SET_ERROR(ENOTSUP));
3212
3213	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
3214	if (error != 0)
3215		return (error);
3216
3217	if (ds->ds_is_snapshot) {
3218		dsl_dataset_rele(ds, FTAG);
3219		return (SET_ERROR(EINVAL));
3220	}
3221
3222	error = dsl_prop_predict(ds->ds_dir,
3223	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
3224	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
3225	if (error != 0) {
3226		dsl_dataset_rele(ds, FTAG);
3227		return (error);
3228	}
3229
3230	if (newval == 0) {
3231		dsl_dataset_rele(ds, FTAG);
3232		return (0);
3233	}
3234
3235	if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
3236	    newval < ds->ds_reserved) {
3237		dsl_dataset_rele(ds, FTAG);
3238		return (SET_ERROR(ENOSPC));
3239	}
3240
3241	dsl_dataset_rele(ds, FTAG);
3242	return (0);
3243}
3244
3245static void
3246dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
3247{
3248	dsl_dataset_set_qr_arg_t *ddsqra = arg;
3249	dsl_pool_t *dp = dmu_tx_pool(tx);
3250	dsl_dataset_t *ds;
3251	uint64_t newval;
3252
3253	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
3254
3255	dsl_prop_set_sync_impl(ds,
3256	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
3257	    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
3258	    &ddsqra->ddsqra_value, tx);
3259
3260	VERIFY0(dsl_prop_get_int_ds(ds,
3261	    zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
3262
3263	if (ds->ds_quota != newval) {
3264		dmu_buf_will_dirty(ds->ds_dbuf, tx);
3265		ds->ds_quota = newval;
3266	}
3267	dsl_dataset_rele(ds, FTAG);
3268}
3269
3270int
3271dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
3272    uint64_t refquota)
3273{
3274	dsl_dataset_set_qr_arg_t ddsqra;
3275
3276	ddsqra.ddsqra_name = dsname;
3277	ddsqra.ddsqra_source = source;
3278	ddsqra.ddsqra_value = refquota;
3279
3280	return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
3281	    dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
3282}
3283
3284static int
3285dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
3286{
3287	dsl_dataset_set_qr_arg_t *ddsqra = arg;
3288	dsl_pool_t *dp = dmu_tx_pool(tx);
3289	dsl_dataset_t *ds;
3290	int error;
3291	uint64_t newval, unique;
3292
3293	if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
3294		return (SET_ERROR(ENOTSUP));
3295
3296	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
3297	if (error != 0)
3298		return (error);
3299
3300	if (ds->ds_is_snapshot) {
3301		dsl_dataset_rele(ds, FTAG);
3302		return (SET_ERROR(EINVAL));
3303	}
3304
3305	error = dsl_prop_predict(ds->ds_dir,
3306	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
3307	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
3308	if (error != 0) {
3309		dsl_dataset_rele(ds, FTAG);
3310		return (error);
3311	}
3312
3313	/*
3314	 * If we are doing the preliminary check in open context, the
3315	 * space estimates may be inaccurate.
3316	 */
3317	if (!dmu_tx_is_syncing(tx)) {
3318		dsl_dataset_rele(ds, FTAG);
3319		return (0);
3320	}
3321
3322	mutex_enter(&ds->ds_lock);
3323	if (!DS_UNIQUE_IS_ACCURATE(ds))
3324		dsl_dataset_recalc_head_uniq(ds);
3325	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
3326	mutex_exit(&ds->ds_lock);
3327
3328	if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
3329		uint64_t delta = MAX(unique, newval) -
3330		    MAX(unique, ds->ds_reserved);
3331
3332		if (delta >
3333		    dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
3334		    (ds->ds_quota > 0 && newval > ds->ds_quota)) {
3335			dsl_dataset_rele(ds, FTAG);
3336			return (SET_ERROR(ENOSPC));
3337		}
3338	}
3339
3340	dsl_dataset_rele(ds, FTAG);
3341	return (0);
3342}
3343
3344void
3345dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
3346    zprop_source_t source, uint64_t value, dmu_tx_t *tx)
3347{
3348	uint64_t newval;
3349	uint64_t unique;
3350	int64_t delta;
3351
3352	dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
3353	    source, sizeof (value), 1, &value, tx);
3354
3355	VERIFY0(dsl_prop_get_int_ds(ds,
3356	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
3357
3358	dmu_buf_will_dirty(ds->ds_dbuf, tx);
3359	mutex_enter(&ds->ds_dir->dd_lock);
3360	mutex_enter(&ds->ds_lock);
3361	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
3362	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
3363	delta = MAX(0, (int64_t)(newval - unique)) -
3364	    MAX(0, (int64_t)(ds->ds_reserved - unique));
3365	ds->ds_reserved = newval;
3366	mutex_exit(&ds->ds_lock);
3367
3368	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
3369	mutex_exit(&ds->ds_dir->dd_lock);
3370}
3371
3372static void
3373dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
3374{
3375	dsl_dataset_set_qr_arg_t *ddsqra = arg;
3376	dsl_pool_t *dp = dmu_tx_pool(tx);
3377	dsl_dataset_t *ds;
3378
3379	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
3380	dsl_dataset_set_refreservation_sync_impl(ds,
3381	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
3382	dsl_dataset_rele(ds, FTAG);
3383}
3384
3385int
3386dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
3387    uint64_t refreservation)
3388{
3389	dsl_dataset_set_qr_arg_t ddsqra;
3390
3391	ddsqra.ddsqra_name = dsname;
3392	ddsqra.ddsqra_source = source;
3393	ddsqra.ddsqra_value = refreservation;
3394
3395	return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
3396	    dsl_dataset_set_refreservation_sync, &ddsqra,
3397	    0, ZFS_SPACE_CHECK_NONE));
3398}
3399
3400/*
3401 * Return (in *usedp) the amount of space written in new that is not
3402 * present in oldsnap.  New may be a snapshot or the head.  Old must be
3403 * a snapshot before new, in new's filesystem (or its origin).  If not then
3404 * fail and return EINVAL.
3405 *
3406 * The written space is calculated by considering two components:  First, we
3407 * ignore any freed space, and calculate the written as new's used space
3408 * minus old's used space.  Next, we add in the amount of space that was freed
3409 * between the two snapshots, thus reducing new's used space relative to old's.
3410 * Specifically, this is the space that was born before old->ds_creation_txg,
3411 * and freed before new (ie. on new's deadlist or a previous deadlist).
3412 *
3413 * space freed                         [---------------------]
3414 * snapshots                       ---O-------O--------O-------O------
3415 *                                         oldsnap            new
3416 */
3417int
3418dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
3419    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
3420{
3421	int err = 0;
3422	uint64_t snapobj;
3423	dsl_pool_t *dp = new->ds_dir->dd_pool;
3424
3425	ASSERT(dsl_pool_config_held(dp));
3426
3427	*usedp = 0;
3428	*usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
3429	*usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
3430
3431	*compp = 0;
3432	*compp += dsl_dataset_phys(new)->ds_compressed_bytes;
3433	*compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
3434
3435	*uncompp = 0;
3436	*uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
3437	*uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
3438
3439	snapobj = new->ds_object;
3440	while (snapobj != oldsnap->ds_object) {
3441		dsl_dataset_t *snap;
3442		uint64_t used, comp, uncomp;
3443
3444		if (snapobj == new->ds_object) {
3445			snap = new;
3446		} else {
3447			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
3448			if (err != 0)
3449				break;
3450		}
3451
3452		if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
3453		    dsl_dataset_phys(oldsnap)->ds_creation_txg) {
3454			/*
3455			 * The blocks in the deadlist can not be born after
3456			 * ds_prev_snap_txg, so get the whole deadlist space,
3457			 * which is more efficient (especially for old-format
3458			 * deadlists).  Unfortunately the deadlist code
3459			 * doesn't have enough information to make this
3460			 * optimization itself.
3461			 */
3462			dsl_deadlist_space(&snap->ds_deadlist,
3463			    &used, &comp, &uncomp);
3464		} else {
3465			dsl_deadlist_space_range(&snap->ds_deadlist,
3466			    0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
3467			    &used, &comp, &uncomp);
3468		}
3469		*usedp += used;
3470		*compp += comp;
3471		*uncompp += uncomp;
3472
3473		/*
3474		 * If we get to the beginning of the chain of snapshots
3475		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
3476		 * was not a snapshot of/before new.
3477		 */
3478		snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
3479		if (snap != new)
3480			dsl_dataset_rele(snap, FTAG);
3481		if (snapobj == 0) {
3482			err = SET_ERROR(EINVAL);
3483			break;
3484		}
3485
3486	}
3487	return (err);
3488}
3489
3490/*
3491 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
3492 * lastsnap, and all snapshots in between are deleted.
3493 *
3494 * blocks that would be freed            [---------------------------]
3495 * snapshots                       ---O-------O--------O-------O--------O
3496 *                                        firstsnap        lastsnap
3497 *
3498 * This is the set of blocks that were born after the snap before firstsnap,
3499 * (birth > firstsnap->prev_snap_txg) and died before the snap after the
3500 * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
3501 * We calculate this by iterating over the relevant deadlists (from the snap
3502 * after lastsnap, backward to the snap after firstsnap), summing up the
3503 * space on the deadlist that was born after the snap before firstsnap.
3504 */
3505int
3506dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
3507    dsl_dataset_t *lastsnap,
3508    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
3509{
3510	int err = 0;
3511	uint64_t snapobj;
3512	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
3513
3514	ASSERT(firstsnap->ds_is_snapshot);
3515	ASSERT(lastsnap->ds_is_snapshot);
3516
3517	/*
3518	 * Check that the snapshots are in the same dsl_dir, and firstsnap
3519	 * is before lastsnap.
3520	 */
3521	if (firstsnap->ds_dir != lastsnap->ds_dir ||
3522	    dsl_dataset_phys(firstsnap)->ds_creation_txg >
3523	    dsl_dataset_phys(lastsnap)->ds_creation_txg)
3524		return (SET_ERROR(EINVAL));
3525
3526	*usedp = *compp = *uncompp = 0;
3527
3528	snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
3529	while (snapobj != firstsnap->ds_object) {
3530		dsl_dataset_t *ds;
3531		uint64_t used, comp, uncomp;
3532
3533		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
3534		if (err != 0)
3535			break;
3536
3537		dsl_deadlist_space_range(&ds->ds_deadlist,
3538		    dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
3539		    &used, &comp, &uncomp);
3540		*usedp += used;
3541		*compp += comp;
3542		*uncompp += uncomp;
3543
3544		snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
3545		ASSERT3U(snapobj, !=, 0);
3546		dsl_dataset_rele(ds, FTAG);
3547	}
3548	return (err);
3549}
3550
3551/*
3552 * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
3553 * For example, they could both be snapshots of the same filesystem, and
3554 * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
3555 * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
3556 * filesystem.  Or 'earlier' could be the origin's origin.
3557 *
3558 * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
3559 */
3560boolean_t
3561dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
3562    uint64_t earlier_txg)
3563{
3564	dsl_pool_t *dp = later->ds_dir->dd_pool;
3565	int error;
3566	boolean_t ret;
3567
3568	ASSERT(dsl_pool_config_held(dp));
3569	ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
3570
3571	if (earlier_txg == 0)
3572		earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
3573
3574	if (later->ds_is_snapshot &&
3575	    earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
3576		return (B_FALSE);
3577
3578	if (later->ds_dir == earlier->ds_dir)
3579		return (B_TRUE);
3580	if (!dsl_dir_is_clone(later->ds_dir))
3581		return (B_FALSE);
3582
3583	if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
3584		return (B_TRUE);
3585	dsl_dataset_t *origin;
3586	error = dsl_dataset_hold_obj(dp,
3587	    dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
3588	if (error != 0)
3589		return (B_FALSE);
3590	ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
3591	dsl_dataset_rele(origin, FTAG);
3592	return (ret);
3593}
3594
3595void
3596dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
3597{
3598	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3599	dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
3600}
3601
3602boolean_t
3603dsl_dataset_is_zapified(dsl_dataset_t *ds)
3604{
3605	dmu_object_info_t doi;
3606
3607	dmu_object_info_from_db(ds->ds_dbuf, &doi);
3608	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
3609}
3610
3611boolean_t
3612dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
3613{
3614	return (dsl_dataset_is_zapified(ds) &&
3615	    zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
3616	    ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
3617}
3618