dmu_objset.c revision 288542
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
27 */
28
29/* Portions Copyright 2010 Robert Milkowski */
30
31#include <sys/cred.h>
32#include <sys/zfs_context.h>
33#include <sys/dmu_objset.h>
34#include <sys/dsl_dir.h>
35#include <sys/dsl_dataset.h>
36#include <sys/dsl_prop.h>
37#include <sys/dsl_pool.h>
38#include <sys/dsl_synctask.h>
39#include <sys/dsl_deleg.h>
40#include <sys/dnode.h>
41#include <sys/dbuf.h>
42#include <sys/zvol.h>
43#include <sys/dmu_tx.h>
44#include <sys/zap.h>
45#include <sys/zil.h>
46#include <sys/dmu_impl.h>
47#include <sys/zfs_ioctl.h>
48#include <sys/sa.h>
49#include <sys/zfs_onexit.h>
50#include <sys/dsl_destroy.h>
51
52/*
53 * Needed to close a window in dnode_move() that allows the objset to be freed
54 * before it can be safely accessed.
55 */
56krwlock_t os_lock;
57
/*
 * Initialize the global os_lock (see comment above its declaration);
 * called once at DMU module load.
 */
void
dmu_objset_init(void)
{
	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
}
63
/*
 * Tear down the global os_lock; called once at DMU module unload.
 */
void
dmu_objset_fini(void)
{
	rw_destroy(&os_lock);
}
69
/* Accessor: the spa this objset belongs to (cached at open time). */
spa_t *
dmu_objset_spa(objset_t *os)
{
	return (os->os_spa);
}
75
/* Accessor: the intent log allocated for this objset. */
zilog_t *
dmu_objset_zil(objset_t *os)
{
	return (os->os_zil);
}
81
82dsl_pool_t *
83dmu_objset_pool(objset_t *os)
84{
85	dsl_dataset_t *ds;
86
87	if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
88		return (ds->ds_dir->dd_pool);
89	else
90		return (spa_get_dsl(os->os_spa));
91}
92
/* Accessor: the dataset this objset belongs to (NULL for the MOS). */
dsl_dataset_t *
dmu_objset_ds(objset_t *os)
{
	return (os->os_dsl_dataset);
}
98
/* Accessor: the on-disk objset type (ZFS, ZVOL, etc.). */
dmu_objset_type_t
dmu_objset_type(objset_t *os)
{
	return (os->os_phys->os_type);
}
104
/*
 * Copy the name of the owning dataset into 'buf'.
 * NOTE(review): buf sizing follows dsl_dataset_name()'s contract
 * (callers elsewhere use MAXNAMELEN) — confirm against that function.
 */
void
dmu_objset_name(objset_t *os, char *buf)
{
	dsl_dataset_name(os->os_dsl_dataset, buf);
}
110
111uint64_t
112dmu_objset_id(objset_t *os)
113{
114	dsl_dataset_t *ds = os->os_dsl_dataset;
115
116	return (ds ? ds->ds_object : 0);
117}
118
/* Accessor: the cached value of the "sync" property. */
zfs_sync_type_t
dmu_objset_syncprop(objset_t *os)
{
	return (os->os_sync);
}
124
/* Accessor: the cached value of the "logbias" property. */
zfs_logbias_op_t
dmu_objset_logbias(objset_t *os)
{
	return (os->os_logbias);
}
130
/*
 * dsl_prop callback: the "checksum" property changed; cache the
 * resolved algorithm in the objset.
 */
static void
checksum_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance should have been done by now.
	 */
	ASSERT(newval != ZIO_CHECKSUM_INHERIT);

	os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
}
143
/*
 * dsl_prop callback: the "compression" property changed; cache the
 * resolved algorithm in the objset.
 */
static void
compression_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval != ZIO_COMPRESS_INHERIT);

	os->os_compress = zio_compress_select(os->os_spa, newval,
	    ZIO_COMPRESS_ON);
}
157
/*
 * dsl_prop callback: the "copies" property changed; cache the new
 * replication count in the objset.
 */
static void
copies_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval > 0);
	ASSERT(newval <= spa_max_replication(os->os_spa));

	os->os_copies = newval;
}
171
172static void
173dedup_changed_cb(void *arg, uint64_t newval)
174{
175	objset_t *os = arg;
176	spa_t *spa = os->os_spa;
177	enum zio_checksum checksum;
178
179	/*
180	 * Inheritance should have been done by now.
181	 */
182	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
183
184	checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
185
186	os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
187	os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
188}
189
/*
 * dsl_prop callback: the "primarycache" property changed; cache the
 * new ARC caching policy in the objset.
 */
static void
primary_cache_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
	    newval == ZFS_CACHE_METADATA);

	os->os_primary_cache = newval;
}
203
/*
 * dsl_prop callback: the "secondarycache" property changed; cache the
 * new L2ARC caching policy in the objset.
 */
static void
secondary_cache_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
	    newval == ZFS_CACHE_METADATA);

	os->os_secondary_cache = newval;
}
217
218static void
219sync_changed_cb(void *arg, uint64_t newval)
220{
221	objset_t *os = arg;
222
223	/*
224	 * Inheritance and range checking should have been done by now.
225	 */
226	ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
227	    newval == ZFS_SYNC_DISABLED);
228
229	os->os_sync = newval;
230	if (os->os_zil)
231		zil_set_sync(os->os_zil, newval);
232}
233
/*
 * dsl_prop callback: the "redundant_metadata" property changed; cache
 * the new value in the objset.
 */
static void
redundant_metadata_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
	    newval == ZFS_REDUNDANT_METADATA_MOST);

	os->os_redundant_metadata = newval;
}
247
248static void
249logbias_changed_cb(void *arg, uint64_t newval)
250{
251	objset_t *os = arg;
252
253	ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
254	    newval == ZFS_LOGBIAS_THROUGHPUT);
255	os->os_logbias = newval;
256	if (os->os_zil)
257		zil_set_logbias(os->os_zil, newval);
258}
259
/*
 * dsl_prop callback: the "recordsize" property changed; cache the new
 * value in the objset.
 */
static void
recordsize_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	os->os_recordsize = newval;
}
267
/*
 * Byteswap an objset_phys_t in place.  'size' distinguishes the old
 * (pre-userspace-accounting) phys layout from the current one; only
 * the full-size layout carries the user/group-used special dnodes.
 */
void
dmu_objset_byteswap(void *buf, size_t size)
{
	objset_phys_t *osp = buf;

	ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
	dnode_byteswap(&osp->os_meta_dnode);
	byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
	osp->os_type = BSWAP_64(osp->os_type);
	osp->os_flags = BSWAP_64(osp->os_flags);
	if (size == sizeof (objset_phys_t)) {
		/* Current layout: also swap the accounting dnodes. */
		dnode_byteswap(&osp->os_userused_dnode);
		dnode_byteswap(&osp->os_groupused_dnode);
	}
}
283
/*
 * Instantiate the in-core objset for the given dataset: read the
 * objset_phys_t pointed to by 'bp' into the ARC (or allocate a zeroed
 * one if 'bp' is a hole), register property callbacks, and open the
 * special dnodes.  'ds' is NULL for the meta-objset, in which case
 * defaults are used instead of dataset properties.  The caller must
 * hold ds->ds_opening_lock.  On success *osp is set and 0 returned;
 * on failure the partially constructed objset is freed and an errno
 * returned.
 */
int
dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    objset_t **osp)
{
	objset_t *os;
	int i, err;

	ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));

	os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
	os->os_dsl_dataset = ds;
	os->os_spa = spa;
	os->os_rootbp = bp;
	if (!BP_IS_HOLE(os->os_rootbp)) {
		arc_flags_t aflags = ARC_FLAG_WAIT;
		zbookmark_phys_t zb;
		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

		if (DMU_OS_IS_L2CACHEABLE(os))
			aflags |= ARC_FLAG_L2CACHE;
		if (DMU_OS_IS_L2COMPRESSIBLE(os))
			aflags |= ARC_FLAG_L2COMPRESS;

		dprintf_bp(os->os_rootbp, "reading %s", "");
		/* Synchronous read of the objset's root block. */
		err = arc_read(NULL, spa, os->os_rootbp,
		    arc_getbuf_func, &os->os_phys_buf,
		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
		if (err != 0) {
			kmem_free(os, sizeof (objset_t));
			/* convert checksum errors into IO errors */
			if (err == ECKSUM)
				err = SET_ERROR(EIO);
			return (err);
		}

		/* Increase the blocksize if we are permitted. */
		if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
		    arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
			/*
			 * The on-disk phys predates user accounting;
			 * copy it into a full-size, zero-filled buffer.
			 */
			arc_buf_t *buf = arc_buf_alloc(spa,
			    sizeof (objset_phys_t), &os->os_phys_buf,
			    ARC_BUFC_METADATA);
			bzero(buf->b_data, sizeof (objset_phys_t));
			bcopy(os->os_phys_buf->b_data, buf->b_data,
			    arc_buf_size(os->os_phys_buf));
			(void) arc_buf_remove_ref(os->os_phys_buf,
			    &os->os_phys_buf);
			os->os_phys_buf = buf;
		}

		os->os_phys = os->os_phys_buf->b_data;
		os->os_flags = os->os_phys->os_flags;
	} else {
		/* No root block yet: start from a zeroed phys. */
		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
		    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
		os->os_phys_buf = arc_buf_alloc(spa, size,
		    &os->os_phys_buf, ARC_BUFC_METADATA);
		os->os_phys = os->os_phys_buf->b_data;
		bzero(os->os_phys, size);
	}

	/*
	 * Note: the changed_cb will be called once before the register
	 * func returns, thus changing the checksum/compression from the
	 * default (fletcher2/off).  Snapshots don't need to know about
	 * checksum/compression/copies.
	 */
	if (ds != NULL) {
		err = dsl_prop_register(ds,
		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
		    primary_cache_changed_cb, os);
		if (err == 0) {
			err = dsl_prop_register(ds,
			    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
			    secondary_cache_changed_cb, os);
		}
		if (!dsl_dataset_is_snapshot(ds)) {
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
				    checksum_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    compression_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_COPIES),
				    copies_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_DEDUP),
				    dedup_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
				    logbias_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_SYNC),
				    sync_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(
				    ZFS_PROP_REDUNDANT_METADATA),
				    redundant_metadata_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
				    recordsize_changed_cb, os);
			}
		}
		/*
		 * On any registration failure, unwind: drop the phys
		 * buffer and the objset.  Callbacks registered so far
		 * are per-ds and presumably cleaned up with the ds —
		 * NOTE(review): confirm against dsl_prop_register()'s
		 * error contract.
		 */
		if (err != 0) {
			VERIFY(arc_buf_remove_ref(os->os_phys_buf,
			    &os->os_phys_buf));
			kmem_free(os, sizeof (objset_t));
			return (err);
		}
	} else {
		/* It's the meta-objset. */
		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
		os->os_compress = ZIO_COMPRESS_ON;
		os->os_copies = spa_max_replication(spa);
		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
		os->os_dedup_verify = B_FALSE;
		os->os_logbias = ZFS_LOGBIAS_LATENCY;
		os->os_sync = ZFS_SYNC_STANDARD;
		os->os_primary_cache = ZFS_CACHE_ALL;
		os->os_secondary_cache = ZFS_CACHE_ALL;
	}

	/* Snapshots keep the zil header they were created with. */
	if (ds == NULL || !dsl_dataset_is_snapshot(ds))
		os->os_zil_header = os->os_phys->os_zil_header;
	os->os_zil = zil_alloc(os, &os->os_zil_header);

	/* Per-txg dirty and free dnode lists. */
	for (i = 0; i < TXG_SIZE; i++) {
		list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[i]));
		list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[i]));
	}
	list_create(&os->os_dnodes, sizeof (dnode_t),
	    offsetof(dnode_t, dn_link));
	list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_link));

	mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Open the special dnodes embedded in the objset_phys_t; the
	 * accounting dnodes exist only in the full-size phys layout.
	 */
	DMU_META_DNODE(os) = dnode_special_open(os,
	    &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
	    &os->os_meta_dnode);
	if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
		DMU_USERUSED_DNODE(os) = dnode_special_open(os,
		    &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
		    &os->os_userused_dnode);
		DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
		    &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
		    &os->os_groupused_dnode);
	}

	*osp = os;
	return (0);
}
456
/*
 * Return (in *osp) the objset for the given dataset, opening and
 * caching it in ds->ds_objset on first use.  Returns 0 or an errno
 * from dmu_objset_open_impl().
 */
int
dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
{
	int err = 0;

	/* ds_opening_lock serializes concurrent first opens. */
	mutex_enter(&ds->ds_opening_lock);
	if (ds->ds_objset == NULL) {
		objset_t *os;
		err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
		    ds, dsl_dataset_get_blkptr(ds), &os);

		if (err == 0) {
			/* Publish the objset under ds_lock. */
			mutex_enter(&ds->ds_lock);
			ASSERT(ds->ds_objset == NULL);
			ds->ds_objset = os;
			mutex_exit(&ds->ds_lock);
		}
	}
	*osp = ds->ds_objset;
	mutex_exit(&ds->ds_opening_lock);
	return (err);
}
479
480/*
481 * Holds the pool while the objset is held.  Therefore only one objset
482 * can be held at a time.
483 */
484int
485dmu_objset_hold(const char *name, void *tag, objset_t **osp)
486{
487	dsl_pool_t *dp;
488	dsl_dataset_t *ds;
489	int err;
490
491	err = dsl_pool_hold(name, tag, &dp);
492	if (err != 0)
493		return (err);
494	err = dsl_dataset_hold(dp, name, tag, &ds);
495	if (err != 0) {
496		dsl_pool_rele(dp, tag);
497		return (err);
498	}
499
500	err = dmu_objset_from_ds(ds, osp);
501	if (err != 0) {
502		dsl_dataset_rele(ds, tag);
503		dsl_pool_rele(dp, tag);
504	}
505
506	return (err);
507}
508
509/*
510 * dsl_pool must not be held when this is called.
511 * Upon successful return, there will be a longhold on the dataset,
512 * and the dsl_pool will not be held.
513 */
int
dmu_objset_own(const char *name, dmu_objset_type_t type,
    boolean_t readonly, void *tag, objset_t **osp)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int err;

	err = dsl_pool_hold(name, FTAG, &dp);
	if (err != 0)
		return (err);
	err = dsl_dataset_own(dp, name, tag, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	/*
	 * The longhold from dsl_dataset_own() keeps ds (and thus the
	 * objset) valid even after the pool hold is dropped below.
	 */
	err = dmu_objset_from_ds(ds, osp);
	dsl_pool_rele(dp, FTAG);
	if (err != 0) {
		dsl_dataset_disown(ds, tag);
	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
		/* Caller required a specific objset type; this isn't it. */
		dsl_dataset_disown(ds, tag);
		return (SET_ERROR(EINVAL));
	} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
		/* Snapshots may only be owned read-only. */
		dsl_dataset_disown(ds, tag);
		return (SET_ERROR(EROFS));
	}
	return (err);
}
544
/*
 * Release the holds taken by dmu_objset_hold().  The pool pointer must
 * be fetched before the dataset is released, since the dataset hold is
 * what keeps os->os_dsl_dataset (and hence the pool lookup) valid.
 */
void
dmu_objset_rele(objset_t *os, void *tag)
{
	dsl_pool_t *dp = dmu_objset_pool(os);
	dsl_dataset_rele(os->os_dsl_dataset, tag);
	dsl_pool_rele(dp, tag);
}
552
553/*
554 * When we are called, os MUST refer to an objset associated with a dataset
555 * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
556 * == tag.  We will then release and reacquire ownership of the dataset while
557 * holding the pool config_rwlock to avoid intervening namespace or ownership
558 * changes may occur.
559 *
560 * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
561 * release the hold on its dataset and acquire a new one on the dataset of the
562 * same name so that it can be partially torn down and reconstructed.
563 */
void
dmu_objset_refresh_ownership(objset_t *os, void *tag)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds, *newds;
	char name[MAXNAMELEN];

	/* Caller contract: the dataset is owned and long held by 'tag'. */
	ds = os->os_dsl_dataset;
	VERIFY3P(ds, !=, NULL);
	VERIFY3P(ds->ds_owner, ==, tag);
	VERIFY(dsl_dataset_long_held(ds));

	/* Capture the name before dropping ownership. */
	dsl_dataset_name(ds, name);
	dp = dmu_objset_pool(os);
	/*
	 * Holding the pool config lock across disown/own prevents any
	 * rename or ownership change from slipping in between.
	 */
	dsl_pool_config_enter(dp, FTAG);
	dmu_objset_disown(os, tag);
	VERIFY0(dsl_dataset_own(dp, name, tag, &newds));
	/* The re-acquired dataset must be the very same one. */
	VERIFY3P(newds, ==, os->os_dsl_dataset);
	dsl_pool_config_exit(dp, FTAG);
}
584
/* Release the ownership taken by dmu_objset_own(). */
void
dmu_objset_disown(objset_t *os, void *tag)
{
	dsl_dataset_disown(os->os_dsl_dataset, tag);
}
590
/*
 * Evict the dbufs of every dnode in this objset.  Walks os_dnodes,
 * taking a temporary hold on each dnode so it cannot disappear while
 * os_lock is dropped for the (potentially blocking) eviction call.
 */
void
dmu_objset_evict_dbufs(objset_t *os)
{
	dnode_t *dn;

	mutex_enter(&os->os_lock);

	/* process the mdn last, since the other dnodes have holds on it */
	list_remove(&os->os_dnodes, DMU_META_DNODE(os));
	list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));

	/*
	 * Find the first dnode with holds.  We have to do this dance
	 * because dnode_add_ref() only works if you already have a
	 * hold.  If there are no holds then it has no dbufs so OK to
	 * skip.
	 */
	for (dn = list_head(&os->os_dnodes);
	    dn && !dnode_add_ref(dn, FTAG);
	    dn = list_next(&os->os_dnodes, dn))
		continue;

	while (dn) {
		dnode_t *next_dn = dn;

		/* Pin the next eligible dnode before dropping os_lock. */
		do {
			next_dn = list_next(&os->os_dnodes, next_dn);
		} while (next_dn && !dnode_add_ref(next_dn, FTAG));

		mutex_exit(&os->os_lock);
		dnode_evict_dbufs(dn);
		dnode_rele(dn, FTAG);
		mutex_enter(&os->os_lock);
		dn = next_dn;
	}
	mutex_exit(&os->os_lock);
}
628
/*
 * Tear down an objset that is no longer referenced: unregister the
 * property callbacks registered at open, evict all dbufs, close the
 * special dnodes, free the ZIL and the phys buffer, then free the
 * objset itself.  The objset must be clean in every txg.
 */
void
dmu_objset_evict(objset_t *os)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;

	for (int t = 0; t < TXG_SIZE; t++)
		ASSERT(!dmu_objset_is_dirty(os, t));

	/* Mirror of the registrations done in dmu_objset_open_impl(). */
	if (ds) {
		if (!dsl_dataset_is_snapshot(ds)) {
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
			    checksum_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
			    compression_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_COPIES),
			    copies_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_DEDUP),
			    dedup_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
			    logbias_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_SYNC),
			    sync_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
			    redundant_metadata_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
			    recordsize_changed_cb, os));
		}
		VERIFY0(dsl_prop_unregister(ds,
		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
		    primary_cache_changed_cb, os));
		VERIFY0(dsl_prop_unregister(ds,
		    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
		    secondary_cache_changed_cb, os));
	}

	if (os->os_sa)
		sa_tear_down(os);

	dmu_objset_evict_dbufs(os);

	dnode_special_close(&os->os_meta_dnode);
	if (DMU_USERUSED_DNODE(os)) {
		dnode_special_close(&os->os_userused_dnode);
		dnode_special_close(&os->os_groupused_dnode);
	}
	zil_free(os->os_zil);

	/* All dnodes must be gone by now. */
	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);

	VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));

	/*
	 * This is a barrier to prevent the objset from going away in
	 * dnode_move() until we can safely ensure that the objset is still in
	 * use. We consider the objset valid before the barrier and invalid
	 * after the barrier.
	 */
	rw_enter(&os_lock, RW_READER);
	rw_exit(&os_lock);

	mutex_destroy(&os->os_lock);
	mutex_destroy(&os->os_obj_lock);
	mutex_destroy(&os->os_user_ptr_lock);
	kmem_free(os, sizeof (objset_t));
}
702
/* Return the creation time of this objset's most recent snapshot. */
timestruc_t
dmu_objset_snap_cmtime(objset_t *os)
{
	return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
}
708
/*
 * Create a new objset in syncing context, allocating its meta-dnode.
 * Also called from dsl_pool_create() with ds == NULL to create the
 * meta-objset (MOS).  Returns the opened objset.
 */
objset_t *
dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    dmu_objset_type_t type, dmu_tx_t *tx)
{
	objset_t *os;
	dnode_t *mdn;

	ASSERT(dmu_tx_is_syncing(tx));

	if (ds != NULL)
		VERIFY0(dmu_objset_from_ds(ds, &os));
	else
		VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));

	mdn = DMU_META_DNODE(os);

	dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
	    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);

	/*
	 * We don't want to have to increase the meta-dnode's nlevels
	 * later, because then we could do it in quiescing context while
	 * we are also accessing it in open context.
	 *
	 * This precaution is not necessary for the MOS (ds == NULL),
	 * because the MOS is only updated in syncing context.
	 * This is most fortunate: the MOS is the only objset that
	 * needs to be synced multiple times as spa_sync() iterates
	 * to convergence, so minimizing its dn_nlevels matters.
	 */
	if (ds != NULL) {
		int levels = 1;

		/*
		 * Determine the number of levels necessary for the meta-dnode
		 * to contain DN_MAX_OBJECT dnodes.
		 */
		while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
		    (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
		    DN_MAX_OBJECT * sizeof (dnode_phys_t))
			levels++;

		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
		    mdn->dn_nlevels = levels;
	}

	ASSERT(type != DMU_OST_NONE);
	ASSERT(type != DMU_OST_ANY);
	ASSERT(type < DMU_OST_NUMTYPES);
	os->os_phys->os_type = type;
	if (dmu_objset_userused_enabled(os)) {
		/* A freshly created objset is trivially fully accounted. */
		os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
		os->os_flags = os->os_phys->os_flags;
	}

	/*
	 * NOTE(review): for the MOS, ds is NULL here; this relies on
	 * dsl_dataset_dirty() tolerating a NULL dataset — confirm.
	 */
	dsl_dataset_dirty(ds, tx);

	return (os);
}
769
/* Arguments threaded through the dmu_objset_create() sync task. */
typedef struct dmu_objset_create_arg {
	const char *doca_name;		/* full name of the new dataset */
	cred_t *doca_cred;		/* credentials of the caller */
	/* optional callback to populate the new objset in syncing context */
	void (*doca_userfunc)(objset_t *os, void *arg,
	    cred_t *cr, dmu_tx_t *tx);
	void *doca_userarg;		/* opaque argument for doca_userfunc */
	dmu_objset_type_t doca_type;	/* objset type to create */
	uint64_t doca_flags;		/* DS_FLAG_* creation flags */
} dmu_objset_create_arg_t;
779
780/*ARGSUSED*/
781static int
782dmu_objset_create_check(void *arg, dmu_tx_t *tx)
783{
784	dmu_objset_create_arg_t *doca = arg;
785	dsl_pool_t *dp = dmu_tx_pool(tx);
786	dsl_dir_t *pdd;
787	const char *tail;
788	int error;
789
790	if (strchr(doca->doca_name, '@') != NULL)
791		return (SET_ERROR(EINVAL));
792
793	error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
794	if (error != 0)
795		return (error);
796	if (tail == NULL) {
797		dsl_dir_rele(pdd, FTAG);
798		return (SET_ERROR(EEXIST));
799	}
800	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
801	    doca->doca_cred);
802	dsl_dir_rele(pdd, FTAG);
803
804	return (error);
805}
806
/*
 * Sync-task apply function for dmu_objset_create(): create the dataset
 * and its objset, run the caller's populate callback, and log the
 * creation to pool history.
 */
static void
dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
{
	dmu_objset_create_arg_t *doca = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *pdd;
	const char *tail;
	dsl_dataset_t *ds;
	uint64_t obj;
	blkptr_t *bp;
	objset_t *os;

	/* The check function already validated the name. */
	VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));

	obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
	    doca->doca_cred, tx);

	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
	bp = dsl_dataset_get_blkptr(ds);
	os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
	    ds, bp, doca->doca_type, tx);

	/* Let the caller populate the new objset (e.g. zfs/zvol setup). */
	if (doca->doca_userfunc != NULL) {
		doca->doca_userfunc(os, doca->doca_userarg,
		    doca->doca_cred, tx);
	}

	spa_history_log_internal_ds(ds, "create", tx, "");
	dsl_dataset_rele(ds, FTAG);
	dsl_dir_rele(pdd, FTAG);
}
838
839int
840dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
841    void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
842{
843	dmu_objset_create_arg_t doca;
844
845	doca.doca_name = name;
846	doca.doca_cred = CRED();
847	doca.doca_flags = flags;
848	doca.doca_userfunc = func;
849	doca.doca_userarg = arg;
850	doca.doca_type = type;
851
852	return (dsl_sync_task(name,
853	    dmu_objset_create_check, dmu_objset_create_sync, &doca,
854	    5, ZFS_SPACE_CHECK_NORMAL));
855}
856
/* Arguments threaded through the dmu_objset_clone() sync task. */
typedef struct dmu_objset_clone_arg {
	const char *doca_clone;		/* name of the clone to create */
	const char *doca_origin;	/* snapshot to clone from */
	cred_t *doca_cred;		/* credentials of the caller */
} dmu_objset_clone_arg_t;
862
863/*ARGSUSED*/
864static int
865dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
866{
867	dmu_objset_clone_arg_t *doca = arg;
868	dsl_dir_t *pdd;
869	const char *tail;
870	int error;
871	dsl_dataset_t *origin;
872	dsl_pool_t *dp = dmu_tx_pool(tx);
873
874	if (strchr(doca->doca_clone, '@') != NULL)
875		return (SET_ERROR(EINVAL));
876
877	error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
878	if (error != 0)
879		return (error);
880	if (tail == NULL) {
881		dsl_dir_rele(pdd, FTAG);
882		return (SET_ERROR(EEXIST));
883	}
884
885	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
886	    doca->doca_cred);
887	if (error != 0) {
888		dsl_dir_rele(pdd, FTAG);
889		return (SET_ERROR(EDQUOT));
890	}
891	dsl_dir_rele(pdd, FTAG);
892
893	error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
894	if (error != 0)
895		return (error);
896
897	/* You can only clone snapshots, not the head datasets. */
898	if (!dsl_dataset_is_snapshot(origin)) {
899		dsl_dataset_rele(origin, FTAG);
900		return (SET_ERROR(EINVAL));
901	}
902	dsl_dataset_rele(origin, FTAG);
903
904	return (0);
905}
906
/*
 * Sync-task apply function for dmu_objset_clone(): create the clone
 * dataset from the origin snapshot and log it to pool history.
 */
static void
dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
{
	dmu_objset_clone_arg_t *doca = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *pdd;
	const char *tail;
	dsl_dataset_t *origin, *ds;
	uint64_t obj;
	char namebuf[MAXNAMELEN];

	/* The check function already validated both names. */
	VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
	VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));

	obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
	    doca->doca_cred, tx);

	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
	dsl_dataset_name(origin, namebuf);
	spa_history_log_internal_ds(ds, "clone", tx,
	    "origin=%s (%llu)", namebuf, origin->ds_object);
	dsl_dataset_rele(ds, FTAG);
	dsl_dataset_rele(origin, FTAG);
	dsl_dir_rele(pdd, FTAG);
}
932
933int
934dmu_objset_clone(const char *clone, const char *origin)
935{
936	dmu_objset_clone_arg_t doca;
937
938	doca.doca_clone = clone;
939	doca.doca_origin = origin;
940	doca.doca_cred = CRED();
941
942	return (dsl_sync_task(clone,
943	    dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
944	    5, ZFS_SPACE_CHECK_NORMAL));
945}
946
947int
948dmu_objset_snapshot_one(const char *fsname, const char *snapname)
949{
950	int err;
951	char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
952	nvlist_t *snaps = fnvlist_alloc();
953
954	fnvlist_add_boolean(snaps, longsnap);
955	strfree(longsnap);
956	err = dsl_dataset_snapshot(snaps, NULL, NULL);
957	fnvlist_free(snaps);
958	return (err);
959}
960
961static void
962dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
963{
964	dnode_t *dn;
965
966	while (dn = list_head(list)) {
967		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
968		ASSERT(dn->dn_dbuf->db_data_pending);
969		/*
970		 * Initialize dn_zio outside dnode_sync() because the
971		 * meta-dnode needs to set it ouside dnode_sync().
972		 */
973		dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
974		ASSERT(dn->dn_zio);
975
976		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
977		list_remove(list, dn);
978
979		if (newlist) {
980			(void) dnode_add_ref(dn, newlist);
981			list_insert_tail(newlist, dn);
982		}
983
984		dnode_sync(dn, tx);
985	}
986}
987
988/* ARGSUSED */
989static void
990dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
991{
992	blkptr_t *bp = zio->io_bp;
993	objset_t *os = arg;
994	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
995
996	ASSERT(!BP_IS_EMBEDDED(bp));
997	ASSERT3P(bp, ==, os->os_rootbp);
998	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
999	ASSERT0(BP_GET_LEVEL(bp));
1000
1001	/*
1002	 * Update rootbp fill count: it should be the number of objects
1003	 * allocated in the object set (not counting the "special"
1004	 * objects that are stored in the objset_phys_t -- the meta
1005	 * dnode and user/group accounting objects).
1006	 */
1007	bp->blk_fill = 0;
1008	for (int i = 0; i < dnp->dn_nblkptr; i++)
1009		bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
1010}
1011
1012/* ARGSUSED */
1013static void
1014dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
1015{
1016	blkptr_t *bp = zio->io_bp;
1017	blkptr_t *bp_orig = &zio->io_bp_orig;
1018	objset_t *os = arg;
1019
1020	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
1021		ASSERT(BP_EQUAL(bp, bp_orig));
1022	} else {
1023		dsl_dataset_t *ds = os->os_dsl_dataset;
1024		dmu_tx_t *tx = os->os_synctx;
1025
1026		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
1027		dsl_dataset_block_born(ds, bp, tx);
1028	}
1029}
1030
/*
 * Sync this objset for the given txg (called from the dsl layer): set
 * up the root-block write, sync the special dnodes, then all dirty and
 * freed dnodes, and finally issue the deferred dbuf zios and the ZIL
 * sync.  'pio' is the parent zio for the root-block write.
 */
void
dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
{
	int txgoff;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	zio_t *zio;
	list_t *list;
	list_t *newlist = NULL;
	dbuf_dirty_record_t *dr;

	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);

	ASSERT(dmu_tx_is_syncing(tx));
	/* XXX the write_done callback should really give us the tx... */
	os->os_synctx = tx;

	if (os->os_dsl_dataset == NULL) {
		/*
		 * This is the MOS.  If we have upgraded,
		 * spa_max_replication() could change, so reset
		 * os_copies here.
		 */
		os->os_copies = spa_max_replication(os->os_spa);
	}

	/*
	 * Create the root block IO
	 */
	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	arc_release(os->os_phys_buf, &os->os_phys_buf);

	dmu_write_policy(os, NULL, 0, 0, &zp);

	zio = arc_write(pio, os->os_spa, tx->tx_txg,
	    os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
	    DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready,
	    NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
	    ZIO_FLAG_MUSTSUCCEED, &zb);

	/*
	 * Sync special dnodes - the parent IO for the sync is the root block
	 */
	DMU_META_DNODE(os)->dn_zio = zio;
	dnode_sync(DMU_META_DNODE(os), tx);

	os->os_phys->os_flags = os->os_flags;

	if (DMU_USERUSED_DNODE(os) &&
	    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
		DMU_USERUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_USERUSED_DNODE(os), tx);
		DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
	}

	txgoff = tx->tx_txg & TXG_MASK;

	if (dmu_objset_userused_enabled(os)) {
		newlist = &os->os_synced_dnodes;
		/*
		 * We must create the list here because it uses the
		 * dn_dirty_link[] of this txg.
		 */
		list_create(newlist, sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[txgoff]));
	}

	/* Sync freed dnodes first so their objects can be reallocated. */
	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);

	/* Kick off the zios for the meta-dnode's own dirty level-0 dbufs. */
	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
	while (dr = list_head(list)) {
		ASSERT0(dr->dr_dbuf->db_level);
		list_remove(list, dr);
		if (dr->dr_zio)
			zio_nowait(dr->dr_zio);
	}
	/*
	 * Free intent log blocks up to this tx.
	 */
	zil_sync(os->os_zil, tx);
	os->os_phys->os_zil_header = os->os_zil_header;
	zio_nowait(zio);
}
1119
1120boolean_t
1121dmu_objset_is_dirty(objset_t *os, uint64_t txg)
1122{
1123	return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) ||
1124	    !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
1125}
1126
1127static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
1128
/*
 * Register the space-accounting callback for the given objset type.
 * Having a callback registered is one of the conditions for
 * dmu_objset_userused_enabled().
 */
void
dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
{
	used_cbs[ost] = cb;
}
1134
1135boolean_t
1136dmu_objset_userused_enabled(objset_t *os)
1137{
1138	return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
1139	    used_cbs[os->os_phys->os_type] != NULL &&
1140	    DMU_USERUSED_DNODE(os) != NULL);
1141}
1142
1143static void
1144do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
1145    uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
1146{
1147	if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
1148		int64_t delta = DNODE_SIZE + used;
1149		if (subtract)
1150			delta = -delta;
1151		VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
1152		    user, delta, tx));
1153		VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
1154		    group, delta, tx));
1155	}
1156}
1157
/*
 * Apply user/group space accounting for every dnode synced in this txg
 * (the os_synced_dnodes list), updating the DMU_USERUSED_OBJECT and
 * DMU_GROUPUSED_OBJECT zaps.  Runs in syncing context.
 */
void
dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
{
	dnode_t *dn;
	list_t *list = &os->os_synced_dnodes;

	ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));

	while (dn = list_head(list)) {
		int flags;
		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
		    dn->dn_phys->dn_flags &
		    DNODE_FLAG_USERUSED_ACCOUNTED);

		/* Allocate the user/groupused objects if necessary. */
		if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
			VERIFY(0 == zap_create_claim(os,
			    DMU_USERUSED_OBJECT,
			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
			VERIFY(0 == zap_create_claim(os,
			    DMU_GROUPUSED_OBJECT,
			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
		}

		/*
		 * We intentionally modify the zap object even if the
		 * net delta is zero.  Otherwise
		 * the block of the zap obj could be shared between
		 * datasets but need to be different between them after
		 * a bprewrite.
		 */

		flags = dn->dn_id_flags;
		ASSERT(flags);
		/* Remove the space charged to the previous owner... */
		if (flags & DN_ID_OLD_EXIST)  {
			do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
			    dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
		}
		/* ...and charge the current space to the new owner. */
		if (flags & DN_ID_NEW_EXIST) {
			do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
			    dn->dn_phys->dn_flags,  dn->dn_newuid,
			    dn->dn_newgid, B_FALSE, tx);
		}

		/*
		 * The "new" ids become the "old" ids for the next sync
		 * of this dnode; record which id source (bonus or spill)
		 * has already been checked.
		 */
		mutex_enter(&dn->dn_mtx);
		dn->dn_oldused = 0;
		dn->dn_oldflags = 0;
		if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
			dn->dn_olduid = dn->dn_newuid;
			dn->dn_oldgid = dn->dn_newgid;
			dn->dn_id_flags |= DN_ID_OLD_EXIST;
			if (dn->dn_bonuslen == 0)
				dn->dn_id_flags |= DN_ID_CHKED_SPILL;
			else
				dn->dn_id_flags |= DN_ID_CHKED_BONUS;
		}
		dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
		mutex_exit(&dn->dn_mtx);

		/* Unlink from the synced list and drop the matching hold. */
		list_remove(list, dn);
		dnode_rele(dn, list);
	}
}
1222
1223/*
1224 * Returns a pointer to data to find uid/gid from
1225 *
1226 * If a dirty record for transaction group that is syncing can't
1227 * be found then NULL is returned.  In the NULL case it is assumed
1228 * the uid/gid aren't changing.
1229 */
1230static void *
1231dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
1232{
1233	dbuf_dirty_record_t *dr, **drp;
1234	void *data;
1235
1236	if (db->db_dirtycnt == 0)
1237		return (db->db.db_data);  /* Nothing is changing */
1238
1239	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1240		if (dr->dr_txg == tx->tx_txg)
1241			break;
1242
1243	if (dr == NULL) {
1244		data = NULL;
1245	} else {
1246		dnode_t *dn;
1247
1248		DB_DNODE_ENTER(dr->dr_dbuf);
1249		dn = DB_DNODE(dr->dr_dbuf);
1250
1251		if (dn->dn_bonuslen == 0 &&
1252		    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
1253			data = dr->dt.dl.dr_data->b_data;
1254		else
1255			data = dr->dt.dl.dr_data;
1256
1257		DB_DNODE_EXIT(dr->dr_dbuf);
1258	}
1259
1260	return (data);
1261}
1262
/*
 * Determine the uid/gid owning this dnode from its bonus buffer or
 * spill block, via the registered per-objset-type callback, and record
 * them in dn_olduid/dn_oldgid (before == B_TRUE, i.e. pre-modification
 * values) or dn_newuid/dn_newgid (before == B_FALSE).  dn_id_flags is
 * updated to note which id source has been checked/found.
 */
void
dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
{
	objset_t *os = dn->dn_objset;
	void *data = NULL;
	dmu_buf_impl_t *db = NULL;
	uint64_t *user = NULL;
	uint64_t *group = NULL;
	int flags = dn->dn_id_flags;
	int error;
	boolean_t have_spill = B_FALSE;

	if (!dmu_objset_userused_enabled(dn->dn_objset))
		return;

	/* The "old" ids were already determined; nothing more to do. */
	if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
	    DN_ID_CHKED_SPILL)))
		return;

	/* Pick the buffer holding the id-bearing data. */
	if (before && dn->dn_bonuslen != 0)
		data = DN_BONUS(dn->dn_phys);
	else if (!before && dn->dn_bonuslen != 0) {
		if (dn->dn_bonus) {
			db = dn->dn_bonus;
			mutex_enter(&db->db_mtx);
			data = dmu_objset_userquota_find_data(db, tx);
		} else {
			data = DN_BONUS(dn->dn_phys);
		}
	} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
			/* SA object with no bonus: ids live in the spill. */
			int rf = 0;

			if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
				rf |= DB_RF_HAVESTRUCT;
			error = dmu_spill_hold_by_dnode(dn,
			    rf | DB_RF_MUST_SUCCEED,
			    FTAG, (dmu_buf_t **)&db);
			ASSERT(error == 0);
			mutex_enter(&db->db_mtx);
			data = (before) ? db->db.db_data :
			    dmu_objset_userquota_find_data(db, tx);
			have_spill = B_TRUE;
	} else {
		/* No bonus and not an SA object: nothing to derive ids from. */
		mutex_enter(&dn->dn_mtx);
		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
		mutex_exit(&dn->dn_mtx);
		return;
	}

	if (before) {
		ASSERT(data);
		user = &dn->dn_olduid;
		group = &dn->dn_oldgid;
	} else if (data) {
		user = &dn->dn_newuid;
		group = &dn->dn_newgid;
	}

	/*
	 * Must always call the callback in case the object
	 * type has changed and that type isn't an object type to track
	 */
	error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
	    user, group);

	/*
	 * Preserve existing uid/gid when the callback can't determine
	 * what the new uid/gid are and the callback returned EEXIST.
	 * The EEXIST error tells us to just use the existing uid/gid.
	 * If we don't know what the old values are then just assign
	 * them to 0, since that is a new file  being created.
	 */
	if (!before && data == NULL && error == EEXIST) {
		if (flags & DN_ID_OLD_EXIST) {
			dn->dn_newuid = dn->dn_olduid;
			dn->dn_newgid = dn->dn_oldgid;
		} else {
			dn->dn_newuid = 0;
			dn->dn_newgid = 0;
		}
		error = 0;
	}

	if (db)
		mutex_exit(&db->db_mtx);

	/* Record the outcome under dn_mtx. */
	mutex_enter(&dn->dn_mtx);
	if (error == 0 && before)
		dn->dn_id_flags |= DN_ID_OLD_EXIST;
	if (error == 0 && !before)
		dn->dn_id_flags |= DN_ID_NEW_EXIST;

	if (have_spill) {
		dn->dn_id_flags |= DN_ID_CHKED_SPILL;
	} else {
		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
	}
	mutex_exit(&dn->dn_mtx);
	/* Drop the spill-block hold acquired above, if any. */
	if (have_spill)
		dmu_buf_rele((dmu_buf_t *)db, FTAG);
}
1364
/*
 * True if user/group space accounting has been (fully) computed for
 * this objset, as recorded in the on-disk objset flags.
 */
boolean_t
dmu_objset_userspace_present(objset_t *os)
{
	return (os->os_phys->os_flags &
	    OBJSET_FLAG_USERACCOUNTING_COMPLETE);
}
1371
/*
 * Bring an objset created before SPA_VERSION_USERSPACE up to date by
 * dirtying every object so that its space becomes accounted on the next
 * sync.  Returns 0 on success (or if already upgraded), ENOTSUP if
 * accounting isn't enabled, EINVAL for snapshots, and EINTR if
 * interrupted by a signal.
 */
int
dmu_objset_userspace_upgrade(objset_t *os)
{
	uint64_t obj;
	int err = 0;

	if (dmu_objset_userspace_present(os))
		return (0);
	if (!dmu_objset_userused_enabled(os))
		return (SET_ERROR(ENOTSUP));
	if (dmu_objset_is_snapshot(os))
		return (SET_ERROR(EINVAL));

	/*
	 * We simply need to mark every object dirty, so that it will be
	 * synced out and now accounted.  If this is called
	 * concurrently, or if we already did some work before crashing,
	 * that's fine, since we track each object's accounted state
	 * independently.
	 */

	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
		dmu_tx_t *tx;
		dmu_buf_t *db;
		int objerr;

		/* Allow the (potentially long) upgrade to be interrupted. */
		if (issig(JUSTLOOKING) && issig(FORREAL))
			return (SET_ERROR(EINTR));

		/* Per-object errors are not fatal; just skip the object. */
		objerr = dmu_bonus_hold(os, obj, FTAG, &db);
		if (objerr != 0)
			continue;
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, obj);
		objerr = dmu_tx_assign(tx, TXG_WAIT);
		if (objerr != 0) {
			dmu_tx_abort(tx);
			continue;
		}
		dmu_buf_will_dirty(db, tx);
		dmu_buf_rele(db, FTAG);
		dmu_tx_commit(tx);
	}

	/* Wait for the dirtied objects to sync before declaring success. */
	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
	txg_wait_synced(dmu_objset_pool(os), 0);
	return (0);
}
1420
/*
 * Report space usage for the objset's dataset: referenced bytes,
 * available bytes, used objects, and available objects.
 */
void
dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
    uint64_t *usedobjsp, uint64_t *availobjsp)
{
	dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
	    usedobjsp, availobjsp);
}
1428
/*
 * Return the fsid guid of the objset's dataset (used to identify the
 * filesystem, e.g. for statvfs).
 */
uint64_t
dmu_objset_fsid_guid(objset_t *os)
{
	return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
}
1434
/*
 * Fill in cheap-to-collect stats: the objset type, plus the dataset's
 * fast stats when a dataset is associated (MOS has none).
 */
void
dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
{
	stat->dds_type = os->os_phys->os_type;
	if (os->os_dsl_dataset)
		dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
}
1442
/*
 * Add this objset's stats to nvlist 'nv': the dataset's stats (when one
 * is associated), the objset type, and whether user accounting data is
 * present.
 */
void
dmu_objset_stats(objset_t *os, nvlist_t *nv)
{
	/* Only the MOS may lack an associated dataset. */
	ASSERT(os->os_dsl_dataset ||
	    os->os_phys->os_type == DMU_OST_META);

	if (os->os_dsl_dataset != NULL)
		dsl_dataset_stats(os->os_dsl_dataset, nv);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
	    os->os_phys->os_type);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
	    dmu_objset_userspace_present(os));
}
1457
1458int
1459dmu_objset_is_snapshot(objset_t *os)
1460{
1461	if (os->os_dsl_dataset != NULL)
1462		return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
1463	else
1464		return (B_FALSE);
1465}
1466
/*
 * Look up snapshot 'name' in the dataset's snapnames zap using
 * normalized matching (zap_lookup_norm with MT_FIRST); on success the
 * actual on-disk spelling is copied into 'real' (up to maxlen bytes)
 * and *conflict reports whether the normalized name was ambiguous.
 * Returns ENOENT if the dataset has no snapshot zap.
 */
int
dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
    boolean_t *conflict)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	uint64_t ignored;

	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
		return (SET_ERROR(ENOENT));

	return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
	    MT_FIRST, real, maxlen, conflict));
}
1481
/*
 * Iterate one step over the dataset's snapshots.  *offp is a serialized
 * zap cursor position (0 starts the iteration) and is advanced on
 * success.  Returns ENOENT when the iteration is exhausted (or the
 * dataset has no snapshot zap) and ENAMETOOLONG if the snapshot name
 * does not fit in namelen bytes.  Optionally returns the snapshot's
 * object id (*idp) and case-conflict status (*case_conflict).
 */
int
dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	zap_cursor_t cursor;
	zap_attribute_t attr;

	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));

	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
		return (SET_ERROR(ENOENT));

	/* Resume the iteration from the caller-supplied cursor position. */
	zap_cursor_init_serialized(&cursor,
	    ds->ds_dir->dd_pool->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);

	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
		zap_cursor_fini(&cursor);
		return (SET_ERROR(ENOENT));
	}

	if (strlen(attr.za_name) + 1 > namelen) {
		zap_cursor_fini(&cursor);
		return (SET_ERROR(ENAMETOOLONG));
	}

	(void) strcpy(name, attr.za_name);
	if (idp)
		*idp = attr.za_first_integer;
	if (case_conflict)
		*case_conflict = attr.za_normalization_conflict;
	/* Hand back the position of the next entry. */
	zap_cursor_advance(&cursor);
	*offp = zap_cursor_serialize(&cursor);
	zap_cursor_fini(&cursor);

	return (0);
}
1520
/*
 * Iterate one step over the child directories (filesystems) of this
 * objset's dsl_dir.  *offp is a serialized zap cursor position (0
 * starts the iteration) and is advanced on success.  Returns ENOENT
 * when exhausted or when os is a snapshot, ENAMETOOLONG if the name
 * does not fit in namelen bytes.  Optionally returns the child's
 * object id in *idp.
 */
int
dmu_dir_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp)
{
	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
	zap_cursor_t cursor;
	zap_attribute_t attr;

	/* there is no next dir on a snapshot! */
	if (os->os_dsl_dataset->ds_object !=
	    dsl_dir_phys(dd)->dd_head_dataset_obj)
		return (SET_ERROR(ENOENT));

	/* Resume the iteration from the caller-supplied cursor position. */
	zap_cursor_init_serialized(&cursor,
	    dd->dd_pool->dp_meta_objset,
	    dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);

	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
		zap_cursor_fini(&cursor);
		return (SET_ERROR(ENOENT));
	}

	if (strlen(attr.za_name) + 1 > namelen) {
		zap_cursor_fini(&cursor);
		return (SET_ERROR(ENAMETOOLONG));
	}

	(void) strcpy(name, attr.za_name);
	if (idp)
		*idp = attr.za_first_integer;
	/* Hand back the position of the next entry. */
	zap_cursor_advance(&cursor);
	*offp = zap_cursor_serialize(&cursor);
	zap_cursor_fini(&cursor);

	return (0);
}
1557
1558/*
1559 * Find objsets under and including ddobj, call func(ds) on each.
1560 */
1561int
1562dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
1563    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
1564{
1565	dsl_dir_t *dd;
1566	dsl_dataset_t *ds;
1567	zap_cursor_t zc;
1568	zap_attribute_t *attr;
1569	uint64_t thisobj;
1570	int err;
1571
1572	ASSERT(dsl_pool_config_held(dp));
1573
1574	err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
1575	if (err != 0)
1576		return (err);
1577
1578	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
1579	if (dd->dd_myname[0] == '$') {
1580		dsl_dir_rele(dd, FTAG);
1581		return (0);
1582	}
1583
1584	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
1585	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
1586
1587	/*
1588	 * Iterate over all children.
1589	 */
1590	if (flags & DS_FIND_CHILDREN) {
1591		for (zap_cursor_init(&zc, dp->dp_meta_objset,
1592		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
1593		    zap_cursor_retrieve(&zc, attr) == 0;
1594		    (void) zap_cursor_advance(&zc)) {
1595			ASSERT3U(attr->za_integer_length, ==,
1596			    sizeof (uint64_t));
1597			ASSERT3U(attr->za_num_integers, ==, 1);
1598
1599			err = dmu_objset_find_dp(dp, attr->za_first_integer,
1600			    func, arg, flags);
1601			if (err != 0)
1602				break;
1603		}
1604		zap_cursor_fini(&zc);
1605
1606		if (err != 0) {
1607			dsl_dir_rele(dd, FTAG);
1608			kmem_free(attr, sizeof (zap_attribute_t));
1609			return (err);
1610		}
1611	}
1612
1613	/*
1614	 * Iterate over all snapshots.
1615	 */
1616	if (flags & DS_FIND_SNAPSHOTS) {
1617		dsl_dataset_t *ds;
1618		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1619
1620		if (err == 0) {
1621			uint64_t snapobj;
1622
1623			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
1624			dsl_dataset_rele(ds, FTAG);
1625
1626			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
1627			    zap_cursor_retrieve(&zc, attr) == 0;
1628			    (void) zap_cursor_advance(&zc)) {
1629				ASSERT3U(attr->za_integer_length, ==,
1630				    sizeof (uint64_t));
1631				ASSERT3U(attr->za_num_integers, ==, 1);
1632
1633				err = dsl_dataset_hold_obj(dp,
1634				    attr->za_first_integer, FTAG, &ds);
1635				if (err != 0)
1636					break;
1637				err = func(dp, ds, arg);
1638				dsl_dataset_rele(ds, FTAG);
1639				if (err != 0)
1640					break;
1641			}
1642			zap_cursor_fini(&zc);
1643		}
1644	}
1645
1646	dsl_dir_rele(dd, FTAG);
1647	kmem_free(attr, sizeof (zap_attribute_t));
1648
1649	if (err != 0)
1650		return (err);
1651
1652	/*
1653	 * Apply to self.
1654	 */
1655	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1656	if (err != 0)
1657		return (err);
1658	err = func(dp, ds, arg);
1659	dsl_dataset_rele(ds, FTAG);
1660	return (err);
1661}
1662
1663/*
1664 * Find all objsets under name, and for each, call 'func(child_name, arg)'.
1665 * The dp_config_rwlock must not be held when this is called, and it
1666 * will not be held when the callback is called.
1667 * Therefore this function should only be used when the pool is not changing
1668 * (e.g. in syncing context), or the callback can deal with the possible races.
1669 */
1670static int
1671dmu_objset_find_impl(spa_t *spa, const char *name,
1672    int func(const char *, void *), void *arg, int flags)
1673{
1674	dsl_dir_t *dd;
1675	dsl_pool_t *dp = spa_get_dsl(spa);
1676	dsl_dataset_t *ds;
1677	zap_cursor_t zc;
1678	zap_attribute_t *attr;
1679	char *child;
1680	uint64_t thisobj;
1681	int err;
1682
1683	dsl_pool_config_enter(dp, FTAG);
1684
1685	err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
1686	if (err != 0) {
1687		dsl_pool_config_exit(dp, FTAG);
1688		return (err);
1689	}
1690
1691	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
1692	if (dd->dd_myname[0] == '$') {
1693		dsl_dir_rele(dd, FTAG);
1694		dsl_pool_config_exit(dp, FTAG);
1695		return (0);
1696	}
1697
1698	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
1699	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
1700
1701	/*
1702	 * Iterate over all children.
1703	 */
1704	if (flags & DS_FIND_CHILDREN) {
1705		for (zap_cursor_init(&zc, dp->dp_meta_objset,
1706		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
1707		    zap_cursor_retrieve(&zc, attr) == 0;
1708		    (void) zap_cursor_advance(&zc)) {
1709			ASSERT3U(attr->za_integer_length, ==,
1710			    sizeof (uint64_t));
1711			ASSERT3U(attr->za_num_integers, ==, 1);
1712
1713			child = kmem_asprintf("%s/%s", name, attr->za_name);
1714			dsl_pool_config_exit(dp, FTAG);
1715			err = dmu_objset_find_impl(spa, child,
1716			    func, arg, flags);
1717			dsl_pool_config_enter(dp, FTAG);
1718			strfree(child);
1719			if (err != 0)
1720				break;
1721		}
1722		zap_cursor_fini(&zc);
1723
1724		if (err != 0) {
1725			dsl_dir_rele(dd, FTAG);
1726			dsl_pool_config_exit(dp, FTAG);
1727			kmem_free(attr, sizeof (zap_attribute_t));
1728			return (err);
1729		}
1730	}
1731
1732	/*
1733	 * Iterate over all snapshots.
1734	 */
1735	if (flags & DS_FIND_SNAPSHOTS) {
1736		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1737
1738		if (err == 0) {
1739			uint64_t snapobj;
1740
1741			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
1742			dsl_dataset_rele(ds, FTAG);
1743
1744			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
1745			    zap_cursor_retrieve(&zc, attr) == 0;
1746			    (void) zap_cursor_advance(&zc)) {
1747				ASSERT3U(attr->za_integer_length, ==,
1748				    sizeof (uint64_t));
1749				ASSERT3U(attr->za_num_integers, ==, 1);
1750
1751				child = kmem_asprintf("%s@%s",
1752				    name, attr->za_name);
1753				dsl_pool_config_exit(dp, FTAG);
1754				err = func(child, arg);
1755				dsl_pool_config_enter(dp, FTAG);
1756				strfree(child);
1757				if (err != 0)
1758					break;
1759			}
1760			zap_cursor_fini(&zc);
1761		}
1762	}
1763
1764	dsl_dir_rele(dd, FTAG);
1765	kmem_free(attr, sizeof (zap_attribute_t));
1766	dsl_pool_config_exit(dp, FTAG);
1767
1768	if (err != 0)
1769		return (err);
1770
1771	/* Apply to self. */
1772	return (func(name, arg));
1773}
1774
1775/*
1776 * See comment above dmu_objset_find_impl().
1777 */
1778int
1779dmu_objset_find(char *name, int func(const char *, void *), void *arg,
1780    int flags)
1781{
1782	spa_t *spa;
1783	int error;
1784
1785	error = spa_open(name, &spa, FTAG);
1786	if (error != 0)
1787		return (error);
1788	error = dmu_objset_find_impl(spa, name, func, arg, flags);
1789	spa_close(spa, FTAG);
1790	return (error);
1791}
1792
/*
 * Attach an opaque user pointer to the objset.  The caller must hold
 * os_user_ptr_lock.
 */
void
dmu_objset_set_user(objset_t *os, void *user_ptr)
{
	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
	os->os_user_ptr = user_ptr;
}
1799
/*
 * Retrieve the opaque user pointer set by dmu_objset_set_user().  The
 * caller must hold os_user_ptr_lock.
 */
void *
dmu_objset_get_user(objset_t *os)
{
	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
	return (os->os_user_ptr);
}
1806
1807/*
1808 * Determine name of filesystem, given name of snapshot.
1809 * buf must be at least MAXNAMELEN bytes
1810 */
1811int
1812dmu_fsname(const char *snapname, char *buf)
1813{
1814	char *atp = strchr(snapname, '@');
1815	if (atp == NULL)
1816		return (SET_ERROR(EINVAL));
1817	if (atp - snapname >= MAXNAMELEN)
1818		return (SET_ERROR(ENAMETOOLONG));
1819	(void) strlcpy(buf, snapname, atp - snapname + 1);
1820	return (0);
1821}
1822