1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23269416Sdelphij * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24251646Sdelphij * Copyright (c) 2013 Steven Hartland. All rights reserved.
25168404Spjd */
26168404Spjd
27168404Spjd#include <sys/dsl_pool.h>
28168404Spjd#include <sys/dsl_dataset.h>
29219089Spjd#include <sys/dsl_prop.h>
30168404Spjd#include <sys/dsl_dir.h>
31168404Spjd#include <sys/dsl_synctask.h>
32219089Spjd#include <sys/dsl_scan.h>
33219089Spjd#include <sys/dnode.h>
34168404Spjd#include <sys/dmu_tx.h>
35168404Spjd#include <sys/dmu_objset.h>
36168404Spjd#include <sys/arc.h>
37168404Spjd#include <sys/zap.h>
38168404Spjd#include <sys/zio.h>
39168404Spjd#include <sys/zfs_context.h>
40168404Spjd#include <sys/fs/zfs.h>
41185029Spjd#include <sys/zfs_znode.h>
42185029Spjd#include <sys/spa_impl.h>
43219089Spjd#include <sys/dsl_deadlist.h>
44236884Smm#include <sys/bptree.h>
45236884Smm#include <sys/zfeature.h>
46239620Smm#include <sys/zil_impl.h>
47248571Smm#include <sys/dsl_userhold.h>
48168404Spjd
49271435Ssmh#ifdef __FreeBSD__
50271435Ssmh#include <sys/sysctl.h>
51271435Ssmh#include <sys/types.h>
52271435Ssmh#endif
53271435Ssmh
54260763Savg/*
55260763Savg * ZFS Write Throttle
56260763Savg * ------------------
57260763Savg *
58260763Savg * ZFS must limit the rate of incoming writes to the rate at which it is able
59260763Savg * to sync data modifications to the backend storage. Throttling by too much
60260763Savg * creates an artificial limit; throttling by too little can only be sustained
61260763Savg * for short periods and would lead to highly lumpy performance. On a per-pool
62260763Savg * basis, ZFS tracks the amount of modified (dirty) data. As operations change
63260763Savg * data, the amount of dirty data increases; as ZFS syncs out data, the amount
64260763Savg * of dirty data decreases. When the amount of dirty data exceeds a
65260763Savg * predetermined threshold further modifications are blocked until the amount
66260763Savg * of dirty data decreases (as data is synced out).
67260763Savg *
68260763Savg * The limit on dirty data is tunable, and should be adjusted according to
69260763Savg * both the IO capacity and available memory of the system. The larger the
70260763Savg * window, the more ZFS is able to aggregate and amortize metadata (and data)
71260763Savg * changes. However, memory is a limited resource, and allowing for more dirty
72260763Savg * data comes at the cost of keeping other useful data in memory (for example
73260763Savg * ZFS data cached by the ARC).
74260763Savg *
75260763Savg * Implementation
76260763Savg *
77260763Savg * As buffers are modified dsl_pool_willuse_space() increments both the per-
78260763Savg * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
79260763Savg * dirty space used; dsl_pool_dirty_space() decrements those values as data
80260763Savg * is synced out from dsl_pool_sync(). While only the poolwide value is
81260763Savg * relevant, the per-txg value is useful for debugging. The tunable
82260763Savg * zfs_dirty_data_max determines the dirty space limit. Once that value is
83260763Savg * exceeded, new writes are halted until space frees up.
84260763Savg *
85260763Savg * The zfs_dirty_data_sync tunable dictates the threshold at which we
86260763Savg * ensure that there is a txg syncing (see the comment in txg.c for a full
87260763Savg * description of transaction group stages).
88260763Savg *
89260763Savg * The IO scheduler uses both the dirty space limit and current amount of
90260763Savg * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
91260763Savg * issues. See the comment in vdev_queue.c for details of the IO scheduler.
92260763Savg *
93260763Savg * The delay is also calculated based on the amount of dirty data.  See the
94260763Savg * comment above dmu_tx_delay() for details.
95260763Savg */
96185029Spjd
/*
 * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
 * capped at zfs_dirty_data_max_max.  It can also be overridden in /etc/system.
 */
uint64_t zfs_dirty_data_max;
uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;	/* 4 GiB cap */
int zfs_dirty_data_max_percent = 10;

/*
 * If there is at least this much dirty data, push out a txg.
 * (Checked in dsl_pool_need_dirty_delay(), which calls txg_kick().)
 */
uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;

/*
 * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
 * and delay each transaction.
 * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
 */
int zfs_delay_min_dirty_percent = 60;

/*
 * This controls how quickly the delay approaches infinity.
 * Larger values cause it to delay more for a given amount of dirty data.
 * Therefore larger values will cause there to be less dirty data for a
 * given throughput.
 *
 * For the smoothest delay, this value should be about 1 billion divided
 * by the maximum number of operations per second.  This will smoothly
 * handle between 10x and 1/10th this number.
 *
 * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
 * multiply in dmu_tx_delay().
 */
uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
131260763Savg
132260763Savg
133271435Ssmh#ifdef __FreeBSD__
134260763Savg
135271435Ssmhextern int zfs_vdev_async_write_active_max_dirty_percent;
136271435Ssmh
137219089SpjdSYSCTL_DECL(_vfs_zfs);
138219089Spjd
139271435SsmhTUNABLE_QUAD("vfs.zfs.dirty_data_max", &zfs_dirty_data_max);
140271435SsmhSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN,
141271435Ssmh    &zfs_dirty_data_max, 0,
142271435Ssmh    "The maximum amount of dirty data in bytes after which new writes are "
143271435Ssmh    "halted until space becomes available");
144271435Ssmh
145271435SsmhTUNABLE_QUAD("vfs.zfs.dirty_data_max_max", &zfs_dirty_data_max_max);
146271435SsmhSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN,
147271435Ssmh    &zfs_dirty_data_max_max, 0,
148271435Ssmh    "The absolute cap on dirty_data_max when auto calculating");
149271435Ssmh
150271435SsmhTUNABLE_INT("vfs.zfs.dirty_data_max_percent", &zfs_dirty_data_max_percent);
151271435SsmhSYSCTL_INT(_vfs_zfs, OID_AUTO, dirty_data_max_percent, CTLFLAG_RDTUN,
152271435Ssmh    &zfs_dirty_data_max_percent, 0,
153271435Ssmh    "The percent of physical memory used to auto calculate dirty_data_max");
154271435Ssmh
155271435SsmhTUNABLE_QUAD("vfs.zfs.dirty_data_sync", &zfs_dirty_data_sync);
156271435SsmhSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN,
157271435Ssmh    &zfs_dirty_data_sync, 0,
158271435Ssmh    "Force a txg if the number of dirty buffer bytes exceed this value");
159271435Ssmh
160271435Ssmhstatic int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS);
161271435Ssmh/* No zfs_delay_min_dirty_percent tunable due to limit requirements */
162271435SsmhSYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent,
163271435Ssmh    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
164271435Ssmh    sysctl_zfs_delay_min_dirty_percent, "I",
165271435Ssmh    "The limit of outstanding dirty data before transations are delayed");
166271435Ssmh
167271435Ssmhstatic int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS);
168271435Ssmh/* No zfs_delay_scale tunable due to limit requirements */
169271435SsmhSYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale,
170271435Ssmh    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
171271435Ssmh    sysctl_zfs_delay_scale, "QU",
172271435Ssmh    "Controls how quickly the delay approaches infinity");
173271435Ssmh
174271435Ssmhstatic int
175271435Ssmhsysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS)
176271435Ssmh{
177271435Ssmh	int val, err;
178271435Ssmh
179271435Ssmh	val = zfs_delay_min_dirty_percent;
180271435Ssmh	err = sysctl_handle_int(oidp, &val, 0, req);
181271435Ssmh	if (err != 0 || req->newptr == NULL)
182271435Ssmh		return (err);
183271435Ssmh
184271435Ssmh	if (val < zfs_vdev_async_write_active_max_dirty_percent)
185271435Ssmh		return (EINVAL);
186271435Ssmh
187271435Ssmh	zfs_delay_min_dirty_percent = val;
188271435Ssmh
189271435Ssmh	return (0);
190271435Ssmh}
191271435Ssmh
192271435Ssmhstatic int
193271435Ssmhsysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS)
194271435Ssmh{
195271435Ssmh	uint64_t val;
196271435Ssmh	int err;
197271435Ssmh
198271435Ssmh	val = zfs_delay_scale;
199271435Ssmh	err = sysctl_handle_64(oidp, &val, 0, req);
200271435Ssmh	if (err != 0 || req->newptr == NULL)
201271435Ssmh		return (err);
202271435Ssmh
203271435Ssmh	if (val > UINT64_MAX / zfs_dirty_data_max)
204271435Ssmh		return (EINVAL);
205271435Ssmh
206271435Ssmh	zfs_delay_scale = val;
207271435Ssmh
208271435Ssmh	return (0);
209271435Ssmh}
210260763Savg#endif
211219089Spjd
/*
 * NOTE(review): these throttle knobs are not referenced elsewhere in this
 * file; presumably consumed by the transaction-assign/throttle code --
 * confirm against callers before changing.
 */
hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
214255437Sdelphij
215219089Spjdint
216185029Spjddsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
217168404Spjd{
218168404Spjd	uint64_t obj;
219168404Spjd	int err;
220168404Spjd
221168404Spjd	err = zap_lookup(dp->dp_meta_objset,
222168404Spjd	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
223185029Spjd	    name, sizeof (obj), 1, &obj);
224168404Spjd	if (err)
225168404Spjd		return (err);
226168404Spjd
227248571Smm	return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
228168404Spjd}
229168404Spjd
230168404Spjdstatic dsl_pool_t *
231168404Spjddsl_pool_open_impl(spa_t *spa, uint64_t txg)
232168404Spjd{
233168404Spjd	dsl_pool_t *dp;
234168404Spjd	blkptr_t *bp = spa_get_rootblkptr(spa);
235168404Spjd
236168404Spjd	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
237168404Spjd	dp->dp_spa = spa;
238168404Spjd	dp->dp_meta_rootbp = *bp;
239248571Smm	rrw_init(&dp->dp_config_rwlock, B_TRUE);
240168404Spjd	txg_init(dp, txg);
241168404Spjd
242168404Spjd	txg_list_create(&dp->dp_dirty_datasets,
243168404Spjd	    offsetof(dsl_dataset_t, ds_dirty_link));
244239620Smm	txg_list_create(&dp->dp_dirty_zilogs,
245239620Smm	    offsetof(zilog_t, zl_dirty_link));
246168404Spjd	txg_list_create(&dp->dp_dirty_dirs,
247168404Spjd	    offsetof(dsl_dir_t, dd_dirty_link));
248168404Spjd	txg_list_create(&dp->dp_sync_tasks,
249248571Smm	    offsetof(dsl_sync_task_t, dst_node));
250168404Spjd
251185029Spjd	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
252260763Savg	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
253185029Spjd
254196307Spjd	dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
255196307Spjd	    1, 4, 0);
256196307Spjd
257168404Spjd	return (dp);
258168404Spjd}
259168404Spjd
260168404Spjdint
261236884Smmdsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
262168404Spjd{
263168404Spjd	int err;
264168404Spjd	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
265236884Smm
266236884Smm	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
267236884Smm	    &dp->dp_meta_objset);
268236884Smm	if (err != 0)
269236884Smm		dsl_pool_close(dp);
270236884Smm	else
271236884Smm		*dpp = dp;
272236884Smm
273236884Smm	return (err);
274236884Smm}
275236884Smm
/*
 * Second stage of pool import (the MOS was opened by dsl_pool_init()):
 * look up and hold the root dir, the special directories ($MOS, $ORIGIN,
 * $FREE, $LEAK), the free bpobj, feature-dependent objects (bptree,
 * empty bpobj), the temporary-userref object, and initialize scan state.
 * Returns 0 or the first error; on error the caller releases partial
 * holds via dsl_pool_close().
 */
int
dsl_pool_open(dsl_pool_t *dp)
{
	int err;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
		goto out;

	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
		/* Hold the $ORIGIN head's previous snapshot in dp_origin_snap. */
		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
		if (err)
			goto out;
		err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
		    FTAG, &ds);
		if (err == 0) {
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, dp,
			    &dp->dp_origin_snap);
			dsl_dataset_rele(ds, FTAG);
		}
		dsl_dir_rele(dd, dp);
		if (err)
			goto out;
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
		    &dp->dp_free_dir);
		if (err)
			goto out;

		/* Open the bpobj that holds blocks queued for freeing. */
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
		if (err)
			goto out;
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	/*
	 * Note: errors ignored, because the leak dir will not exist if we
	 * have not encountered a leak yet.
	 */
	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
	    &dp->dp_leak_dir);

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
		    &dp->dp_bptree_obj);
		if (err != 0)
			goto out;
	}

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
		    &dp->dp_empty_bpobj);
		if (err != 0)
			goto out;
	}

	/* The tmp_userrefs object is created lazily, so ENOENT is not fatal. */
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
	    &dp->dp_tmp_userrefs_obj);
	if (err == ENOENT)
		err = 0;
	if (err)
		goto out;

	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);

out:
	rrw_exit(&dp->dp_config_rwlock, FTAG);
	return (err);
}
368168404Spjd
/*
 * Tear down a dsl_pool_t: release the holds taken in dsl_pool_open() /
 * dsl_pool_create(), evict the MOS, and destroy the lists, locks, and
 * taskq.  Safe on a partially opened pool -- each hold is released only
 * if it was actually taken.
 */
void
dsl_pool_close(dsl_pool_t *dp)
{
	/*
	 * Drop our references from dsl_pool_open().
	 *
	 * Since we held the origin_snap from "syncing" context (which
	 * includes pool-opening context), it actually only got a "ref"
	 * and not a hold, so just drop that here.
	 */
	if (dp->dp_origin_snap)
		dsl_dataset_rele(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir)
		dsl_dir_rele(dp->dp_mos_dir, dp);
	if (dp->dp_free_dir)
		dsl_dir_rele(dp->dp_free_dir, dp);
	if (dp->dp_leak_dir)
		dsl_dir_rele(dp->dp_leak_dir, dp);
	if (dp->dp_root_dir)
		dsl_dir_rele(dp->dp_root_dir, dp);

	bpobj_close(&dp->dp_free_bpobj);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	if (dp->dp_meta_objset)
		dmu_objset_evict(dp->dp_meta_objset);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_zilogs);
	txg_list_destroy(&dp->dp_sync_tasks);
	txg_list_destroy(&dp->dp_dirty_dirs);

	/* Evict any ARC buffers for this pool before freeing its state. */
	arc_flush(dp->dp_spa);
	txg_fini(dp);
	dsl_scan_fini(dp);
	rrw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	taskq_destroy(dp->dp_vnrele_taskq);
	if (dp->dp_blkstats)
		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
	kmem_free(dp, sizeof (dsl_pool_t));
}
411168404Spjd
/*
 * Create a brand-new pool in the given initial txg: build and dirty the
 * MOS, the pool object directory, the root dsl_dir, the special
 * directories ($MOS and, on new enough versions, $FREE and $ORIGIN),
 * and finally the root dataset with its ZFS objset, all within one
 * assigned transaction.  Returns the newly allocated dsl_pool_t.
 */
dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
	objset_t *os;
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	/* create and open the MOS (meta-objset) */
	dp->dp_meta_objset = dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT0(err);

	/* Initialize scan structures */
	VERIFY0(dsl_scan_init(dp, txg));

	/* create and open the root dir */
	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
	VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    MOS_DIR_NAME, &dp->dp_mos_dir));

	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		/* create and open the free dir */
		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
		    FREE_DIR_NAME, tx);
		VERIFY0(dsl_pool_open_special_dir(dp,
		    FREE_DIR_NAME, &dp->dp_free_dir));

		/* create and open the free_bplist */
		obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
		dsl_pool_create_origin(dp, tx);

	/* create the root dataset */
	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);

	/* create the root objset */
	VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
	os = dmu_objset_create_impl(dp->dp_spa, ds,
	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
#ifdef _KERNEL
	/* Populate the root filesystem (znodes); only possible in-kernel. */
	zfs_create_fs(os, kcred, zplprops, tx);
#endif
	dsl_dataset_rele(ds, FTAG);

	dmu_tx_commit(tx);

	rrw_exit(&dp->dp_config_rwlock, FTAG);

	return (dp);
}
482168404Spjd
483239620Smm/*
484239620Smm * Account for the meta-objset space in its placeholder dsl_dir.
485239620Smm */
486239620Smmvoid
487239620Smmdsl_pool_mos_diduse_space(dsl_pool_t *dp,
488239620Smm    int64_t used, int64_t comp, int64_t uncomp)
489239620Smm{
490239620Smm	ASSERT3U(comp, ==, uncomp); /* it's all metadata */
491239620Smm	mutex_enter(&dp->dp_lock);
492239620Smm	dp->dp_mos_used_delta += used;
493239620Smm	dp->dp_mos_compressed_delta += comp;
494239620Smm	dp->dp_mos_uncompressed_delta += uncomp;
495239620Smm	mutex_exit(&dp->dp_lock);
496239620Smm}
497239620Smm
498219089Spjdstatic int
499219089Spjddeadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
500219089Spjd{
501219089Spjd	dsl_deadlist_t *dl = arg;
502219089Spjd	dsl_deadlist_insert(dl, bp, tx);
503219089Spjd	return (0);
504219089Spjd}
505219089Spjd
506260763Savgstatic void
507260763Savgdsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
508260763Savg{
509260763Savg	zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
510260763Savg	dmu_objset_sync(dp->dp_meta_objset, zio, tx);
511260763Savg	VERIFY0(zio_wait(zio));
512260763Savg	dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
513260763Savg	spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
514260763Savg}
515260763Savg
/*
 * Adjust the poolwide dirty-data count by "delta" bytes and, when the
 * total is at or under zfs_dirty_data_max, wake one waiter on
 * dp_spaceavail_cv.  Caller must hold dp_lock.
 */
static void
dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
{
	ASSERT(MUTEX_HELD(&dp->dp_lock));

	/* A negative delta must never drive dp_dirty_total below zero. */
	if (delta < 0)
		ASSERT3U(-delta, <=, dp->dp_dirty_total);

	dp->dp_dirty_total += delta;

	/*
	 * Note: we signal even when increasing dp_dirty_total.
	 * This ensures forward progress -- each thread wakes the next waiter.
	 */
	if (dp->dp_dirty_total <= zfs_dirty_data_max)
		cv_signal(&dp->dp_spaceavail_cv);
}
533260763Savg
/*
 * Sync all dirty pool state for the given txg.  Called from syncing
 * context.  The phases are strictly ordered:
 *
 *  1. Sync dirty datasets (which may dirty the MOS).
 *  2. Settle the dirty-space accounting for this txg.
 *  3. Apply user/group quota updates, then re-sync datasets dirtied
 *     by those updates.
 *  4. Flush pending deadlists and drop the dsl_dataset_dirty() holds.
 *  5. Sync dirty dsl_dirs, apply deferred MOS space deltas, and sync
 *     the MOS itself.
 *  6. Run queued sync tasks (their MOS dirt syncs on the next pass).
 */
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	zio_t *zio;
	dmu_tx_t *tx;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	objset_t *mos = dp->dp_meta_objset;
	list_t synced_datasets;

	list_create(&synced_datasets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Write out all dirty blocks of dirty datasets.
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		/*
		 * We must not sync any non-MOS datasets twice, because
		 * we may have taken a snapshot of them.  However, we
		 * may sync newly-created datasets on pass 2.
		 */
		ASSERT(!list_link_active(&ds->ds_synced_link));
		list_insert_tail(&synced_datasets, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	VERIFY0(zio_wait(zio));

	/*
	 * We have written all of the accounted dirty data, so our
	 * dp_space_towrite should now be zero.  However, some seldom-used
	 * code paths do not adhere to this (e.g. dbuf_undirty(), also
	 * rounding error in dbuf_write_physdone).
	 * Shore up the accounting of any dirtied space now.
	 */
	dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);

	/*
	 * After the data blocks have been written (ensured by the zio_wait()
	 * above), update the user/group space accounting.
	 */
	for (ds = list_head(&synced_datasets); ds != NULL;
	    ds = list_next(&synced_datasets, ds)) {
		dmu_objset_do_userquota_updates(ds->ds_objset, tx);
	}

	/*
	 * Sync the datasets again to push out the changes due to
	 * userspace updates.  This must be done before we process the
	 * sync tasks, so that any snapshots will have the correct
	 * user accounting information (and we won't get confused
	 * about which blocks are part of the snapshot).
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		ASSERT(list_link_active(&ds->ds_synced_link));
		dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	VERIFY0(zio_wait(zio));

	/*
	 * Now that the datasets have been completely synced, we can
	 * clean up our in-memory structures accumulated while syncing:
	 *
	 *  - move dead blocks from the pending deadlist to the on-disk deadlist
	 *  - release hold from dsl_dataset_dirty()
	 */
	while ((ds = list_remove_head(&synced_datasets)) != NULL) {
		objset_t *os = ds->ds_objset;
		bplist_iterate(&ds->ds_pending_deadlist,
		    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
		ASSERT(!dmu_objset_is_dirty(os, txg));
		dmu_buf_rele(ds->ds_dbuf, ds);
	}
	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
		dsl_dir_sync(dd, tx);
	}

	/*
	 * The MOS's space is accounted for in the pool/$MOS
	 * (dp_mos_dir).  We can't modify the mos while we're syncing
	 * it, so we remember the deltas and apply them here.
	 */
	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
	    dp->dp_mos_uncompressed_delta != 0) {
		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
		    dp->dp_mos_used_delta,
		    dp->dp_mos_compressed_delta,
		    dp->dp_mos_uncompressed_delta, tx);
		dp->dp_mos_used_delta = 0;
		dp->dp_mos_compressed_delta = 0;
		dp->dp_mos_uncompressed_delta = 0;
	}

	/* Only sync the MOS if it actually has dirty or freeing dnodes. */
	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
		dsl_pool_sync_mos(dp, tx);
	}

	/*
	 * If we modify a dataset in the same txg that we want to destroy it,
	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
	 * and clearing the hold on it) before we process the sync_tasks.
	 * The MOS data dirtied by the sync_tasks will be synced on the next
	 * pass.
	 */
	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
		dsl_sync_task_t *dst;
		/*
		 * No more sync tasks should have been added while we
		 * were syncing.
		 */
		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
		while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
			dsl_sync_task_sync(dst, tx);
	}

	dmu_tx_commit(tx);

	DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
}
661168404Spjd
662168404Spjdvoid
663219089Spjddsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
664168404Spjd{
665239620Smm	zilog_t *zilog;
666168404Spjd
667239620Smm	while (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) {
668260763Savg		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
669239620Smm		zil_clean(zilog, txg);
670239620Smm		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
671239620Smm		dmu_buf_rele(ds->ds_dbuf, zilog);
672168404Spjd	}
673219089Spjd	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
674168404Spjd}
675168404Spjd
676168404Spjd/*
677168404Spjd * TRUE if the current thread is the tx_sync_thread or if we
678168404Spjd * are being called from SPA context during pool initialization.
679168404Spjd */
680168404Spjdint
681168404Spjddsl_pool_sync_context(dsl_pool_t *dp)
682168404Spjd{
683168404Spjd	return (curthread == dp->dp_tx.tx_sync_thread ||
684236884Smm	    spa_is_initializing(dp->dp_spa));
685168404Spjd}
686168404Spjd
687168404Spjduint64_t
688168404Spjddsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
689168404Spjd{
690168404Spjd	uint64_t space, resv;
691168404Spjd
692168404Spjd	/*
693168404Spjd	 * If we're trying to assess whether it's OK to do a free,
694168404Spjd	 * cut the reservation in half to allow forward progress
695168404Spjd	 * (e.g. make it possible to rm(1) files from a full pool).
696168404Spjd	 */
697168404Spjd	space = spa_get_dspace(dp->dp_spa);
698269006Sdelphij	resv = spa_get_slop_space(dp->dp_spa);
699168404Spjd	if (netfree)
700168404Spjd		resv >>= 1;
701168404Spjd
702168404Spjd	return (space - resv);
703168404Spjd}
704185029Spjd
705260763Savgboolean_t
706260763Savgdsl_pool_need_dirty_delay(dsl_pool_t *dp)
707185029Spjd{
708260763Savg	uint64_t delay_min_bytes =
709260763Savg	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
710260763Savg	boolean_t rv;
711185029Spjd
712260763Savg	mutex_enter(&dp->dp_lock);
713260763Savg	if (dp->dp_dirty_total > zfs_dirty_data_sync)
714260763Savg		txg_kick(dp);
715260763Savg	rv = (dp->dp_dirty_total > delay_min_bytes);
716260763Savg	mutex_exit(&dp->dp_lock);
717260763Savg	return (rv);
718185029Spjd}
719185029Spjd
720185029Spjdvoid
721260763Savgdsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
722185029Spjd{
723260763Savg	if (space > 0) {
724260763Savg		mutex_enter(&dp->dp_lock);
725260763Savg		dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
726260763Savg		dsl_pool_dirty_delta(dp, space);
727260763Savg		mutex_exit(&dp->dp_lock);
728260763Savg	}
729185029Spjd}
730185029Spjd
731185029Spjdvoid
732260763Savgdsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
733185029Spjd{
734260763Savg	ASSERT3S(space, >=, 0);
735260763Savg	if (space == 0)
736185029Spjd		return;
737260763Savg	mutex_enter(&dp->dp_lock);
738260763Savg	if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
739260763Savg		/* XXX writing something we didn't dirty? */
740260763Savg		space = dp->dp_dirty_pertxg[txg & TXG_MASK];
741185029Spjd	}
742260763Savg	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
743260763Savg	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
744260763Savg	ASSERT3U(dp->dp_dirty_total, >=, space);
745260763Savg	dsl_pool_dirty_delta(dp, -space);
746260763Savg	mutex_exit(&dp->dp_lock);
747185029Spjd}
748185029Spjd
/* ARGSUSED */
/*
 * dmu_objset_find_dp() callback used by dsl_pool_upgrade_clones(): ensure
 * the given dataset has an origin.  If no fork point is found in its
 * snapshot chain, attach it to the pool-wide $ORIGIN snapshot, then record
 * it in its origin's ds_next_clones_obj zap.  Runs in syncing context
 * (tx is the syncing transaction, passed via arg).
 */
static int
upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds, *prev = NULL;
	int err;

	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
	if (err)
		return (err);

	/*
	 * Walk backwards through the snapshot chain until we find the
	 * point where this chain was forked off (the previous snapshot's
	 * "next" is not us), or run out of predecessors entirely.
	 */
	while (ds->ds_phys->ds_prev_snap_obj != 0) {
		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
		    FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
			break;
		/* Step back one snapshot; `prev` becomes the new `ds`. */
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
		prev = NULL;
	}

	if (prev == NULL) {
		/*
		 * No fork point found: this dataset has no origin yet,
		 * so attach it to the pool-wide $ORIGIN snapshot.
		 */
		prev = dp->dp_origin_snap;

		/*
		 * The $ORIGIN can't have any data, or the accounting
		 * will be wrong.
		 */
		ASSERT0(prev->ds_phys->ds_bp.blk_birth);

		/* The origin doesn't get attached to itself */
		if (ds->ds_object == prev->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
		ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;

		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;

		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		prev->ds_phys->ds_num_children++;

		if (ds->ds_phys->ds_next_snap_obj == 0) {
			ASSERT(ds->ds_prev == NULL);
			/* Cache the new prev-snapshot hold on ds itself. */
			VERIFY0(dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
		}
	}

	ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object);
	ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object);

	/* Record ds in its origin's next-clones zap (creating it lazily). */
	if (prev->ds_phys->ds_next_clones_obj == 0) {
		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		prev->ds_phys->ds_next_clones_obj =
		    zap_create(dp->dp_meta_objset,
		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
	}
	VERIFY0(zap_add_int(dp->dp_meta_objset,
	    prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	/* dp_origin_snap is a pool-lifetime hold; don't release it here. */
	if (prev != dp->dp_origin_snap)
		dsl_dataset_rele(prev, FTAG);
	return (0);
}
825185029Spjd
826185029Spjdvoid
827185029Spjddsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
828185029Spjd{
829185029Spjd	ASSERT(dmu_tx_is_syncing(tx));
830185029Spjd	ASSERT(dp->dp_origin_snap != NULL);
831185029Spjd
832248571Smm	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
833209962Smm	    tx, DS_FIND_CHILDREN));
834185029Spjd}
835185029Spjd
836219089Spjd/* ARGSUSED */
837219089Spjdstatic int
838248571Smmupgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
839219089Spjd{
840219089Spjd	dmu_tx_t *tx = arg;
841219089Spjd	objset_t *mos = dp->dp_meta_objset;
842219089Spjd
843248571Smm	if (ds->ds_dir->dd_phys->dd_origin_obj != 0) {
844219089Spjd		dsl_dataset_t *origin;
845219089Spjd
846248571Smm		VERIFY0(dsl_dataset_hold_obj(dp,
847219089Spjd		    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
848219089Spjd
849219089Spjd		if (origin->ds_dir->dd_phys->dd_clones == 0) {
850219089Spjd			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
851219089Spjd			origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
852219089Spjd			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
853219089Spjd		}
854219089Spjd
855248571Smm		VERIFY0(zap_add_int(dp->dp_meta_objset,
856248571Smm		    origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx));
857219089Spjd
858219089Spjd		dsl_dataset_rele(origin, FTAG);
859219089Spjd	}
860219089Spjd	return (0);
861219089Spjd}
862219089Spjd
863185029Spjdvoid
864219089Spjddsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
865219089Spjd{
866219089Spjd	ASSERT(dmu_tx_is_syncing(tx));
867219089Spjd	uint64_t obj;
868219089Spjd
869219089Spjd	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
870248571Smm	VERIFY0(dsl_pool_open_special_dir(dp,
871219089Spjd	    FREE_DIR_NAME, &dp->dp_free_dir));
872219089Spjd
873219089Spjd	/*
874219089Spjd	 * We can't use bpobj_alloc(), because spa_version() still
875219089Spjd	 * returns the old version, and we need a new-version bpobj with
876219089Spjd	 * subobj support.  So call dmu_object_alloc() directly.
877219089Spjd	 */
878219089Spjd	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
879219089Spjd	    SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
880248571Smm	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
881219089Spjd	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
882248571Smm	VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
883219089Spjd
884248571Smm	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
885219089Spjd	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
886219089Spjd}
887219089Spjd
/*
 * Create the pool-wide $ORIGIN dir and dataset, snapshot it, and keep a
 * hold on the snapshot in dp->dp_origin_snap for the life of the pool.
 * Must run in syncing context with dp_config_rwlock held as writer.
 */
void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t dsobj;
	dsl_dataset_t *ds;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap == NULL);
	ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));

	/* create the origin dir, ds, & snap-ds */
	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
	    NULL, 0, kcred, tx);
	VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
	/* The hold is tagged with dp itself; it outlives this function. */
	VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
	    dp, &dp->dp_origin_snap));
	dsl_dataset_rele(ds, FTAG);
}
907196307Spjd
/*
 * Return the pool's vnode-release taskq (used to defer vnode releases
 * off the caller's context).
 */
taskq_t *
dsl_pool_vnrele_taskq(dsl_pool_t *dp)
{
	return (dp->dp_vnrele_taskq);
}
913219089Spjd
914219089Spjd/*
915219089Spjd * Walk through the pool-wide zap object of temporary snapshot user holds
916219089Spjd * and release them.
917219089Spjd */
918219089Spjdvoid
919219089Spjddsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
920219089Spjd{
921219089Spjd	zap_attribute_t za;
922219089Spjd	zap_cursor_t zc;
923219089Spjd	objset_t *mos = dp->dp_meta_objset;
924219089Spjd	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
925251646Sdelphij	nvlist_t *holds;
926219089Spjd
927219089Spjd	if (zapobj == 0)
928219089Spjd		return;
929219089Spjd	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
930219089Spjd
931251646Sdelphij	holds = fnvlist_alloc();
932251646Sdelphij
933219089Spjd	for (zap_cursor_init(&zc, mos, zapobj);
934219089Spjd	    zap_cursor_retrieve(&zc, &za) == 0;
935219089Spjd	    zap_cursor_advance(&zc)) {
936219089Spjd		char *htag;
937251646Sdelphij		nvlist_t *tags;
938219089Spjd
939219089Spjd		htag = strchr(za.za_name, '-');
940219089Spjd		*htag = '\0';
941219089Spjd		++htag;
942251646Sdelphij		if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
943251646Sdelphij			tags = fnvlist_alloc();
944251646Sdelphij			fnvlist_add_boolean(tags, htag);
945251646Sdelphij			fnvlist_add_nvlist(holds, za.za_name, tags);
946251646Sdelphij			fnvlist_free(tags);
947251646Sdelphij		} else {
948251646Sdelphij			fnvlist_add_boolean(tags, htag);
949251646Sdelphij		}
950219089Spjd	}
951251646Sdelphij	dsl_dataset_user_release_tmp(dp, holds);
952251646Sdelphij	fnvlist_free(holds);
953219089Spjd	zap_cursor_fini(&zc);
954219089Spjd}
955219089Spjd
956219089Spjd/*
957219089Spjd * Create the pool-wide zap object for storing temporary snapshot holds.
958219089Spjd */
959219089Spjdvoid
960219089Spjddsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
961219089Spjd{
962219089Spjd	objset_t *mos = dp->dp_meta_objset;
963219089Spjd
964219089Spjd	ASSERT(dp->dp_tmp_userrefs_obj == 0);
965219089Spjd	ASSERT(dmu_tx_is_syncing(tx));
966219089Spjd
967236884Smm	dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
968236884Smm	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
969219089Spjd}
970219089Spjd
971219089Spjdstatic int
972219089Spjddsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
973248571Smm    const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
974219089Spjd{
975219089Spjd	objset_t *mos = dp->dp_meta_objset;
976219089Spjd	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
977219089Spjd	char *name;
978219089Spjd	int error;
979219089Spjd
980219089Spjd	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
981219089Spjd	ASSERT(dmu_tx_is_syncing(tx));
982219089Spjd
983219089Spjd	/*
984219089Spjd	 * If the pool was created prior to SPA_VERSION_USERREFS, the
985219089Spjd	 * zap object for temporary holds might not exist yet.
986219089Spjd	 */
987219089Spjd	if (zapobj == 0) {
988219089Spjd		if (holding) {
989219089Spjd			dsl_pool_user_hold_create_obj(dp, tx);
990219089Spjd			zapobj = dp->dp_tmp_userrefs_obj;
991219089Spjd		} else {
992249195Smm			return (SET_ERROR(ENOENT));
993219089Spjd		}
994219089Spjd	}
995219089Spjd
996219089Spjd	name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
997219089Spjd	if (holding)
998248571Smm		error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
999219089Spjd	else
1000219089Spjd		error = zap_remove(mos, zapobj, name, tx);
1001219089Spjd	strfree(name);
1002219089Spjd
1003219089Spjd	return (error);
1004219089Spjd}
1005219089Spjd
/*
 * Add a temporary hold for the given dataset object and tag.
 * `now` is the timestamp recorded with the hold; errors come from
 * zap_add() (e.g. EEXIST if the hold is already present).
 */
int
dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    uint64_t now, dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
}
1015219089Spjd
/*
 * Release a temporary hold for the given dataset object and tag.
 * Returns ENOENT if the tmp-userrefs zap (or the entry) doesn't exist.
 */
int
dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
	    tx, B_FALSE));
}
1026248571Smm
1027248571Smm/*
1028248571Smm * DSL Pool Configuration Lock
1029248571Smm *
1030248571Smm * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
1031248571Smm * creation / destruction / rename / property setting).  It must be held for
1032248571Smm * read to hold a dataset or dsl_dir.  I.e. you must call
1033248571Smm * dsl_pool_config_enter() or dsl_pool_hold() before calling
1034248571Smm * dsl_{dataset,dir}_hold{_obj}.  In most circumstances, the dp_config_rwlock
1035248571Smm * must be held continuously until all datasets and dsl_dirs are released.
1036248571Smm *
1037248571Smm * The only exception to this rule is that if a "long hold" is placed on
1038248571Smm * a dataset, then the dp_config_rwlock may be dropped while the dataset
1039248571Smm * is still held.  The long hold will prevent the dataset from being
1040248571Smm * destroyed -- the destroy will fail with EBUSY.  A long hold can be
1041248571Smm * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
1042248571Smm * (by calling dsl_{dataset,objset}_{try}own{_obj}).
1043248571Smm *
 * Legitimate long-holders (including owners) are long-running, cancelable
 * tasks that are intended to make "zfs destroy" fail.  This includes DMU
1046248571Smm * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
1047248571Smm * "zfs send", and "zfs diff".  There are several other long-holders whose
1048248571Smm * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
1049248571Smm *
1050248571Smm * The usual formula for long-holding would be:
1051248571Smm * dsl_pool_hold()
1052248571Smm * dsl_dataset_hold()
1053248571Smm * ... perform checks ...
1054248571Smm * dsl_dataset_long_hold()
1055248571Smm * dsl_pool_rele()
1056248571Smm * ... perform long-running task ...
1057248571Smm * dsl_dataset_long_rele()
1058248571Smm * dsl_dataset_rele()
1059248571Smm *
1060248571Smm * Note that when the long hold is released, the dataset is still held but
1061248571Smm * the pool is not held.  The dataset may change arbitrarily during this time
1062248571Smm * (e.g. it could be destroyed).  Therefore you shouldn't do anything to the
1063248571Smm * dataset except release it.
1064248571Smm *
1065248571Smm * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
1066248571Smm * or modifying operations.
1067248571Smm *
1068248571Smm * Modifying operations should generally use dsl_sync_task().  The synctask
1069248571Smm * infrastructure enforces proper locking strategy with respect to the
1070248571Smm * dp_config_rwlock.  See the comment above dsl_sync_task() for details.
1071248571Smm *
1072248571Smm * Read-only operations will manually hold the pool, then the dataset, obtain
1073248571Smm * information from the dataset, then release the pool and dataset.
1074248571Smm * dmu_objset_{hold,rele}() are convenience routines that also do the pool
1075248571Smm * hold/rele.
1076248571Smm */
1077248571Smm
1078248571Smmint
1079248571Smmdsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
1080248571Smm{
1081248571Smm	spa_t *spa;
1082248571Smm	int error;
1083248571Smm
1084248571Smm	error = spa_open(name, &spa, tag);
1085248571Smm	if (error == 0) {
1086248571Smm		*dp = spa_get_dsl(spa);
1087248571Smm		dsl_pool_config_enter(*dp, tag);
1088248571Smm	}
1089248571Smm	return (error);
1090248571Smm}
1091248571Smm
1092248571Smmvoid
1093248571Smmdsl_pool_rele(dsl_pool_t *dp, void *tag)
1094248571Smm{
1095248571Smm	dsl_pool_config_exit(dp, tag);
1096248571Smm	spa_close(dp->dp_spa, tag);
1097248571Smm}
1098248571Smm
1099248571Smmvoid
1100248571Smmdsl_pool_config_enter(dsl_pool_t *dp, void *tag)
1101248571Smm{
1102248571Smm	/*
1103248571Smm	 * We use a "reentrant" reader-writer lock, but not reentrantly.
1104248571Smm	 *
1105248571Smm	 * The rrwlock can (with the track_all flag) track all reading threads,
1106248571Smm	 * which is very useful for debugging which code path failed to release
1107248571Smm	 * the lock, and for verifying that the *current* thread does hold
1108248571Smm	 * the lock.
1109248571Smm	 *
1110248571Smm	 * (Unlike a rwlock, which knows that N threads hold it for
1111248571Smm	 * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
1112248571Smm	 * if any thread holds it for read, even if this thread doesn't).
1113248571Smm	 */
1114248571Smm	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
1115248571Smm	rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
1116248571Smm}
1117248571Smm
/*
 * Release the pool config lock taken by dsl_pool_config_enter()
 * (must pass the same tag).
 */
void
dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
{
	rrw_exit(&dp->dp_config_rwlock, tag);
}
1123248571Smm
/*
 * Report whether the pool config lock is held (per RRW_LOCK_HELD
 * semantics); primarily used in ASSERTs.
 */
boolean_t
dsl_pool_config_held(dsl_pool_t *dp)
{
	return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
}
1129