/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 */

#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_scan.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>
#include <sys/bptree.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dsl_userhold.h>

#ifdef __FreeBSD__
#include <sys/sysctl.h>
#include <sys/types.h>
#endif

/*
 * ZFS Write Throttle
 * ------------------
 *
 * ZFS must limit the rate of incoming writes to the rate at which it is able
 * to sync data modifications to the backend storage. Throttling by too much
 * creates an artificial limit; throttling by too little can only be sustained
 * for short periods and would lead to highly lumpy performance. On a per-pool
 * basis, ZFS tracks the amount of modified (dirty) data. As operations change
 * data, the amount of dirty data increases; as ZFS syncs out data, the amount
 * of dirty data decreases. When the amount of dirty data exceeds a
 * predetermined threshold further modifications are blocked until the amount
 * of dirty data decreases (as data is synced out).
 *
 * The limit on dirty data is tunable, and should be adjusted according to
 * both the IO capacity and available memory of the system. The larger the
 * window, the more ZFS is able to aggregate and amortize metadata (and data)
 * changes. However, memory is a limited resource, and allowing for more dirty
 * data comes at the cost of keeping other useful data in memory (for example
 * ZFS data cached by the ARC).
 *
 * Implementation
 *
 * As buffers are modified dsl_pool_willuse_space() increments both the per-
 * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
 * dirty space used; dsl_pool_dirty_space() decrements those values as data
 * is synced out from dsl_pool_sync(). While only the poolwide value is
 * relevant, the per-txg value is useful for debugging. The tunable
 * zfs_dirty_data_max determines the dirty space limit. Once that value is
 * exceeded, new writes are halted until space frees up.
 *
 * The zfs_dirty_data_sync tunable dictates the threshold at which we
 * ensure that there is a txg syncing (see the comment in txg.c for a full
 * description of transaction group stages).
 *
 * The IO scheduler uses both the dirty space limit and current amount of
 * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
 * issues. See the comment in vdev_queue.c for details of the IO scheduler.
 *
 * The delay is also calculated based on the amount of dirty data. See the
 * comment above dmu_tx_delay() for details.
 */

/*
 * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
 * capped at zfs_dirty_data_max_max. It can also be overridden in /etc/system.
 */
uint64_t zfs_dirty_data_max;
uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
int zfs_dirty_data_max_percent = 10;

/*
 * If there is at least this much dirty data, push out a txg.
 */
uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;

/*
 * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
 * and delay each transaction.
 * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
 */
int zfs_delay_min_dirty_percent = 60;

/*
 * This controls how quickly the delay approaches infinity.
 * Larger values cause it to delay more for a given amount of dirty data.
 * Therefore larger values will cause there to be less dirty data for a
 * given throughput.
 *
 * For the smoothest delay, this value should be about 1 billion divided
 * by the maximum number of operations per second. This will smoothly
 * handle between 10x and 1/10th this number.
 *
 * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
 * multiply in dmu_tx_delay().
 */
uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;


#ifdef __FreeBSD__

extern int zfs_vdev_async_write_active_max_dirty_percent;

SYSCTL_DECL(_vfs_zfs);

TUNABLE_QUAD("vfs.zfs.dirty_data_max", &zfs_dirty_data_max);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN,
    &zfs_dirty_data_max, 0,
    "The maximum amount of dirty data in bytes after which new writes are "
    "halted until space becomes available");

TUNABLE_QUAD("vfs.zfs.dirty_data_max_max", &zfs_dirty_data_max_max);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN,
    &zfs_dirty_data_max_max, 0,
    "The absolute cap on dirty_data_max when auto calculating");

TUNABLE_INT("vfs.zfs.dirty_data_max_percent", &zfs_dirty_data_max_percent);
SYSCTL_INT(_vfs_zfs, OID_AUTO, dirty_data_max_percent, CTLFLAG_RDTUN,
    &zfs_dirty_data_max_percent, 0,
    "The percent of physical memory used to auto calculate dirty_data_max");

TUNABLE_QUAD("vfs.zfs.dirty_data_sync", &zfs_dirty_data_sync);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN,
    &zfs_dirty_data_sync, 0,
    "Force a txg if the number of dirty buffer bytes exceed this value");

static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS);
/* No zfs_delay_min_dirty_percent tunable due to limit requirements */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
    sysctl_zfs_delay_min_dirty_percent, "I",
    "The limit of outstanding dirty data before transations are delayed");

static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS);
/* No zfs_delay_scale tunable due to limit requirements */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_zfs_delay_scale, "QU",
    "Controls how quickly the delay approaches infinity");

/*
 * Sysctl handler for vfs.zfs.delay_min_dirty_percent: rejects values
 * below zfs_vdev_async_write_active_max_dirty_percent, per the ordering
 * requirement documented above zfs_delay_min_dirty_percent.
 */
static int
sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS)
{
	int val, err;

	val = zfs_delay_min_dirty_percent;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < zfs_vdev_async_write_active_max_dirty_percent)
		return (EINVAL);

	zfs_delay_min_dirty_percent = val;

	return (0);
}

/*
 * Sysctl handler for vfs.zfs.delay_scale: rejects values that would make
 * zfs_delay_scale * zfs_dirty_data_max overflow 2^64 (see the note above
 * zfs_delay_scale about the multiply in dmu_tx_delay()).
 */
static int
sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_delay_scale;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val > UINT64_MAX / zfs_dirty_data_max)
		return (EINVAL);

	zfs_delay_scale = val;

	return (0);
}
#endif

hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);

/*
 * Look up one of the pool's "special" dsl_dirs (e.g. $MOS, $FREE, $ORIGIN,
 * $LEAK) by name in the root dir's child-dir ZAP and return a hold on it
 * in *ddp.  Returns 0 or an errno from the lookup/hold.
 */
int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
	    name, sizeof (obj), 1, &obj);
	if (err)
		return (err);

	return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
}

/*
 * Allocate and initialize the in-core dsl_pool_t: txg machinery, dirty
 * lists, locks, and the vnode-release taskq.  Does no on-disk I/O.
 */
static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp;
	blkptr_t *bp = spa_get_rootblkptr(spa);

	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	rrw_init(&dp->dp_config_rwlock, B_TRUE);
	txg_init(dp, txg);

	txg_list_create(&dp->dp_dirty_datasets,
	    offsetof(dsl_dataset_t, ds_dirty_link));
	txg_list_create(&dp->dp_dirty_zilogs,
	    offsetof(zilog_t, zl_dirty_link));
	txg_list_create(&dp->dp_dirty_dirs,
	    offsetof(dsl_dir_t, dd_dirty_link));
	txg_list_create(&dp->dp_sync_tasks,
	    offsetof(dsl_sync_task_t, dst_node));

	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);

	dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
	    1, 4, 0);

	return (dp);
}

/*
 * First stage of opening a pool: create the in-core dsl_pool_t and open
 * the MOS (meta-objset) from the root block pointer.  On failure the
 * partially constructed pool is torn down and the error returned.
 */
int
dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);

	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
	    &dp->dp_meta_objset);
	if (err != 0)
		dsl_pool_close(dp);
	else
		*dpp = dp;

	return (err);
}

/*
 * Second stage of opening a pool: look up the root dataset and the
 * special dirs ($MOS, $ORIGIN, $FREE, $LEAK), open the free bpobj and
 * feature-dependent objects, and initialize the scan (scrub/resilver)
 * state.  Holds taken here are dropped in dsl_pool_close().
 */
int
dsl_pool_open(dsl_pool_t *dp)
{
	int err;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
		goto out;

	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
		if (err)
			goto out;
		err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
		    FTAG, &ds);
		if (err == 0) {
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, dp,
			    &dp->dp_origin_snap);
			dsl_dataset_rele(ds, FTAG);
		}
		dsl_dir_rele(dd, dp);
		if (err)
			goto out;
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
		    &dp->dp_free_dir);
		if (err)
			goto out;

		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
		if (err)
			goto out;
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	/*
	 * Note: errors ignored, because the leak dir will not exist if we
	 * have not encountered a leak yet.
	 */
	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
	    &dp->dp_leak_dir);

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
		    &dp->dp_bptree_obj);
		if (err != 0)
			goto out;
	}

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
		    &dp->dp_empty_bpobj);
		if (err != 0)
			goto out;
	}

	/* ENOENT is expected here: temporary user holds may not exist yet. */
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
	    &dp->dp_tmp_userrefs_obj);
	if (err == ENOENT)
		err = 0;
	if (err)
		goto out;

	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);

out:
	rrw_exit(&dp->dp_config_rwlock, FTAG);
	return (err);
}

/*
 * Tear down a dsl_pool_t: drop the holds taken in dsl_pool_open(), evict
 * the MOS, destroy the txg lists and locks, and free the structure.
 * Safe to call on a partially opened pool (every teardown is guarded).
 */
void
dsl_pool_close(dsl_pool_t *dp)
{
	/*
	 * Drop our references from dsl_pool_open().
	 *
	 * Since we held the origin_snap from "syncing" context (which
	 * includes pool-opening context), it actually only got a "ref"
	 * and not a hold, so just drop that here.
	 */
	if (dp->dp_origin_snap)
		dsl_dataset_rele(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir)
		dsl_dir_rele(dp->dp_mos_dir, dp);
	if (dp->dp_free_dir)
		dsl_dir_rele(dp->dp_free_dir, dp);
	if (dp->dp_leak_dir)
		dsl_dir_rele(dp->dp_leak_dir, dp);
	if (dp->dp_root_dir)
		dsl_dir_rele(dp->dp_root_dir, dp);

	bpobj_close(&dp->dp_free_bpobj);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	if (dp->dp_meta_objset)
		dmu_objset_evict(dp->dp_meta_objset);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_zilogs);
	txg_list_destroy(&dp->dp_sync_tasks);
	txg_list_destroy(&dp->dp_dirty_dirs);

	arc_flush(dp->dp_spa);
	txg_fini(dp);
	dsl_scan_fini(dp);
	rrw_destroy(&dp->dp_config_rwlock);
	/*
	 * NOTE(review): dp_spaceavail_cv is cv_init()ed in
	 * dsl_pool_open_impl() but no cv_destroy() appears here --
	 * confirm against upstream whether that is intentional.
	 */
	mutex_destroy(&dp->dp_lock);
	taskq_destroy(dp->dp_vnrele_taskq);
	if (dp->dp_blkstats)
		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
	kmem_free(dp, sizeof (dsl_pool_t));
}

/*
 * Create a brand-new pool in txg `txg`: the MOS, the pool directory, the
 * root/$MOS/$FREE dsl_dirs, the $ORIGIN snapshot (version permitting),
 * and the root dataset/objset.  All of this happens inside a single
 * assigned tx that is committed before returning the opened pool.
 */
dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
	objset_t *os;
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	/* create and open the MOS (meta-objset) */
	dp->dp_meta_objset = dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT0(err);

	/* Initialize scan structures */
	VERIFY0(dsl_scan_init(dp, txg));

	/* create and open the root dir */
	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
	VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    MOS_DIR_NAME, &dp->dp_mos_dir));

	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		/* create and open the free dir */
		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
		    FREE_DIR_NAME, tx);
		VERIFY0(dsl_pool_open_special_dir(dp,
		    FREE_DIR_NAME, &dp->dp_free_dir));

		/* create and open the free_bplist */
		obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
		dsl_pool_create_origin(dp, tx);

	/* create the root dataset */
	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);

	/* create the root objset */
	VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
	os = dmu_objset_create_impl(dp->dp_spa, ds,
	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
#ifdef _KERNEL
	zfs_create_fs(os, kcred, zplprops, tx);
#endif
	dsl_dataset_rele(ds, FTAG);

	dmu_tx_commit(tx);

	rrw_exit(&dp->dp_config_rwlock, FTAG);

	return (dp);
}

/*
 * Account for the meta-objset space in its placeholder dsl_dir.
 */
void
dsl_pool_mos_diduse_space(dsl_pool_t *dp,
    int64_t used, int64_t comp, int64_t uncomp)
{
	ASSERT3U(comp, ==, uncomp);	/* it's all metadata */
	mutex_enter(&dp->dp_lock);
	dp->dp_mos_used_delta += used;
	dp->dp_mos_compressed_delta += comp;
	dp->dp_mos_uncompressed_delta += uncomp;
	mutex_exit(&dp->dp_lock);
}

/*
 * bplist_iterate() callback: move one block pointer from a dataset's
 * pending deadlist onto its on-disk deadlist.
 */
static int
deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_deadlist_t *dl = arg;
	dsl_deadlist_insert(dl, bp, tx);
	return (0);
}

/*
 * Sync the MOS out to disk and record its new root block pointer in the
 * spa, so the uberblock can point at this txg's MOS.
 */
static void
dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
{
	zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	dmu_objset_sync(dp->dp_meta_objset, zio, tx);
	VERIFY0(zio_wait(zio));
	dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
	spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
}

/*
 * Apply a (possibly negative) delta to the poolwide dirty-data total,
 * waking a throttled writer when the total drops to/below the limit.
 * Caller must hold dp_lock.
 */
static void
dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
{
	ASSERT(MUTEX_HELD(&dp->dp_lock));

	if (delta < 0)
		ASSERT3U(-delta, <=, dp->dp_dirty_total);

	dp->dp_dirty_total += delta;

	/*
	 * Note: we signal even when increasing dp_dirty_total.
	 * This ensures forward progress -- each thread wakes the next waiter.
	 */
	if (dp->dp_dirty_total <= zfs_dirty_data_max)
		cv_signal(&dp->dp_spaceavail_cv);
}

/*
 * Sync out all dirty state for the given txg: dirty datasets (two passes,
 * the second for user/group quota updates), deadlists, dirty dsl_dirs,
 * deferred MOS space deltas, the MOS itself, and finally any sync tasks.
 * Called from the txg sync thread.
 */
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	zio_t *zio;
	dmu_tx_t *tx;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	objset_t *mos = dp->dp_meta_objset;
	list_t synced_datasets;

	list_create(&synced_datasets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Write out all dirty blocks of dirty datasets.
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		/*
		 * We must not sync any non-MOS datasets twice, because
		 * we may have taken a snapshot of them.  However, we
		 * may sync newly-created datasets on pass 2.
		 */
		ASSERT(!list_link_active(&ds->ds_synced_link));
		list_insert_tail(&synced_datasets, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	VERIFY0(zio_wait(zio));

	/*
	 * We have written all of the accounted dirty data, so our
	 * dp_space_towrite should now be zero.  However, some seldom-used
	 * code paths do not adhere to this (e.g. dbuf_undirty(), also
	 * rounding error in dbuf_write_physdone).
	 * Shore up the accounting of any dirtied space now.
	 */
	dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);

	/*
	 * After the data blocks have been written (ensured by the zio_wait()
	 * above), update the user/group space accounting.
	 */
	for (ds = list_head(&synced_datasets); ds != NULL;
	    ds = list_next(&synced_datasets, ds)) {
		dmu_objset_do_userquota_updates(ds->ds_objset, tx);
	}

	/*
	 * Sync the datasets again to push out the changes due to
	 * userspace updates.  This must be done before we process the
	 * sync tasks, so that any snapshots will have the correct
	 * user accounting information (and we won't get confused
	 * about which blocks are part of the snapshot).
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		ASSERT(list_link_active(&ds->ds_synced_link));
		dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	VERIFY0(zio_wait(zio));

	/*
	 * Now that the datasets have been completely synced, we can
	 * clean up our in-memory structures accumulated while syncing:
	 *
	 *  - move dead blocks from the pending deadlist to the on-disk deadlist
	 *  - release hold from dsl_dataset_dirty()
	 */
	while ((ds = list_remove_head(&synced_datasets)) != NULL) {
		objset_t *os = ds->ds_objset;
		bplist_iterate(&ds->ds_pending_deadlist,
		    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
		ASSERT(!dmu_objset_is_dirty(os, txg));
		dmu_buf_rele(ds->ds_dbuf, ds);
	}
	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
		dsl_dir_sync(dd, tx);
	}

	/*
	 * The MOS's space is accounted for in the pool/$MOS
	 * (dp_mos_dir).  We can't modify the mos while we're syncing
	 * it, so we remember the deltas and apply them here.
	 */
	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
	    dp->dp_mos_uncompressed_delta != 0) {
		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
		    dp->dp_mos_used_delta,
		    dp->dp_mos_compressed_delta,
		    dp->dp_mos_uncompressed_delta, tx);
		dp->dp_mos_used_delta = 0;
		dp->dp_mos_compressed_delta = 0;
		dp->dp_mos_uncompressed_delta = 0;
	}

	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
		dsl_pool_sync_mos(dp, tx);
	}

	/*
	 * If we modify a dataset in the same txg that we want to destroy it,
	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
	 * and clearing the hold on it) before we process the sync_tasks.
	 * The MOS data dirtied by the sync_tasks will be synced on the next
	 * pass.
	 */
	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
		dsl_sync_task_t *dst;
		/*
		 * No more sync tasks should have been added while we
		 * were syncing.
		 */
		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
		while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
			dsl_sync_task_sync(dst, tx);
	}

	dmu_tx_commit(tx);

	DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
}

/*
 * Post-sync cleanup for a txg: clean each dirty zilog and drop the hold
 * its dataset's dbuf took when the zilog was dirtied.
 */
void
dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
{
	zilog_t *zilog;

	while (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) {
		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
		zil_clean(zilog, txg);
		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
		dmu_buf_rele(ds->ds_dbuf, zilog);
	}
	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}

/*
 * TRUE if the current thread is the tx_sync_thread or if we
 * are being called from SPA context during pool initialization.
 */
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
	return (curthread == dp->dp_tx.tx_sync_thread ||
	    spa_is_initializing(dp->dp_spa));
}

/*
 * Return the pool's usable space: deflated space minus the slop
 * reservation (halved for frees, so a full pool can still free space).
 */
uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
{
	uint64_t space, resv;

	/*
	 * If we're trying to assess whether it's OK to do a free,
	 * cut the reservation in half to allow forward progress
	 * (e.g. make it possible to rm(1) files from a full pool).
	 */
	space = spa_get_dspace(dp->dp_spa);
	resv = spa_get_slop_space(dp->dp_spa);
	if (netfree)
		resv >>= 1;

	return (space - resv);
}

/*
 * Return B_TRUE if the write throttle should delay new transactions,
 * i.e. the poolwide dirty total has crossed the delay threshold.  Also
 * kicks off a txg sync if the dirty total exceeds zfs_dirty_data_sync.
 */
boolean_t
dsl_pool_need_dirty_delay(dsl_pool_t *dp)
{
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	boolean_t rv;

	mutex_enter(&dp->dp_lock);
	if (dp->dp_dirty_total > zfs_dirty_data_sync)
		txg_kick(dp);
	rv = (dp->dp_dirty_total > delay_min_bytes);
	mutex_exit(&dp->dp_lock);
	return (rv);
}

/*
 * Add newly dirtied space to the per-txg and poolwide dirty accounting.
 */
void
dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	if (space > 0) {
		mutex_enter(&dp->dp_lock);
		dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
		dsl_pool_dirty_delta(dp, space);
		mutex_exit(&dp->dp_lock);
	}
}

/*
 * Remove synced-out space from the per-txg and poolwide dirty
 * accounting, clamping to what was actually recorded for the txg.
 */
void
dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
{
	ASSERT3S(space, >=, 0);
	if (space == 0)
		return;
	mutex_enter(&dp->dp_lock);
	if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
		/* XXX writing something we didn't dirty? */
		space = dp->dp_dirty_pertxg[txg & TXG_MASK];
	}
	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
	ASSERT3U(dp->dp_dirty_total, >=, space);
	dsl_pool_dirty_delta(dp, -space);
	mutex_exit(&dp->dp_lock);
}

/*
 * dmu_objset_find_dp() callback used by dsl_pool_upgrade_clones():
 * walk back through a head dataset's snapshots to find its origin and
 * wire up the prev-snap / origin / next-clones linkage that pre-ORIGIN
 * pools lack.
 */
/* ARGSUSED */
static int
upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds, *prev = NULL;
	int err;

	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
	if (err)
		return (err);

	while (ds->ds_phys->ds_prev_snap_obj != 0) {
		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
		    FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
			break;
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
		prev = NULL;
	}

	if (prev == NULL) {
		prev = dp->dp_origin_snap;

		/*
		 * The $ORIGIN can't have any data, or the accounting
		 * will be wrong.
		 */
		ASSERT0(prev->ds_phys->ds_bp.blk_birth);

		/* The origin doesn't get attached to itself */
		if (ds->ds_object == prev->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
		ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;

		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;

		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		prev->ds_phys->ds_num_children++;

		if (ds->ds_phys->ds_next_snap_obj == 0) {
			ASSERT(ds->ds_prev == NULL);
			VERIFY0(dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
		}
	}

	ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object);
	ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object);

	if (prev->ds_phys->ds_next_clones_obj == 0) {
		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		prev->ds_phys->ds_next_clones_obj =
		    zap_create(dp->dp_meta_objset,
		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
	}
	VERIFY0(zap_add_int(dp->dp_meta_objset,
	    prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	if (prev != dp->dp_origin_snap)
		dsl_dataset_rele(prev, FTAG);
	return (0);
}

/*
 * Upgrade every dataset in the pool to have origin/clone linkage
 * (SPA_VERSION_ORIGIN upgrade path).  Runs in syncing context.
 */
void
dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap != NULL);

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
	    tx, DS_FIND_CHILDREN));
}

/*
 * dmu_objset_find_dp() callback used by dsl_pool_upgrade_dir_clones():
 * record each clone in its origin's dd_clones ZAP, creating that ZAP
 * on first use.
 */
/* ARGSUSED */
static int
upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	dmu_tx_t *tx = arg;
	objset_t *mos = dp->dp_meta_objset;

	if (ds->ds_dir->dd_phys->dd_origin_obj != 0) {
		dsl_dataset_t *origin;

		VERIFY0(dsl_dataset_hold_obj(dp,
		    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));

		if (origin->ds_dir->dd_phys->dd_clones == 0) {
			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
			origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
		}

		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx));

		dsl_dataset_rele(origin, FTAG);
	}
	return (0);
}

void
dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	uint64_t obj;

	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    FREE_DIR_NAME, &dp->dp_free_dir));

	/*
	 * We can't use bpobj_alloc(), because spa_version() still
	 * returns the old version, and we need a new-version bpobj with
	 * subobj support.  So call dmu_object_alloc() directly.
877219089Spjd */ 878219089Spjd obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, 879219089Spjd SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); 880248571Smm VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 881219089Spjd DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); 882248571Smm VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); 883219089Spjd 884248571Smm VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 885219089Spjd upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN)); 886219089Spjd} 887219089Spjd 888219089Spjdvoid 889185029Spjddsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) 890185029Spjd{ 891185029Spjd uint64_t dsobj; 892185029Spjd dsl_dataset_t *ds; 893185029Spjd 894185029Spjd ASSERT(dmu_tx_is_syncing(tx)); 895185029Spjd ASSERT(dp->dp_origin_snap == NULL); 896248571Smm ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); 897185029Spjd 898185029Spjd /* create the origin dir, ds, & snap-ds */ 899185029Spjd dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, 900185029Spjd NULL, 0, kcred, tx); 901248571Smm VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 902248571Smm dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); 903248571Smm VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, 904185029Spjd dp, &dp->dp_origin_snap)); 905185029Spjd dsl_dataset_rele(ds, FTAG); 906185029Spjd} 907196307Spjd 908196307Spjdtaskq_t * 909196307Spjddsl_pool_vnrele_taskq(dsl_pool_t *dp) 910196307Spjd{ 911196307Spjd return (dp->dp_vnrele_taskq); 912196307Spjd} 913219089Spjd 914219089Spjd/* 915219089Spjd * Walk through the pool-wide zap object of temporary snapshot user holds 916219089Spjd * and release them. 
917219089Spjd */ 918219089Spjdvoid 919219089Spjddsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) 920219089Spjd{ 921219089Spjd zap_attribute_t za; 922219089Spjd zap_cursor_t zc; 923219089Spjd objset_t *mos = dp->dp_meta_objset; 924219089Spjd uint64_t zapobj = dp->dp_tmp_userrefs_obj; 925251646Sdelphij nvlist_t *holds; 926219089Spjd 927219089Spjd if (zapobj == 0) 928219089Spjd return; 929219089Spjd ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 930219089Spjd 931251646Sdelphij holds = fnvlist_alloc(); 932251646Sdelphij 933219089Spjd for (zap_cursor_init(&zc, mos, zapobj); 934219089Spjd zap_cursor_retrieve(&zc, &za) == 0; 935219089Spjd zap_cursor_advance(&zc)) { 936219089Spjd char *htag; 937251646Sdelphij nvlist_t *tags; 938219089Spjd 939219089Spjd htag = strchr(za.za_name, '-'); 940219089Spjd *htag = '\0'; 941219089Spjd ++htag; 942251646Sdelphij if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) { 943251646Sdelphij tags = fnvlist_alloc(); 944251646Sdelphij fnvlist_add_boolean(tags, htag); 945251646Sdelphij fnvlist_add_nvlist(holds, za.za_name, tags); 946251646Sdelphij fnvlist_free(tags); 947251646Sdelphij } else { 948251646Sdelphij fnvlist_add_boolean(tags, htag); 949251646Sdelphij } 950219089Spjd } 951251646Sdelphij dsl_dataset_user_release_tmp(dp, holds); 952251646Sdelphij fnvlist_free(holds); 953219089Spjd zap_cursor_fini(&zc); 954219089Spjd} 955219089Spjd 956219089Spjd/* 957219089Spjd * Create the pool-wide zap object for storing temporary snapshot holds. 
958219089Spjd */ 959219089Spjdvoid 960219089Spjddsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) 961219089Spjd{ 962219089Spjd objset_t *mos = dp->dp_meta_objset; 963219089Spjd 964219089Spjd ASSERT(dp->dp_tmp_userrefs_obj == 0); 965219089Spjd ASSERT(dmu_tx_is_syncing(tx)); 966219089Spjd 967236884Smm dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, 968236884Smm DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); 969219089Spjd} 970219089Spjd 971219089Spjdstatic int 972219089Spjddsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, 973248571Smm const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) 974219089Spjd{ 975219089Spjd objset_t *mos = dp->dp_meta_objset; 976219089Spjd uint64_t zapobj = dp->dp_tmp_userrefs_obj; 977219089Spjd char *name; 978219089Spjd int error; 979219089Spjd 980219089Spjd ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 981219089Spjd ASSERT(dmu_tx_is_syncing(tx)); 982219089Spjd 983219089Spjd /* 984219089Spjd * If the pool was created prior to SPA_VERSION_USERREFS, the 985219089Spjd * zap object for temporary holds might not exist yet. 986219089Spjd */ 987219089Spjd if (zapobj == 0) { 988219089Spjd if (holding) { 989219089Spjd dsl_pool_user_hold_create_obj(dp, tx); 990219089Spjd zapobj = dp->dp_tmp_userrefs_obj; 991219089Spjd } else { 992249195Smm return (SET_ERROR(ENOENT)); 993219089Spjd } 994219089Spjd } 995219089Spjd 996219089Spjd name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); 997219089Spjd if (holding) 998248571Smm error = zap_add(mos, zapobj, name, 8, 1, &now, tx); 999219089Spjd else 1000219089Spjd error = zap_remove(mos, zapobj, name, tx); 1001219089Spjd strfree(name); 1002219089Spjd 1003219089Spjd return (error); 1004219089Spjd} 1005219089Spjd 1006219089Spjd/* 1007219089Spjd * Add a temporary hold for the given dataset object and tag. 
1008219089Spjd */ 1009219089Spjdint 1010219089Spjddsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, 1011248571Smm uint64_t now, dmu_tx_t *tx) 1012219089Spjd{ 1013219089Spjd return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); 1014219089Spjd} 1015219089Spjd 1016219089Spjd/* 1017219089Spjd * Release a temporary hold for the given dataset object and tag. 1018219089Spjd */ 1019219089Spjdint 1020219089Spjddsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, 1021219089Spjd dmu_tx_t *tx) 1022219089Spjd{ 1023248571Smm return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, 1024219089Spjd tx, B_FALSE)); 1025219089Spjd} 1026248571Smm 1027248571Smm/* 1028248571Smm * DSL Pool Configuration Lock 1029248571Smm * 1030248571Smm * The dp_config_rwlock protects against changes to DSL state (e.g. dataset 1031248571Smm * creation / destruction / rename / property setting). It must be held for 1032248571Smm * read to hold a dataset or dsl_dir. I.e. you must call 1033248571Smm * dsl_pool_config_enter() or dsl_pool_hold() before calling 1034248571Smm * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock 1035248571Smm * must be held continuously until all datasets and dsl_dirs are released. 1036248571Smm * 1037248571Smm * The only exception to this rule is that if a "long hold" is placed on 1038248571Smm * a dataset, then the dp_config_rwlock may be dropped while the dataset 1039248571Smm * is still held. The long hold will prevent the dataset from being 1040248571Smm * destroyed -- the destroy will fail with EBUSY. A long hold can be 1041248571Smm * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset 1042248571Smm * (by calling dsl_{dataset,objset}_{try}own{_obj}). 1043248571Smm * 1044248571Smm * Legitimate long-holders (including owners) should be long-running, cancelable 1045248571Smm * tasks that should cause "zfs destroy" to fail. This includes DMU 1046248571Smm * consumers (i.e. 
a ZPL filesystem being mounted or ZVOL being open), 1047248571Smm * "zfs send", and "zfs diff". There are several other long-holders whose 1048248571Smm * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). 1049248571Smm * 1050248571Smm * The usual formula for long-holding would be: 1051248571Smm * dsl_pool_hold() 1052248571Smm * dsl_dataset_hold() 1053248571Smm * ... perform checks ... 1054248571Smm * dsl_dataset_long_hold() 1055248571Smm * dsl_pool_rele() 1056248571Smm * ... perform long-running task ... 1057248571Smm * dsl_dataset_long_rele() 1058248571Smm * dsl_dataset_rele() 1059248571Smm * 1060248571Smm * Note that when the long hold is released, the dataset is still held but 1061248571Smm * the pool is not held. The dataset may change arbitrarily during this time 1062248571Smm * (e.g. it could be destroyed). Therefore you shouldn't do anything to the 1063248571Smm * dataset except release it. 1064248571Smm * 1065248571Smm * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only 1066248571Smm * or modifying operations. 1067248571Smm * 1068248571Smm * Modifying operations should generally use dsl_sync_task(). The synctask 1069248571Smm * infrastructure enforces proper locking strategy with respect to the 1070248571Smm * dp_config_rwlock. See the comment above dsl_sync_task() for details. 1071248571Smm * 1072248571Smm * Read-only operations will manually hold the pool, then the dataset, obtain 1073248571Smm * information from the dataset, then release the pool and dataset. 1074248571Smm * dmu_objset_{hold,rele}() are convenience routines that also do the pool 1075248571Smm * hold/rele. 
1076248571Smm */ 1077248571Smm 1078248571Smmint 1079248571Smmdsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) 1080248571Smm{ 1081248571Smm spa_t *spa; 1082248571Smm int error; 1083248571Smm 1084248571Smm error = spa_open(name, &spa, tag); 1085248571Smm if (error == 0) { 1086248571Smm *dp = spa_get_dsl(spa); 1087248571Smm dsl_pool_config_enter(*dp, tag); 1088248571Smm } 1089248571Smm return (error); 1090248571Smm} 1091248571Smm 1092248571Smmvoid 1093248571Smmdsl_pool_rele(dsl_pool_t *dp, void *tag) 1094248571Smm{ 1095248571Smm dsl_pool_config_exit(dp, tag); 1096248571Smm spa_close(dp->dp_spa, tag); 1097248571Smm} 1098248571Smm 1099248571Smmvoid 1100248571Smmdsl_pool_config_enter(dsl_pool_t *dp, void *tag) 1101248571Smm{ 1102248571Smm /* 1103248571Smm * We use a "reentrant" reader-writer lock, but not reentrantly. 1104248571Smm * 1105248571Smm * The rrwlock can (with the track_all flag) track all reading threads, 1106248571Smm * which is very useful for debugging which code path failed to release 1107248571Smm * the lock, and for verifying that the *current* thread does hold 1108248571Smm * the lock. 1109248571Smm * 1110248571Smm * (Unlike a rwlock, which knows that N threads hold it for 1111248571Smm * read, but not *which* threads, so rw_held(RW_READER) returns TRUE 1112248571Smm * if any thread holds it for read, even if this thread doesn't). 1113248571Smm */ 1114248571Smm ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); 1115248571Smm rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); 1116248571Smm} 1117248571Smm 1118248571Smmvoid 1119248571Smmdsl_pool_config_exit(dsl_pool_t *dp, void *tag) 1120248571Smm{ 1121248571Smm rrw_exit(&dp->dp_config_rwlock, tag); 1122248571Smm} 1123248571Smm 1124248571Smmboolean_t 1125248571Smmdsl_pool_config_held(dsl_pool_t *dp) 1126248571Smm{ 1127248571Smm return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); 1128248571Smm} 1129