/* dsl_pool.c revision 239774 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
2426159Sse */ 2526159Sse 2650477Speter#include <sys/dsl_pool.h> 2726159Sse#include <sys/dsl_dataset.h> 2826159Sse#include <sys/dsl_prop.h> 296100Sse#include <sys/dsl_dir.h> 3039231Sgibbs#include <sys/dsl_synctask.h> 31165217Sjhb#include <sys/dsl_scan.h> 3239231Sgibbs#include <sys/dnode.h> 3339231Sgibbs#include <sys/dmu_tx.h> 3439231Sgibbs#include <sys/dmu_objset.h> 3526159Sse#include <sys/arc.h> 36165217Sjhb#include <sys/zap.h> 37165217Sjhb#include <sys/zio.h> 38165217Sjhb#include <sys/zfs_context.h> 396100Sse#include <sys/fs/zfs.h> 40163805Simp#include <sys/zfs_znode.h> 416100Sse#include <sys/spa_impl.h> 42120063Sscottl#include <sys/dsl_deadlist.h> 43120063Sscottl#include <sys/bptree.h> 44120063Sscottl#include <sys/zfeature.h> 45214122Sjkim#include <sys/zil_impl.h> 46214122Sjkim 47214122Sjkimint zfs_no_write_throttle = 0; 48120063Sscottlint zfs_write_limit_shift = 3; /* 1/8th of physical memory */ 49220195Sjhbint zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */ 50220195Sjhb 51220195Sjhbuint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ 52220195Sjhbuint64_t zfs_write_limit_max = 0; /* max data payload per txg */ 53220195Sjhbuint64_t zfs_write_limit_inflated = 0; 54220195Sjhbuint64_t zfs_write_limit_override = 0; 55220195Sjhb 56220195Sjhbkmutex_t zfs_write_limit_lock; 57163163Sjmg 58163163Sjmgstatic pgcnt_t old_physmem = 0; 59163163Sjmg 60163163SjmgSYSCTL_DECL(_vfs_zfs); 61163163SjmgTUNABLE_INT("vfs.zfs.no_write_throttle", &zfs_no_write_throttle); 62163163SjmgSYSCTL_INT(_vfs_zfs, OID_AUTO, no_write_throttle, CTLFLAG_RDTUN, 63163163Sjmg &zfs_no_write_throttle, 0, ""); 64163163SjmgTUNABLE_INT("vfs.zfs.write_limit_shift", &zfs_write_limit_shift); 65163163SjmgSYSCTL_INT(_vfs_zfs, OID_AUTO, write_limit_shift, CTLFLAG_RDTUN, 66163163Sjmg &zfs_write_limit_shift, 0, "2^N of physical memory"); 67163163SjmgSYSCTL_DECL(_vfs_zfs_txg); 68163163SjmgTUNABLE_INT("vfs.zfs.txg.synctime_ms", &zfs_txg_synctime_ms); 
69163163SjmgSYSCTL_INT(_vfs_zfs_txg, OID_AUTO, synctime_ms, CTLFLAG_RDTUN, 70163163Sjmg &zfs_txg_synctime_ms, 0, "Target milliseconds to sync a txg"); 71167909Sjhb 72163163SjmgTUNABLE_QUAD("vfs.zfs.write_limit_min", &zfs_write_limit_min); 73163163SjmgSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_min, CTLFLAG_RDTUN, 74163163Sjmg &zfs_write_limit_min, 0, "Minimum write limit"); 75163163SjmgTUNABLE_QUAD("vfs.zfs.write_limit_max", &zfs_write_limit_max); 76163163SjmgSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_max, CTLFLAG_RDTUN, 77163163Sjmg &zfs_write_limit_max, 0, "Maximum data payload per txg"); 78163163SjmgTUNABLE_QUAD("vfs.zfs.write_limit_inflated", &zfs_write_limit_inflated); 79120063SscottlSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_inflated, CTLFLAG_RDTUN, 80120063Sscottl &zfs_write_limit_inflated, 0, ""); 81120063SscottlTUNABLE_QUAD("vfs.zfs.write_limit_override", &zfs_write_limit_override); 82164264SjhbSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_override, CTLFLAG_RDTUN, 83120063Sscottl &zfs_write_limit_override, 0, ""); 84164264Sjhb 85164264Sjhbint 86164264Sjhbdsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) 87169221Sjhb{ 88120063Sscottl uint64_t obj; 89120063Sscottl int err; 90164282Sjhb 91169221Sjhb err = zap_lookup(dp->dp_meta_objset, 92169221Sjhb dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, 93169221Sjhb name, sizeof (obj), 1, &obj); 94169221Sjhb if (err) 95169221Sjhb return (err); 96169221Sjhb 97169221Sjhb return (dsl_dir_open_obj(dp, obj, name, dp, ddp)); 98169221Sjhb} 99169221Sjhb 100169221Sjhbstatic dsl_pool_t * 101169221Sjhbdsl_pool_open_impl(spa_t *spa, uint64_t txg) 102164264Sjhb{ 103164264Sjhb dsl_pool_t *dp; 104169221Sjhb blkptr_t *bp = spa_get_rootblkptr(spa); 105164282Sjhb 106164264Sjhb dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); 107164264Sjhb dp->dp_spa = spa; 108164264Sjhb dp->dp_meta_rootbp = *bp; 109164264Sjhb rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL); 110169221Sjhb dp->dp_write_limit = 
zfs_write_limit_min; 111169221Sjhb txg_init(dp, txg); 112169221Sjhb 113169221Sjhb txg_list_create(&dp->dp_dirty_datasets, 114164264Sjhb offsetof(dsl_dataset_t, ds_dirty_link)); 115164264Sjhb txg_list_create(&dp->dp_dirty_zilogs, 116164264Sjhb offsetof(zilog_t, zl_dirty_link)); 117164264Sjhb txg_list_create(&dp->dp_dirty_dirs, 118180753Sluoqi offsetof(dsl_dir_t, dd_dirty_link)); 119180753Sluoqi txg_list_create(&dp->dp_sync_tasks, 120219737Sjhb offsetof(dsl_sync_task_group_t, dstg_node)); 121180753Sluoqi 122180753Sluoqi mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); 123180753Sluoqi 124180753Sluoqi dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, 125180753Sluoqi 1, 4, 0); 12626159Sse 12726159Sse return (dp); 12845720Speter} 1296100Sse 130220195Sjhbint 131128019Simpdsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) 132119266Simp{ 133119266Simp int err; 134119266Simp dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); 135119266Simp 1366100Sse err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, 137119266Simp &dp->dp_meta_objset); 138119266Simp if (err != 0) 1397233Sse dsl_pool_close(dp); 140119266Simp else 141119266Simp *dpp = dp; 142119266Simp 143119266Simp return (err); 1446100Sse} 145119266Simp 146119266Simpint 147119266Simpdsl_pool_open(dsl_pool_t *dp) 148119266Simp{ 1496100Sse int err; 150119266Simp dsl_dir_t *dd; 151119266Simp dsl_dataset_t *ds; 152119266Simp uint64_t obj; 1536100Sse 154119266Simp rw_enter(&dp->dp_config_rwlock, RW_WRITER); 155119266Simp err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1567233Sse DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, 157172394Smarius &dp->dp_root_dir_obj); 158119266Simp if (err) 159119266Simp goto out; 160119266Simp 1617233Sse err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, 162193256Sjhb NULL, dp, &dp->dp_root_dir); 163193256Sjhb if (err) 164193256Sjhb goto out; 165193256Sjhb 166180753Sluoqi err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); 16726159Sse if 
(err) 1686100Sse goto out; 16926159Sse 1706100Sse if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { 171165217Sjhb err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); 172165217Sjhb if (err) 173165217Sjhb goto out; 174165217Sjhb err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, 1757233Sse FTAG, &ds); 17626159Sse if (err == 0) { 17726159Sse err = dsl_dataset_hold_obj(dp, 17826159Sse ds->ds_phys->ds_prev_snap_obj, dp, 179119266Simp &dp->dp_origin_snap); 180119266Simp dsl_dataset_rele(ds, FTAG); 181119266Simp } 182119266Simp dsl_dir_close(dd, dp); 183119266Simp if (err) 184119266Simp goto out; 185119266Simp } 18626159Sse 1876100Sse if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { 18826159Sse err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, 18926159Sse &dp->dp_free_dir); 19026159Sse if (err) 191119266Simp goto out; 192119266Simp 193119266Simp err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 194119266Simp DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); 195119266Simp if (err) 196119266Simp goto out; 197119266Simp VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, 198119266Simp dp->dp_meta_objset, obj)); 199119266Simp } 200119266Simp 201119266Simp if (spa_feature_is_active(dp->dp_spa, 202119266Simp &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { 20326159Sse err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 20426159Sse DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, 205119266Simp &dp->dp_bptree_obj); 20639231Sgibbs if (err != 0) 20761047Speter goto out; 20861047Speter } 20961047Speter 21061047Speter if (spa_feature_is_active(dp->dp_spa, 21161047Speter &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) { 21261047Speter err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 21361047Speter DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, 21461047Speter &dp->dp_empty_bpobj); 21561047Speter if (err != 0) 21639231Sgibbs goto out; 21745720Speter } 21845720Speter 21945720Speter err = zap_lookup(dp->dp_meta_objset, 
DMU_POOL_DIRECTORY_OBJECT, 22045720Speter DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, 22145720Speter &dp->dp_tmp_userrefs_obj); 22269953Smsmith if (err == ENOENT) 22369953Smsmith err = 0; 22469953Smsmith if (err) 22569953Smsmith goto out; 22669953Smsmith 22769953Smsmith err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); 22869953Smsmith 22969953Smsmithout: 23069953Smsmith rw_exit(&dp->dp_config_rwlock); 23169953Smsmith return (err); 23269953Smsmith} 233172394Smarius 23469953Smsmithvoid 23569953Smsmithdsl_pool_close(dsl_pool_t *dp) 23669953Smsmith{ 237107300Simp /* drop our references from dsl_pool_open() */ 238149972Simp 239149972Simp /* 240149972Simp * Since we held the origin_snap from "syncing" context (which 241149972Simp * includes pool-opening context), it actually only got a "ref" 242178161Sphk * and not a hold, so just drop that here. 24345720Speter */ 24445720Speter if (dp->dp_origin_snap) 24545720Speter dsl_dataset_drop_ref(dp->dp_origin_snap, dp); 24645720Speter if (dp->dp_mos_dir) 24745720Speter dsl_dir_close(dp->dp_mos_dir, dp); 248165217Sjhb if (dp->dp_free_dir) 24988375Stmm dsl_dir_close(dp->dp_free_dir, dp); 25045720Speter if (dp->dp_root_dir) 251119266Simp dsl_dir_close(dp->dp_root_dir, dp); 252119266Simp 253119266Simp bpobj_close(&dp->dp_free_bpobj); 254119266Simp 255119266Simp /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ 256119266Simp if (dp->dp_meta_objset) 257119266Simp dmu_objset_evict(dp->dp_meta_objset); 258119266Simp 259119266Simp txg_list_destroy(&dp->dp_dirty_datasets); 260119266Simp txg_list_destroy(&dp->dp_dirty_zilogs); 261119266Simp txg_list_destroy(&dp->dp_sync_tasks); 262172394Smarius txg_list_destroy(&dp->dp_dirty_dirs); 263119266Simp 264119266Simp arc_flush(dp->dp_spa); 265119266Simp txg_fini(dp); 266119266Simp dsl_scan_fini(dp); 267149972Simp rw_destroy(&dp->dp_config_rwlock); 268149972Simp mutex_destroy(&dp->dp_lock); 269149972Simp taskq_destroy(dp->dp_vnrele_taskq); 270149972Simp if (dp->dp_blkstats) 271149972Simp 
kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); 27245720Speter kmem_free(dp, sizeof (dsl_pool_t)); 27366416Speter} 27466416Speter 27569953Smsmithdsl_pool_t * 27669953Smsmithdsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) 27769953Smsmith{ 278119266Simp int err; 27945720Speter dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); 28045720Speter dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 28145720Speter objset_t *os; 28245720Speter dsl_dataset_t *ds; 28345720Speter uint64_t obj; 28445720Speter 285119266Simp /* create and open the MOS (meta-objset) */ 28645720Speter dp->dp_meta_objset = dmu_objset_create_impl(spa, 28745720Speter NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); 28845720Speter 28945720Speter /* create the pool directory */ 29047339Sgallatin err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 29147339Sgallatin DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); 29247339Sgallatin ASSERT3U(err, ==, 0); 29347339Sgallatin 29447339Sgallatin /* Initialize scan structures */ 29547339Sgallatin VERIFY3U(0, ==, dsl_scan_init(dp, txg)); 296172394Smarius 29765176Sdfr /* create and open the root dir */ 29847339Sgallatin dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); 29947339Sgallatin VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, 300165217Sjhb NULL, dp, &dp->dp_root_dir)); 301102144Smux 30247339Sgallatin /* create and open the meta-objset dir */ 303172394Smarius (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); 304119266Simp VERIFY(0 == dsl_pool_open_special_dir(dp, 30547339Sgallatin MOS_DIR_NAME, &dp->dp_mos_dir)); 30666416Speter 30766416Speter if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { 30869953Smsmith /* create and open the free dir */ 30998017Simp (void) dsl_dir_create_sync(dp, dp->dp_root_dir, 31098017Simp FREE_DIR_NAME, tx); 31198017Simp VERIFY(0 == dsl_pool_open_special_dir(dp, 31290554Smsmith FREE_DIR_NAME, &dp->dp_free_dir)); 313165217Sjhb 314165217Sjhb /* create and open the 
free_bplist */ 31590554Smsmith obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx); 31690554Smsmith VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 31769953Smsmith DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); 31869953Smsmith VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, 31969953Smsmith dp->dp_meta_objset, obj)); 32069953Smsmith } 32169953Smsmith 322113544Smdodd if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) 32373185Speter dsl_pool_create_origin(dp, tx); 32473185Speter 325113544Smdodd /* create the root dataset */ 32673185Speter obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); 32745720Speter 328113544Smdodd /* create the root objset */ 32973185Speter VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); 33073185Speter os = dmu_objset_create_impl(dp->dp_spa, ds, 331113544Smdodd dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); 33273185Speter#ifdef _KERNEL 33373185Speter zfs_create_fs(os, kcred, zplprops, tx); 334113544Smdodd#endif 33573185Speter dsl_dataset_rele(ds, FTAG); 33673185Speter 337113544Smdodd dmu_tx_commit(tx); 33873185Speter 33973185Speter return (dp); 340113544Smdodd} 34173185Speter 34273185Speter/* 343113544Smdodd * Account for the meta-objset space in its placeholder dsl_dir. 
34473185Speter */ 34573185Spetervoid 346163163Sjmgdsl_pool_mos_diduse_space(dsl_pool_t *dp, 347163163Sjmg int64_t used, int64_t comp, int64_t uncomp) 348163163Sjmg{ 349163163Sjmg ASSERT3U(comp, ==, uncomp); /* it's all metadata */ 350163163Sjmg mutex_enter(&dp->dp_lock); 351163163Sjmg dp->dp_mos_used_delta += used; 352163163Sjmg dp->dp_mos_compressed_delta += comp; 353163163Sjmg dp->dp_mos_uncompressed_delta += uncomp; 354163163Sjmg mutex_exit(&dp->dp_lock); 355163163Sjmg} 356163163Sjmg 357163163Sjmgstatic int 35869953Smsmithdeadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 359145651Smarcel{ 360145651Smarcel dsl_deadlist_t *dl = arg; 361145651Smarcel dsl_pool_t *dp = dmu_objset_pool(dl->dl_os); 362145651Smarcel rw_enter(&dp->dp_config_rwlock, RW_READER); 363145651Smarcel dsl_deadlist_insert(dl, bp, tx); 364145651Smarcel rw_exit(&dp->dp_config_rwlock); 365145651Smarcel return (0); 366145651Smarcel} 367145651Smarcel 368145651Smarcelvoid 369145651Smarceldsl_pool_sync(dsl_pool_t *dp, uint64_t txg) 370145651Smarcel{ 371145651Smarcel zio_t *zio; 372145651Smarcel dmu_tx_t *tx; 373145651Smarcel dsl_dir_t *dd; 374145651Smarcel dsl_dataset_t *ds; 375145651Smarcel objset_t *mos = dp->dp_meta_objset; 376145651Smarcel hrtime_t start, write_time; 37769953Smsmith uint64_t data_written; 37869953Smsmith int err; 37969953Smsmith list_t synced_datasets; 38069953Smsmith 38169953Smsmith list_create(&synced_datasets, sizeof (dsl_dataset_t), 38269953Smsmith offsetof(dsl_dataset_t, ds_synced_link)); 383160964Syar 38469953Smsmith /* 38569953Smsmith * We need to copy dp_space_towrite() before doing 386160964Syar * dsl_sync_task_group_sync(), because 38769953Smsmith * dsl_dataset_snapshot_reserve_space() will increase 38869953Smsmith * dp_space_towrite but not actually write anything. 
38969953Smsmith */ 39069953Smsmith data_written = dp->dp_space_towrite[txg & TXG_MASK]; 391165217Sjhb 392165217Sjhb tx = dmu_tx_create_assigned(dp, txg); 393165217Sjhb 394165217Sjhb dp->dp_read_overhead = 0; 395165217Sjhb start = gethrtime(); 39626159Sse 39773185Speter zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 39873185Speter while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { 39973185Speter /* 40073185Speter * We must not sync any non-MOS datasets twice, because 40173185Speter * we may have taken a snapshot of them. However, we 40258287Speter * may sync newly-created datasets on pass 2. 40373185Speter */ 40473185Speter ASSERT(!list_link_active(&ds->ds_synced_link)); 40573185Speter list_insert_tail(&synced_datasets, ds); 40673185Speter dsl_dataset_sync(ds, zio, tx); 40773185Speter } 40873185Speter DTRACE_PROBE(pool_sync__1setup); 409153560Sjhb err = zio_wait(zio); 410219865Sjhb 411219865Sjhb write_time = gethrtime() - start; 412219865Sjhb ASSERT(err == 0); 413219865Sjhb DTRACE_PROBE(pool_sync__2rootzio); 414219865Sjhb 415219865Sjhb /* 416153560Sjhb * After the data blocks have been written (ensured by the zio_wait() 417153560Sjhb * above), update the user/group space accounting. 418219865Sjhb */ 419153560Sjhb for (ds = list_head(&synced_datasets); ds; 420153560Sjhb ds = list_next(&synced_datasets, ds)) 421164264Sjhb dmu_objset_do_userquota_updates(ds->ds_objset, tx); 422164264Sjhb 423164264Sjhb /* 424164264Sjhb * Sync the datasets again to push out the changes due to 425164264Sjhb * userspace updates. This must be done before we process the 426164264Sjhb * sync tasks, so that any snapshots will have the correct 427164264Sjhb * user accounting information (and we won't get confused 428166176Sjhb * about which blocks are part of the snapshot). 
429166176Sjhb */ 430166176Sjhb zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 431166176Sjhb while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { 432166176Sjhb ASSERT(list_link_active(&ds->ds_synced_link)); 433166176Sjhb dmu_buf_rele(ds->ds_dbuf, ds); 434169221Sjhb dsl_dataset_sync(ds, zio, tx); 435166176Sjhb } 436169221Sjhb err = zio_wait(zio); 437166176Sjhb 438166176Sjhb /* 439166176Sjhb * Now that the datasets have been completely synced, we can 440164264Sjhb * clean up our in-memory structures accumulated while syncing: 441164264Sjhb * 442164264Sjhb * - move dead blocks from the pending deadlist to the on-disk deadlist 443164264Sjhb * - clean up zil records 444164264Sjhb * - release hold from dsl_dataset_dirty() 445164264Sjhb */ 446164264Sjhb while (ds = list_remove_head(&synced_datasets)) { 447164264Sjhb objset_t *os = ds->ds_objset; 448164264Sjhb bplist_iterate(&ds->ds_pending_deadlist, 449164264Sjhb deadlist_enqueue_cb, &ds->ds_deadlist, tx); 450164264Sjhb ASSERT(!dmu_objset_is_dirty(os, txg)); 451166176Sjhb dmu_buf_rele(ds->ds_dbuf, ds); 452166176Sjhb } 453166176Sjhb 454166176Sjhb start = gethrtime(); 455166176Sjhb while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) 456166176Sjhb dsl_dir_sync(dd, tx); 457119266Simp write_time += gethrtime() - start; 458172394Smarius 459119266Simp /* 460223885Skib * The MOS's space is accounted for in the pool/$MOS 461164264Sjhb * (dp_mos_dir). We can't modify the mos while we're syncing 462169221Sjhb * it, so we remember the deltas and apply them here. 
463164264Sjhb */ 464169221Sjhb if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 || 465165228Sjhb dp->dp_mos_uncompressed_delta != 0) { 466253273Smarius dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD, 467164264Sjhb dp->dp_mos_used_delta, 468180753Sluoqi dp->dp_mos_compressed_delta, 469180753Sluoqi dp->dp_mos_uncompressed_delta, tx); 470203528Smav dp->dp_mos_used_delta = 0; 471233379Sjhb dp->dp_mos_compressed_delta = 0; 472233379Sjhb dp->dp_mos_uncompressed_delta = 0; 473203528Smav } 474203528Smav 47569953Smsmith start = gethrtime(); 47661047Speter if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || 47769953Smsmith list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { 47869953Smsmith zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 47969953Smsmith dmu_objset_sync(mos, zio, tx); 48069953Smsmith err = zio_wait(zio); 48158287Speter ASSERT(err == 0); 48269953Smsmith dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); 48369953Smsmith spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); 48469953Smsmith } 48588184Smdodd write_time += gethrtime() - start; 48612453Sbde DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time, 48788184Smdodd hrtime_t, dp->dp_read_overhead); 488119266Simp write_time -= dp->dp_read_overhead; 48988184Smdodd 490220195Sjhb /* 491220195Sjhb * If we modify a dataset in the same txg that we want to destroy it, 492220195Sjhb * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. 49339231Sgibbs * dsl_dir_destroy_check() will fail if there are unexpected holds. 494 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf 495 * and clearing the hold on it) before we process the sync_tasks. 496 * The MOS data dirtied by the sync_tasks will be synced on the next 497 * pass. 498 */ 499 DTRACE_PROBE(pool_sync__3task); 500 if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { 501 dsl_sync_task_group_t *dstg; 502 /* 503 * No more sync tasks should have been added while we 504 * were syncing. 
505 */ 506 ASSERT(spa_sync_pass(dp->dp_spa) == 1); 507 while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) 508 dsl_sync_task_group_sync(dstg, tx); 509 } 510 511 dmu_tx_commit(tx); 512 513 dp->dp_space_towrite[txg & TXG_MASK] = 0; 514 ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0); 515 516 /* 517 * If the write limit max has not been explicitly set, set it 518 * to a fraction of available physical memory (default 1/8th). 519 * Note that we must inflate the limit because the spa 520 * inflates write sizes to account for data replication. 521 * Check this each sync phase to catch changing memory size. 522 */ 523 if (physmem != old_physmem && zfs_write_limit_shift) { 524 mutex_enter(&zfs_write_limit_lock); 525 old_physmem = physmem; 526 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; 527 zfs_write_limit_inflated = MAX(zfs_write_limit_min, 528 spa_get_asize(dp->dp_spa, zfs_write_limit_max)); 529 mutex_exit(&zfs_write_limit_lock); 530 } 531 532 /* 533 * Attempt to keep the sync time consistent by adjusting the 534 * amount of write traffic allowed into each transaction group. 
535 * Weight the throughput calculation towards the current value: 536 * thru = 3/4 old_thru + 1/4 new_thru 537 * 538 * Note: write_time is in nanosecs, so write_time/MICROSEC 539 * yields millisecs 540 */ 541 ASSERT(zfs_write_limit_min > 0); 542 if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) { 543 uint64_t throughput = data_written / (write_time / MICROSEC); 544 545 if (dp->dp_throughput) 546 dp->dp_throughput = throughput / 4 + 547 3 * dp->dp_throughput / 4; 548 else 549 dp->dp_throughput = throughput; 550 dp->dp_write_limit = MIN(zfs_write_limit_inflated, 551 MAX(zfs_write_limit_min, 552 dp->dp_throughput * zfs_txg_synctime_ms)); 553 } 554} 555 556void 557dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) 558{ 559 zilog_t *zilog; 560 dsl_dataset_t *ds; 561 562 while (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) { 563 ds = dmu_objset_ds(zilog->zl_os); 564 zil_clean(zilog, txg); 565 ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); 566 dmu_buf_rele(ds->ds_dbuf, zilog); 567 } 568 ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); 569} 570 571/* 572 * TRUE if the current thread is the tx_sync_thread or if we 573 * are being called from SPA context during pool initialization. 574 */ 575int 576dsl_pool_sync_context(dsl_pool_t *dp) 577{ 578 return (curthread == dp->dp_tx.tx_sync_thread || 579 spa_is_initializing(dp->dp_spa)); 580} 581 582uint64_t 583dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) 584{ 585 uint64_t space, resv; 586 587 /* 588 * Reserve about 1.6% (1/64), or at least 32MB, for allocation 589 * efficiency. 590 * XXX The intent log is not accounted for, so it must fit 591 * within this slop. 592 * 593 * If we're trying to assess whether it's OK to do a free, 594 * cut the reservation in half to allow forward progress 595 * (e.g. make it possible to rm(1) files from a full pool). 
596 */ 597 space = spa_get_dspace(dp->dp_spa); 598 resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1); 599 if (netfree) 600 resv >>= 1; 601 602 return (space - resv); 603} 604 605int 606dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) 607{ 608 uint64_t reserved = 0; 609 uint64_t write_limit = (zfs_write_limit_override ? 610 zfs_write_limit_override : dp->dp_write_limit); 611 612 if (zfs_no_write_throttle) { 613 atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], 614 space); 615 return (0); 616 } 617 618 /* 619 * Check to see if we have exceeded the maximum allowed IO for 620 * this transaction group. We can do this without locks since 621 * a little slop here is ok. Note that we do the reserved check 622 * with only half the requested reserve: this is because the 623 * reserve requests are worst-case, and we really don't want to 624 * throttle based off of worst-case estimates. 625 */ 626 if (write_limit > 0) { 627 reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK] 628 + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2; 629 630 if (reserved && reserved > write_limit) 631 return (ERESTART); 632 } 633 634 atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space); 635 636 /* 637 * If this transaction group is over 7/8ths capacity, delay 638 * the caller 1 clock tick. This will slow down the "fill" 639 * rate until the sync process can catch up with us. 
640 */ 641 if (reserved && reserved > (write_limit - (write_limit >> 3))) 642 txg_delay(dp, tx->tx_txg, 1); 643 644 return (0); 645} 646 647void 648dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) 649{ 650 ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space); 651 atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space); 652} 653 654void 655dsl_pool_memory_pressure(dsl_pool_t *dp) 656{ 657 uint64_t space_inuse = 0; 658 int i; 659 660 if (dp->dp_write_limit == zfs_write_limit_min) 661 return; 662 663 for (i = 0; i < TXG_SIZE; i++) { 664 space_inuse += dp->dp_space_towrite[i]; 665 space_inuse += dp->dp_tempreserved[i]; 666 } 667 dp->dp_write_limit = MAX(zfs_write_limit_min, 668 MIN(dp->dp_write_limit, space_inuse / 4)); 669} 670 671void 672dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) 673{ 674 if (space > 0) { 675 mutex_enter(&dp->dp_lock); 676 dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space; 677 mutex_exit(&dp->dp_lock); 678 } 679} 680 681/* ARGSUSED */ 682static int 683upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) 684{ 685 dmu_tx_t *tx = arg; 686 dsl_dataset_t *ds, *prev = NULL; 687 int err; 688 dsl_pool_t *dp = spa_get_dsl(spa); 689 690 err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); 691 if (err) 692 return (err); 693 694 while (ds->ds_phys->ds_prev_snap_obj != 0) { 695 err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, 696 FTAG, &prev); 697 if (err) { 698 dsl_dataset_rele(ds, FTAG); 699 return (err); 700 } 701 702 if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) 703 break; 704 dsl_dataset_rele(ds, FTAG); 705 ds = prev; 706 prev = NULL; 707 } 708 709 if (prev == NULL) { 710 prev = dp->dp_origin_snap; 711 712 /* 713 * The $ORIGIN can't have any data, or the accounting 714 * will be wrong. 
715 */ 716 ASSERT(prev->ds_phys->ds_bp.blk_birth == 0); 717 718 /* The origin doesn't get attached to itself */ 719 if (ds->ds_object == prev->ds_object) { 720 dsl_dataset_rele(ds, FTAG); 721 return (0); 722 } 723 724 dmu_buf_will_dirty(ds->ds_dbuf, tx); 725 ds->ds_phys->ds_prev_snap_obj = prev->ds_object; 726 ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg; 727 728 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); 729 ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object; 730 731 dmu_buf_will_dirty(prev->ds_dbuf, tx); 732 prev->ds_phys->ds_num_children++; 733 734 if (ds->ds_phys->ds_next_snap_obj == 0) { 735 ASSERT(ds->ds_prev == NULL); 736 VERIFY(0 == dsl_dataset_hold_obj(dp, 737 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); 738 } 739 } 740 741 ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object); 742 ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object); 743 744 if (prev->ds_phys->ds_next_clones_obj == 0) { 745 dmu_buf_will_dirty(prev->ds_dbuf, tx); 746 prev->ds_phys->ds_next_clones_obj = 747 zap_create(dp->dp_meta_objset, 748 DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); 749 } 750 VERIFY(0 == zap_add_int(dp->dp_meta_objset, 751 prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx)); 752 753 dsl_dataset_rele(ds, FTAG); 754 if (prev != dp->dp_origin_snap) 755 dsl_dataset_rele(prev, FTAG); 756 return (0); 757} 758 759void 760dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) 761{ 762 ASSERT(dmu_tx_is_syncing(tx)); 763 ASSERT(dp->dp_origin_snap != NULL); 764 765 VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, 766 tx, DS_FIND_CHILDREN)); 767} 768 769/* ARGSUSED */ 770static int 771upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) 772{ 773 dmu_tx_t *tx = arg; 774 dsl_dataset_t *ds; 775 dsl_pool_t *dp = spa_get_dsl(spa); 776 objset_t *mos = dp->dp_meta_objset; 777 778 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 779 780 if (ds->ds_dir->dd_phys->dd_origin_obj) { 781 
dsl_dataset_t *origin; 782 783 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, 784 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin)); 785 786 if (origin->ds_dir->dd_phys->dd_clones == 0) { 787 dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); 788 origin->ds_dir->dd_phys->dd_clones = zap_create(mos, 789 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); 790 } 791 792 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 793 origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); 794 795 dsl_dataset_rele(origin, FTAG); 796 } 797 798 dsl_dataset_rele(ds, FTAG); 799 return (0); 800} 801 802void 803dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) 804{ 805 ASSERT(dmu_tx_is_syncing(tx)); 806 uint64_t obj; 807 808 (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); 809 VERIFY(0 == dsl_pool_open_special_dir(dp, 810 FREE_DIR_NAME, &dp->dp_free_dir)); 811 812 /* 813 * We can't use bpobj_alloc(), because spa_version() still 814 * returns the old version, and we need a new-version bpobj with 815 * subobj support. So call dmu_object_alloc() directly. 
816 */ 817 obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, 818 SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); 819 VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 820 DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); 821 VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, 822 dp->dp_meta_objset, obj)); 823 824 VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, 825 upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN)); 826} 827 828void 829dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) 830{ 831 uint64_t dsobj; 832 dsl_dataset_t *ds; 833 834 ASSERT(dmu_tx_is_syncing(tx)); 835 ASSERT(dp->dp_origin_snap == NULL); 836 837 /* create the origin dir, ds, & snap-ds */ 838 rw_enter(&dp->dp_config_rwlock, RW_WRITER); 839 dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, 840 NULL, 0, kcred, tx); 841 VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 842 dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx); 843 VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, 844 dp, &dp->dp_origin_snap)); 845 dsl_dataset_rele(ds, FTAG); 846 rw_exit(&dp->dp_config_rwlock); 847} 848 849taskq_t * 850dsl_pool_vnrele_taskq(dsl_pool_t *dp) 851{ 852 return (dp->dp_vnrele_taskq); 853} 854 855/* 856 * Walk through the pool-wide zap object of temporary snapshot user holds 857 * and release them. 
858 */ 859void 860dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) 861{ 862 zap_attribute_t za; 863 zap_cursor_t zc; 864 objset_t *mos = dp->dp_meta_objset; 865 uint64_t zapobj = dp->dp_tmp_userrefs_obj; 866 867 if (zapobj == 0) 868 return; 869 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 870 871 for (zap_cursor_init(&zc, mos, zapobj); 872 zap_cursor_retrieve(&zc, &za) == 0; 873 zap_cursor_advance(&zc)) { 874 char *htag; 875 uint64_t dsobj; 876 877 htag = strchr(za.za_name, '-'); 878 *htag = '\0'; 879 ++htag; 880 dsobj = strtonum(za.za_name, NULL); 881 (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE); 882 } 883 zap_cursor_fini(&zc); 884} 885 886/* 887 * Create the pool-wide zap object for storing temporary snapshot holds. 888 */ 889void 890dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) 891{ 892 objset_t *mos = dp->dp_meta_objset; 893 894 ASSERT(dp->dp_tmp_userrefs_obj == 0); 895 ASSERT(dmu_tx_is_syncing(tx)); 896 897 dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, 898 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); 899} 900 901static int 902dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, 903 const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding) 904{ 905 objset_t *mos = dp->dp_meta_objset; 906 uint64_t zapobj = dp->dp_tmp_userrefs_obj; 907 char *name; 908 int error; 909 910 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 911 ASSERT(dmu_tx_is_syncing(tx)); 912 913 /* 914 * If the pool was created prior to SPA_VERSION_USERREFS, the 915 * zap object for temporary holds might not exist yet. 
916 */ 917 if (zapobj == 0) { 918 if (holding) { 919 dsl_pool_user_hold_create_obj(dp, tx); 920 zapobj = dp->dp_tmp_userrefs_obj; 921 } else { 922 return (ENOENT); 923 } 924 } 925 926 name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); 927 if (holding) 928 error = zap_add(mos, zapobj, name, 8, 1, now, tx); 929 else 930 error = zap_remove(mos, zapobj, name, tx); 931 strfree(name); 932 933 return (error); 934} 935 936/* 937 * Add a temporary hold for the given dataset object and tag. 938 */ 939int 940dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, 941 uint64_t *now, dmu_tx_t *tx) 942{ 943 return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); 944} 945 946/* 947 * Release a temporary hold for the given dataset object and tag. 948 */ 949int 950dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, 951 dmu_tx_t *tx) 952{ 953 return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL, 954 tx, B_FALSE)); 955} 956