spa_misc.c revision 262120
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013 by Delphix. All rights reserved. 24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 25 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. 26 */ 27 28#include <sys/zfs_context.h> 29#include <sys/spa_impl.h> 30#include <sys/spa_boot.h> 31#include <sys/zio.h> 32#include <sys/zio_checksum.h> 33#include <sys/zio_compress.h> 34#include <sys/dmu.h> 35#include <sys/dmu_tx.h> 36#include <sys/zap.h> 37#include <sys/zil.h> 38#include <sys/vdev_impl.h> 39#include <sys/metaslab.h> 40#include <sys/uberblock_impl.h> 41#include <sys/txg.h> 42#include <sys/avl.h> 43#include <sys/unique.h> 44#include <sys/dsl_pool.h> 45#include <sys/dsl_dir.h> 46#include <sys/dsl_prop.h> 47#include <sys/dsl_scan.h> 48#include <sys/fs/zfs.h> 49#include <sys/metaslab_impl.h> 50#include <sys/arc.h> 51#include <sys/ddt.h> 52#include "zfs_prop.h" 53#include "zfeature_common.h" 54 55/* 56 * SPA locking 57 * 58 * There are four basic locks for managing spa_t structures: 59 * 60 * spa_namespace_lock (global mutex) 61 * 62 * This lock must be acquired to do any of the following: 63 * 64 * - Lookup a spa_t by name 65 * - Add or remove a spa_t from the namespace 66 * - Increase spa_refcount from non-zero 67 * - Check if spa_refcount is zero 68 * - Rename a spa_t 69 * - add/remove/attach/detach devices 70 * - Held for the duration of create/destroy/import/export 71 * 72 * It does not need to handle recursion. A create or destroy may 73 * reference objects (files or zvols) in other pools, but by 74 * definition they must have an existing reference, and will never need 75 * to look up a spa_t by name. 76 * 77 * spa_refcount (per-spa refcount_t protected by mutex) 78 * 79 * This reference count keeps track of any active users of the spa_t. The 80 * spa_t cannot be destroyed or freed while this is non-zero. Internally, 81 * the refcount is never really 'zero' - opening a pool implicitly keeps 82 * some references in the DMU. Internally we check against spa_minref, but 83 * present the image of a zero/non-zero value to consumers. 84 * 85 * spa_config_lock[] (per-spa array of rwlocks) 86 * 87 * This protects the spa_t from config changes, and must be held in 88 * the following circumstances: 89 * 90 * - RW_READER to perform I/O to the spa 91 * - RW_WRITER to change the vdev config 92 * 93 * The locking order is fairly straightforward: 94 * 95 * spa_namespace_lock -> spa_refcount 96 * 97 * The namespace lock must be acquired to increase the refcount from 0 98 * or to check if it is zero. 
99 * 100 * spa_refcount -> spa_config_lock[] 101 * 102 * There must be at least one valid reference on the spa_t to acquire 103 * the config lock. 104 * 105 * spa_namespace_lock -> spa_config_lock[] 106 * 107 * The namespace lock must always be taken before the config lock. 108 * 109 * 110 * The spa_namespace_lock can be acquired directly and is globally visible. 111 * 112 * The namespace is manipulated using the following functions, all of which 113 * require the spa_namespace_lock to be held. 114 * 115 * spa_lookup() Lookup a spa_t by name. 116 * 117 * spa_add() Create a new spa_t in the namespace. 118 * 119 * spa_remove() Remove a spa_t from the namespace. This also 120 * frees up any memory associated with the spa_t. 121 * 122 * spa_next() Returns the next spa_t in the system, or the 123 * first if NULL is passed. 124 * 125 * spa_evict_all() Shutdown and remove all spa_t structures in 126 * the system. 127 * 128 * spa_guid_exists() Determine whether a pool/device guid exists. 129 * 130 * The spa_refcount is manipulated using the following functions: 131 * 132 * spa_open_ref() Adds a reference to the given spa_t. Must be 133 * called with spa_namespace_lock held if the 134 * refcount is currently zero. 135 * 136 * spa_close() Remove a reference from the spa_t. This will 137 * not free the spa_t or remove it from the 138 * namespace. No locking is required. 139 * 140 * spa_refcount_zero() Returns true if the refcount is currently 141 * zero. Must be called with spa_namespace_lock 142 * held. 143 * 144 * The spa_config_lock[] is an array of rwlocks, ordered as follows: 145 * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. 146 * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). 147 * 148 * To read the configuration, it suffices to hold one of these locks as reader. 149 * To modify the configuration, you must hold all locks as writer. To modify 150 * vdev state without altering the vdev tree's topology (e.g. online/offline), 151 * you must hold SCL_STATE and SCL_ZIO as writer. 152 * 153 * We use these distinct config locks to avoid recursive lock entry. 154 * For example, spa_sync() (which holds SCL_CONFIG as reader) induces 155 * block allocations (SCL_ALLOC), which may require reading space maps 156 * from disk (dmu_read() -> zio_read() -> SCL_ZIO). 157 * 158 * The spa config locks cannot be normal rwlocks because we need the 159 * ability to hand off ownership. For example, SCL_ZIO is acquired 160 * by the issuing thread and later released by an interrupt thread. 161 * They do, however, obey the usual write-wanted semantics to prevent 162 * writer (i.e. system administrator) starvation. 163 * 164 * The lock acquisition rules are as follows: 165 * 166 * SCL_CONFIG 167 * Protects changes to the vdev tree topology, such as vdev 168 * add/remove/attach/detach. Protects the dirty config list 169 * (spa_config_dirty_list) and the set of spares and l2arc devices. 170 * 171 * SCL_STATE 172 * Protects changes to pool state and vdev state, such as vdev 173 * online/offline/fault/degrade/clear. Protects the dirty state list 174 * (spa_state_dirty_list) and global pool state (spa_state). 175 * 176 * SCL_ALLOC 177 * Protects changes to metaslab groups and classes. 178 * Held as reader by metaslab_alloc() and metaslab_claim(). 179 * 180 * SCL_ZIO 181 * Held by bp-level zios (those which have no io_vd upon entry) 182 * to prevent changes to the vdev tree. The bp-level zio implicitly 183 * protects all of its vdev child zios, which do not hold SCL_ZIO. 
184 * 185 * SCL_FREE 186 * Protects changes to metaslab groups and classes. 187 * Held as reader by metaslab_free(). SCL_FREE is distinct from 188 * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free 189 * blocks in zio_done() while another i/o that holds either 190 * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. 191 * 192 * SCL_VDEV 193 * Held as reader to prevent changes to the vdev tree during trivial 194 * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the 195 * other locks, and lower than all of them, to ensure that it's safe 196 * to acquire regardless of caller context. 197 * 198 * In addition, the following rules apply: 199 * 200 * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. 201 * The lock ordering is SCL_CONFIG > spa_props_lock. 202 * 203 * (b) I/O operations on leaf vdevs. For any zio operation that takes 204 * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), 205 * or zio_write_phys() -- the caller must ensure that the config 206 * cannot change in the interim, and that the vdev cannot be reopened. 207 * SCL_STATE as reader suffices for both. 208 * 209 * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). 210 * 211 * spa_vdev_enter() Acquire the namespace lock and the config lock 212 * for writing. 213 * 214 * spa_vdev_exit() Release the config lock, wait for all I/O 215 * to complete, sync the updated configs to the 216 * cache, and release the namespace lock. 217 * 218 * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). 219 * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual 220 * locking is, always, based on spa_namespace_lock and spa_config_lock[]. 221 * 222 * spa_rename() is also implemented within this file since it requires 223 * manipulation of the namespace. 224 */
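/*
 * Editor's note: the following sketch is illustrative only and is not part
 * of revision 262120; the vd/error variables are hypothetical and error
 * handling is elided. It shows how callers elsewhere in ZFS typically pair
 * the primitives described above.
 *
 *	Reading the configuration (e.g. a trivial vdev-tree inquiry):
 *
 *		spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 *		... walk or inspect the vdev tree ...
 *		spa_config_exit(spa, SCL_VDEV, FTAG);
 *
 *	Changing the vdev topology through the wrappers:
 *
 *		uint64_t txg = spa_vdev_enter(spa);
 *		... add, remove, attach or detach a vdev ...
 *		return (spa_vdev_exit(spa, vd, txg, error));
 */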
225 226static avl_tree_t spa_namespace_avl; 227kmutex_t spa_namespace_lock; 228static kcondvar_t spa_namespace_cv; 229static int spa_active_count; 230int spa_max_replication_override = SPA_DVAS_PER_BP; 231 232static kmutex_t spa_spare_lock; 233static avl_tree_t spa_spare_avl; 234static kmutex_t spa_l2cache_lock; 235static avl_tree_t spa_l2cache_avl; 236 237kmem_cache_t *spa_buffer_pool; 238int spa_mode_global; 239 240#ifdef ZFS_DEBUG 241/* Everything except dprintf and spa is on by default in debug builds */ 242int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA); 243#else 244int zfs_flags = 0; 245#endif 246SYSCTL_DECL(_debug); 247TUNABLE_INT("debug.zfs_flags", &zfs_flags); 248SYSCTL_INT(_debug, OID_AUTO, zfs_flags, CTLFLAG_RWTUN, &zfs_flags, 0, 249 "ZFS debug flags."); 250 251/* 252 * zfs_recover can be set to nonzero to attempt to recover from 253 * otherwise-fatal errors, typically caused by on-disk corruption. When 254 * set, calls to zfs_panic_recover() will turn into warning messages. 255 * This should only be used as a last resort, as it typically results 256 * in leaked space, or worse. 257 */ 258int zfs_recover = 0; 259SYSCTL_DECL(_vfs_zfs); 260TUNABLE_INT("vfs.zfs.recover", &zfs_recover); 261SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0, 262 "Try to recover from otherwise-fatal errors."); 263 264/* 265 * Expiration time in milliseconds. This value has two meanings. First, it is 266 * used to determine when the spa_deadman() logic should fire. By default the 267 * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds. 268 * Secondly, the value determines if an I/O is considered "hung". Any I/O that 269 * has not completed in zfs_deadman_synctime_ms is considered "hung", resulting 270 * in a system panic. 271 */ 272uint64_t zfs_deadman_synctime_ms = 1000000ULL; 273TUNABLE_QUAD("vfs.zfs.deadman_synctime_ms", &zfs_deadman_synctime_ms); 274SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN, 275 &zfs_deadman_synctime_ms, 0, 276 "Stalled ZFS I/O expiration time in milliseconds"); 277 278/* 279 * Check time in milliseconds. This defines the frequency at which we check 280 * for hung I/O. 281 */ 282uint64_t zfs_deadman_checktime_ms = 5000ULL; 283TUNABLE_QUAD("vfs.zfs.deadman_checktime_ms", &zfs_deadman_checktime_ms); 284SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN, 285 &zfs_deadman_checktime_ms, 0, 286 "Period of checks for stalled ZFS I/O in milliseconds"); 287 288/* 289 * The default value of -1 for zfs_deadman_enabled is resolved in 290 * zfs_deadman_init(). 291 */ 292int zfs_deadman_enabled = -1; 293TUNABLE_INT("vfs.zfs.deadman_enabled", &zfs_deadman_enabled); 294SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN, 295 &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O"); 296 297/* 298 * The worst case is single-sector max-parity RAID-Z blocks, in which 299 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) 300 * times the size; so just assume that. Add to this the fact that 301 * we can have up to 3 DVAs per bp, and one more factor of 2 because 302 * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, 303 * the worst case is: 304 * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 305 */ 306int spa_asize_inflation = 24; 307 308#ifndef illumos 309#ifdef _KERNEL 310static void 311zfs_deadman_init() 312{ 313 /* 314 * If we are not on i386 or amd64, or if we are running in a 315 * virtual machine, disable the ZFS deadman thread by default. 316 */ 317 if (zfs_deadman_enabled == -1) { 318#if defined(__amd64__) || defined(__i386__) 319 zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 
1 : 0; 320#else 321 zfs_deadman_enabled = 0; 322#endif 323 } 324} 325#endif /* _KERNEL */ 326#endif /* !illumos */ 327 328/* 329 * ========================================================================== 330 * SPA config locking 331 * ========================================================================== 332 */ 333static void 334spa_config_lock_init(spa_t *spa) 335{ 336 for (int i = 0; i < SCL_LOCKS; i++) { 337 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 338 mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); 339 cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); 340 refcount_create_untracked(&scl->scl_count); 341 scl->scl_writer = NULL; 342 scl->scl_write_wanted = 0; 343 } 344} 345 346static void 347spa_config_lock_destroy(spa_t *spa) 348{ 349 for (int i = 0; i < SCL_LOCKS; i++) { 350 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 351 mutex_destroy(&scl->scl_lock); 352 cv_destroy(&scl->scl_cv); 353 refcount_destroy(&scl->scl_count); 354 ASSERT(scl->scl_writer == NULL); 355 ASSERT(scl->scl_write_wanted == 0); 356 } 357} 358 359int 360spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) 361{ 362 for (int i = 0; i < SCL_LOCKS; i++) { 363 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 364 if (!(locks & (1 << i))) 365 continue; 366 mutex_enter(&scl->scl_lock); 367 if (rw == RW_READER) { 368 if (scl->scl_writer || scl->scl_write_wanted) { 369 mutex_exit(&scl->scl_lock); 370 spa_config_exit(spa, locks ^ (1 << i), tag); 371 return (0); 372 } 373 } else { 374 ASSERT(scl->scl_writer != curthread); 375 if (!refcount_is_zero(&scl->scl_count)) { 376 mutex_exit(&scl->scl_lock); 377 spa_config_exit(spa, locks ^ (1 << i), tag); 378 return (0); 379 } 380 scl->scl_writer = curthread; 381 } 382 (void) refcount_add(&scl->scl_count, tag); 383 mutex_exit(&scl->scl_lock); 384 } 385 return (1); 386} 387 388void 389spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) 390{ 391 int wlocks_held = 0; 392 393 ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); 394 395 for (int i = 0; i < SCL_LOCKS; i++) { 396 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 397 if (scl->scl_writer == curthread) 398 wlocks_held |= (1 << i); 399 if (!(locks & (1 << i))) 400 continue; 401 mutex_enter(&scl->scl_lock); 402 if (rw == RW_READER) { 403 while (scl->scl_writer || scl->scl_write_wanted) { 404 cv_wait(&scl->scl_cv, &scl->scl_lock); 405 } 406 } else { 407 ASSERT(scl->scl_writer != curthread); 408 while (!refcount_is_zero(&scl->scl_count)) { 409 scl->scl_write_wanted++; 410 cv_wait(&scl->scl_cv, &scl->scl_lock); 411 scl->scl_write_wanted--; 412 } 413 scl->scl_writer = curthread; 414 } 415 (void) refcount_add(&scl->scl_count, tag); 416 mutex_exit(&scl->scl_lock); 417 } 418 ASSERT(wlocks_held <= locks); 419} 420 421void 422spa_config_exit(spa_t *spa, int locks, void *tag) 423{ 424 for (int i = SCL_LOCKS - 1; i >= 0; i--) { 425 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 426 if (!(locks & (1 << i))) 427 continue; 428 mutex_enter(&scl->scl_lock); 429 ASSERT(!refcount_is_zero(&scl->scl_count)); 430 if (refcount_remove(&scl->scl_count, tag) == 0) { 431 ASSERT(scl->scl_writer == NULL || 432 scl->scl_writer == curthread); 433 scl->scl_writer = NULL; /* OK in either case */ 434 cv_broadcast(&scl->scl_cv); 435 } 436 mutex_exit(&scl->scl_lock); 437 } 438} 439 440int 441spa_config_held(spa_t *spa, int locks, krw_t rw) 442{ 443 int locks_held = 0; 444 445 for (int i = 0; i < SCL_LOCKS; i++) { 446 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 447 if (!(locks & (1 << i))) 448 continue; 449 if 
((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) || 450 (rw == RW_WRITER && scl->scl_writer == curthread)) 451 locks_held |= 1 << i; 452 } 453 454 return (locks_held); 455} 456 457/* 458 * ========================================================================== 459 * SPA namespace functions 460 * ========================================================================== 461 */ 462 463/* 464 * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. 465 * Returns NULL if no matching spa_t is found. 466 */ 467spa_t * 468spa_lookup(const char *name) 469{ 470 static spa_t search; /* spa_t is large; don't allocate on stack */ 471 spa_t *spa; 472 avl_index_t where; 473 char *cp; 474 475 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 476 477 (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); 478 479 /* 480 * If it's a full dataset name, figure out the pool name and 481 * just use that. 482 */ 483 cp = strpbrk(search.spa_name, "/@"); 484 if (cp != NULL) 485 *cp = '\0'; 486 487 spa = avl_find(&spa_namespace_avl, &search, &where); 488 489 return (spa); 490} 491 492/* 493 * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. 494 * If the zfs_deadman_enabled flag is set then it inspects all vdev queues 495 * looking for potentially hung I/Os. 496 */ 497void 498spa_deadman(void *arg) 499{ 500 spa_t *spa = arg; 501 502 /* 503 * Disable the deadman timer if the pool is suspended. 504 */ 505 if (spa_suspended(spa)) { 506#ifdef illumos 507 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 508#else 509 /* Nothing. just don't schedule any future callouts. */ 510#endif 511 return; 512 } 513 514 zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", 515 (gethrtime() - spa->spa_sync_starttime) / NANOSEC, 516 ++spa->spa_deadman_calls); 517 if (zfs_deadman_enabled) 518 vdev_deadman(spa->spa_root_vdev); 519} 520 521/* 522 * Create an uninitialized spa_t with the given name. Requires 523 * spa_namespace_lock. The caller must ensure that the spa_t doesn't already 524 * exist by calling spa_lookup() first. 
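 *
 * As an illustrative sketch (not part of this revision; error handling is
 * elided and the local variables are hypothetical), a typical caller
 * creating a new pool follows the lookup-then-add pattern:
 *
 *	mutex_enter(&spa_namespace_lock);
 *	if (spa_lookup(name) == NULL)
 *		spa = spa_add(name, config, altroot);
 *	mutex_exit(&spa_namespace_lock);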
525 */ 526spa_t * 527spa_add(const char *name, nvlist_t *config, const char *altroot) 528{ 529 spa_t *spa; 530 spa_config_dirent_t *dp; 531#ifdef illumos 532 cyc_handler_t hdlr; 533 cyc_time_t when; 534#endif 535 536 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 537 538 spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); 539 540 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); 541 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 542 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 543 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 544 mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); 545 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 546 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); 547 mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); 548 mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); 549 550 cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); 551 cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); 552 cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); 553 cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); 554 555 for (int t = 0; t < TXG_SIZE; t++) 556 bplist_create(&spa->spa_free_bplist[t]); 557 558 (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); 559 spa->spa_state = POOL_STATE_UNINITIALIZED; 560 spa->spa_freeze_txg = UINT64_MAX; 561 spa->spa_final_txg = UINT64_MAX; 562 spa->spa_load_max_txg = UINT64_MAX; 563 spa->spa_proc = &p0; 564 spa->spa_proc_state = SPA_PROC_NONE; 565 566#ifdef illumos 567 hdlr.cyh_func = spa_deadman; 568 hdlr.cyh_arg = spa; 569 hdlr.cyh_level = CY_LOW_LEVEL; 570#endif 571 572 spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); 573 574#ifdef illumos 575 /* 576 * This determines how often we need to check for hung I/Os after 577 * the cyclic has already fired. Since checking for hung I/Os is 578 * an expensive operation we don't want to check too frequently. 579 * Instead wait for 5 seconds before checking again. 580 */ 581 when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms); 582 when.cyt_when = CY_INFINITY; 583 mutex_enter(&cpu_lock); 584 spa->spa_deadman_cycid = cyclic_add(&hdlr, &when); 585 mutex_exit(&cpu_lock); 586#else /* !illumos */ 587#ifdef _KERNEL 588 callout_init(&spa->spa_deadman_cycid, CALLOUT_MPSAFE); 589#endif 590#endif 591 refcount_create(&spa->spa_refcount); 592 spa_config_lock_init(spa); 593 594 avl_add(&spa_namespace_avl, spa); 595 596 /* 597 * Set the alternate root, if there is one. 598 */ 599 if (altroot) { 600 spa->spa_root = spa_strdup(altroot); 601 spa_active_count++; 602 } 603 604 /* 605 * Every pool starts with the default cachefile 606 */ 607 list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), 608 offsetof(spa_config_dirent_t, scd_link)); 609 610 dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); 611 dp->scd_path = altroot ? 
NULL : spa_strdup(spa_config_path); 612 list_insert_head(&spa->spa_config_list, dp); 613 614 VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, 615 KM_SLEEP) == 0); 616 617 if (config != NULL) { 618 nvlist_t *features; 619 620 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, 621 &features) == 0) { 622 VERIFY(nvlist_dup(features, &spa->spa_label_features, 623 0) == 0); 624 } 625 626 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); 627 } 628 629 if (spa->spa_label_features == NULL) { 630 VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, 631 KM_SLEEP) == 0); 632 } 633 634 spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0); 635 636 return (spa); 637} 638 639/* 640 * Removes a spa_t from the namespace, freeing up any memory used. Requires 641 * spa_namespace_lock. This is called only after the spa_t has been closed and 642 * deactivated. 643 */ 644void 645spa_remove(spa_t *spa) 646{ 647 spa_config_dirent_t *dp; 648 649 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 650 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 651 652 nvlist_free(spa->spa_config_splitting); 653 654 avl_remove(&spa_namespace_avl, spa); 655 cv_broadcast(&spa_namespace_cv); 656 657 if (spa->spa_root) { 658 spa_strfree(spa->spa_root); 659 spa_active_count--; 660 } 661 662 while ((dp = list_head(&spa->spa_config_list)) != NULL) { 663 list_remove(&spa->spa_config_list, dp); 664 if (dp->scd_path != NULL) 665 spa_strfree(dp->scd_path); 666 kmem_free(dp, sizeof (spa_config_dirent_t)); 667 } 668 669 list_destroy(&spa->spa_config_list); 670 671 nvlist_free(spa->spa_label_features); 672 nvlist_free(spa->spa_load_info); 673 spa_config_set(spa, NULL); 674 675#ifdef illumos 676 mutex_enter(&cpu_lock); 677 if (spa->spa_deadman_cycid != CYCLIC_NONE) 678 cyclic_remove(spa->spa_deadman_cycid); 679 mutex_exit(&cpu_lock); 680 spa->spa_deadman_cycid = CYCLIC_NONE; 681#else /* !illumos */ 682#ifdef _KERNEL 683 callout_drain(&spa->spa_deadman_cycid); 684#endif 685#endif 686 687 refcount_destroy(&spa->spa_refcount); 688 689 spa_config_lock_destroy(spa); 690 691 for (int t = 0; t < TXG_SIZE; t++) 692 bplist_destroy(&spa->spa_free_bplist[t]); 693 694 cv_destroy(&spa->spa_async_cv); 695 cv_destroy(&spa->spa_proc_cv); 696 cv_destroy(&spa->spa_scrub_io_cv); 697 cv_destroy(&spa->spa_suspend_cv); 698 699 mutex_destroy(&spa->spa_async_lock); 700 mutex_destroy(&spa->spa_errlist_lock); 701 mutex_destroy(&spa->spa_errlog_lock); 702 mutex_destroy(&spa->spa_history_lock); 703 mutex_destroy(&spa->spa_proc_lock); 704 mutex_destroy(&spa->spa_props_lock); 705 mutex_destroy(&spa->spa_scrub_lock); 706 mutex_destroy(&spa->spa_suspend_lock); 707 mutex_destroy(&spa->spa_vdev_top_lock); 708 709 kmem_free(spa, sizeof (spa_t)); 710} 711 712/* 713 * Given a pool, return the next pool in the namespace, or NULL if there is 714 * none. If 'prev' is NULL, return the first pool. 715 */ 716spa_t * 717spa_next(spa_t *prev) 718{ 719 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 720 721 if (prev) 722 return (AVL_NEXT(&spa_namespace_avl, prev)); 723 else 724 return (avl_first(&spa_namespace_avl)); 725} 726 727/* 728 * ========================================================================== 729 * SPA refcount functions 730 * ========================================================================== 731 */ 732 733/* 734 * Add a reference to the given spa_t. Must have at least one reference, or 735 * have the namespace lock held. 
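 *
 * A minimal usage sketch (illustrative only, not part of this revision): a
 * consumer holds the pool open for the duration of its work and drops the
 * reference with the same tag, e.g.
 *
 *	spa_open_ref(spa, FTAG);
 *	... use the spa_t ...
 *	spa_close(spa, FTAG);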
736 */ 737void 738spa_open_ref(spa_t *spa, void *tag) 739{ 740 ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref || 741 MUTEX_HELD(&spa_namespace_lock)); 742 (void) refcount_add(&spa->spa_refcount, tag); 743} 744 745/* 746 * Remove a reference to the given spa_t. Must have at least one reference, or 747 * have the namespace lock held. 748 */ 749void 750spa_close(spa_t *spa, void *tag) 751{ 752 ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref || 753 MUTEX_HELD(&spa_namespace_lock)); 754 (void) refcount_remove(&spa->spa_refcount, tag); 755} 756 757/* 758 * Check to see if the spa refcount is zero. Must be called with 759 * spa_namespace_lock held. We really compare against spa_minref, which is the 760 * number of references acquired when opening a pool 761 */ 762boolean_t 763spa_refcount_zero(spa_t *spa) 764{ 765 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 766 767 return (refcount_count(&spa->spa_refcount) == spa->spa_minref); 768} 769 770/* 771 * ========================================================================== 772 * SPA spare and l2cache tracking 773 * ========================================================================== 774 */ 775 776/* 777 * Hot spares and cache devices are tracked using the same code below, 778 * for 'auxiliary' devices. 779 */ 780 781typedef struct spa_aux { 782 uint64_t aux_guid; 783 uint64_t aux_pool; 784 avl_node_t aux_avl; 785 int aux_count; 786} spa_aux_t; 787 788static int 789spa_aux_compare(const void *a, const void *b) 790{ 791 const spa_aux_t *sa = a; 792 const spa_aux_t *sb = b; 793 794 if (sa->aux_guid < sb->aux_guid) 795 return (-1); 796 else if (sa->aux_guid > sb->aux_guid) 797 return (1); 798 else 799 return (0); 800} 801 802void 803spa_aux_add(vdev_t *vd, avl_tree_t *avl) 804{ 805 avl_index_t where; 806 spa_aux_t search; 807 spa_aux_t *aux; 808 809 search.aux_guid = vd->vdev_guid; 810 if ((aux = avl_find(avl, &search, &where)) != NULL) { 811 aux->aux_count++; 812 } else { 813 aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); 814 aux->aux_guid = vd->vdev_guid; 815 aux->aux_count = 1; 816 avl_insert(avl, aux, where); 817 } 818} 819 820void 821spa_aux_remove(vdev_t *vd, avl_tree_t *avl) 822{ 823 spa_aux_t search; 824 spa_aux_t *aux; 825 avl_index_t where; 826 827 search.aux_guid = vd->vdev_guid; 828 aux = avl_find(avl, &search, &where); 829 830 ASSERT(aux != NULL); 831 832 if (--aux->aux_count == 0) { 833 avl_remove(avl, aux); 834 kmem_free(aux, sizeof (spa_aux_t)); 835 } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { 836 aux->aux_pool = 0ULL; 837 } 838} 839 840boolean_t 841spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) 842{ 843 spa_aux_t search, *found; 844 845 search.aux_guid = guid; 846 found = avl_find(avl, &search, NULL); 847 848 if (pool) { 849 if (found) 850 *pool = found->aux_pool; 851 else 852 *pool = 0ULL; 853 } 854 855 if (refcnt) { 856 if (found) 857 *refcnt = found->aux_count; 858 else 859 *refcnt = 0; 860 } 861 862 return (found != NULL); 863} 864 865void 866spa_aux_activate(vdev_t *vd, avl_tree_t *avl) 867{ 868 spa_aux_t search, *found; 869 avl_index_t where; 870 871 search.aux_guid = vd->vdev_guid; 872 found = avl_find(avl, &search, &where); 873 ASSERT(found != NULL); 874 ASSERT(found->aux_pool == 0ULL); 875 876 found->aux_pool = spa_guid(vd->vdev_spa); 877} 878 879/* 880 * Spares are tracked globally due to the following constraints: 881 * 882 * - A spare may be part of multiple pools. 
883 * - A spare may be added to a pool even if it's actively in use within 884 * another pool. 885 * - A spare in use in any pool can only be the source of a replacement if 886 * the target is a spare in the same pool. 887 * 888 * We keep track of all spares on the system through the use of a reference 889 * counted AVL tree. When a vdev is added as a spare, or used as a replacement 890 * spare, then we bump the reference count in the AVL tree. In addition, we set 891 * the 'vdev_isspare' member to indicate that the device is a spare (active or 892 * inactive). When a spare is made active (used to replace a device in the 893 * pool), we also keep track of which pool it's been made a part of. 894 * 895 * The 'spa_spare_lock' protects the AVL tree. These functions are normally 896 * called under the spa_namespace lock as part of vdev reconfiguration. The 897 * separate spare lock exists for the status query path, which does not need to 898 * be completely consistent with respect to other vdev configuration changes. 899 */ 900 901static int 902spa_spare_compare(const void *a, const void *b) 903{ 904 return (spa_aux_compare(a, b)); 905} 906 907void 908spa_spare_add(vdev_t *vd) 909{ 910 mutex_enter(&spa_spare_lock); 911 ASSERT(!vd->vdev_isspare); 912 spa_aux_add(vd, &spa_spare_avl); 913 vd->vdev_isspare = B_TRUE; 914 mutex_exit(&spa_spare_lock); 915} 916 917void 918spa_spare_remove(vdev_t *vd) 919{ 920 mutex_enter(&spa_spare_lock); 921 ASSERT(vd->vdev_isspare); 922 spa_aux_remove(vd, &spa_spare_avl); 923 vd->vdev_isspare = B_FALSE; 924 mutex_exit(&spa_spare_lock); 925} 926 927boolean_t 928spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) 929{ 930 boolean_t found; 931 932 mutex_enter(&spa_spare_lock); 933 found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); 934 mutex_exit(&spa_spare_lock); 935 936 return (found); 937} 938 939void 940spa_spare_activate(vdev_t *vd) 941{ 942 mutex_enter(&spa_spare_lock); 943 ASSERT(vd->vdev_isspare); 944 spa_aux_activate(vd, &spa_spare_avl); 945 mutex_exit(&spa_spare_lock); 946} 947 948/* 949 * Level 2 ARC devices are tracked globally for the same reasons as spares. 950 * Cache devices currently only support one pool per cache device, and so 951 * for these devices the aux reference count is currently unused beyond 1. 
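 *
 * As an illustrative example (not part of this revision; 'guid' is a
 * hypothetical caller variable), a status query can ask whether a guid names
 * a known cache device and, if so, which pool it is active in:
 *
 *	uint64_t pool;
 *
 *	if (spa_l2cache_exists(guid, &pool) && pool != 0ULL)
 *		... the device is an active cache device in pool 'pool' ...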
952 */ 953 954static int 955spa_l2cache_compare(const void *a, const void *b) 956{ 957 return (spa_aux_compare(a, b)); 958} 959 960void 961spa_l2cache_add(vdev_t *vd) 962{ 963 mutex_enter(&spa_l2cache_lock); 964 ASSERT(!vd->vdev_isl2cache); 965 spa_aux_add(vd, &spa_l2cache_avl); 966 vd->vdev_isl2cache = B_TRUE; 967 mutex_exit(&spa_l2cache_lock); 968} 969 970void 971spa_l2cache_remove(vdev_t *vd) 972{ 973 mutex_enter(&spa_l2cache_lock); 974 ASSERT(vd->vdev_isl2cache); 975 spa_aux_remove(vd, &spa_l2cache_avl); 976 vd->vdev_isl2cache = B_FALSE; 977 mutex_exit(&spa_l2cache_lock); 978} 979 980boolean_t 981spa_l2cache_exists(uint64_t guid, uint64_t *pool) 982{ 983 boolean_t found; 984 985 mutex_enter(&spa_l2cache_lock); 986 found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); 987 mutex_exit(&spa_l2cache_lock); 988 989 return (found); 990} 991 992void 993spa_l2cache_activate(vdev_t *vd) 994{ 995 mutex_enter(&spa_l2cache_lock); 996 ASSERT(vd->vdev_isl2cache); 997 spa_aux_activate(vd, &spa_l2cache_avl); 998 mutex_exit(&spa_l2cache_lock); 999} 1000 1001/* 1002 * ========================================================================== 1003 * SPA vdev locking 1004 * ========================================================================== 1005 */ 1006 1007/* 1008 * Lock the given spa_t for the purpose of adding or removing a vdev. 1009 * Grabs the global spa_namespace_lock plus the spa config lock for writing. 1010 * It returns the next transaction group for the spa_t. 1011 */ 1012uint64_t 1013spa_vdev_enter(spa_t *spa) 1014{ 1015 mutex_enter(&spa->spa_vdev_top_lock); 1016 mutex_enter(&spa_namespace_lock); 1017 return (spa_vdev_config_enter(spa)); 1018} 1019 1020/* 1021 * Internal implementation for spa_vdev_enter(). Used when a vdev 1022 * operation requires multiple syncs (i.e. removing a device) while 1023 * keeping the spa_namespace_lock held. 1024 */ 1025uint64_t 1026spa_vdev_config_enter(spa_t *spa) 1027{ 1028 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1029 1030 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 1031 1032 return (spa_last_synced_txg(spa) + 1); 1033} 1034 1035/* 1036 * Used in combination with spa_vdev_config_enter() to allow the syncing 1037 * of multiple transactions without releasing the spa_namespace_lock. 1038 */ 1039void 1040spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) 1041{ 1042 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1043 1044 int config_changed = B_FALSE; 1045 1046 ASSERT(txg > spa_last_synced_txg(spa)); 1047 1048 spa->spa_pending_vdev = NULL; 1049 1050 /* 1051 * Reassess the DTLs. 1052 */ 1053 vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); 1054 1055 if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { 1056 config_changed = B_TRUE; 1057 spa->spa_config_generation++; 1058 } 1059 1060 /* 1061 * Verify the metaslab classes. 1062 */ 1063 ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); 1064 ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); 1065 1066 spa_config_exit(spa, SCL_ALL, spa); 1067 1068 /* 1069 * Panic the system if the specified tag requires it. This 1070 * is useful for ensuring that configurations are updated 1071 * transactionally. 1072 */ 1073 if (zio_injection_enabled) 1074 zio_handle_panic_injection(spa, tag, 0); 1075 1076 /* 1077 * Note: this txg_wait_synced() is important because it ensures 1078 * that there won't be more than one config change per txg. 1079 * This allows us to use the txg as the generation number. 
1080 */ 1081 if (error == 0) 1082 txg_wait_synced(spa->spa_dsl_pool, txg); 1083 1084 if (vd != NULL) { 1085 ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); 1086 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 1087 vdev_free(vd); 1088 spa_config_exit(spa, SCL_ALL, spa); 1089 } 1090 1091 /* 1092 * If the config changed, update the config cache. 1093 */ 1094 if (config_changed) 1095 spa_config_sync(spa, B_FALSE, B_TRUE); 1096} 1097 1098/* 1099 * Unlock the spa_t after adding or removing a vdev. Besides undoing the 1100 * locking of spa_vdev_enter(), we also want to make sure the transactions have 1101 * synced to disk, and then update the global configuration cache with the new 1102 * information. 1103 */ 1104int 1105spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) 1106{ 1107 spa_vdev_config_exit(spa, vd, txg, error, FTAG); 1108 mutex_exit(&spa_namespace_lock); 1109 mutex_exit(&spa->spa_vdev_top_lock); 1110 1111 return (error); 1112} 1113 1114/* 1115 * Lock the given spa_t for the purpose of changing vdev state. 1116 */ 1117void 1118spa_vdev_state_enter(spa_t *spa, int oplocks) 1119{ 1120 int locks = SCL_STATE_ALL | oplocks; 1121 1122 /* 1123 * Root pools may need to read from the underlying devfs filesystem 1124 * when opening up a vdev. Unfortunately, if we're holding the 1125 * SCL_ZIO lock it will result in a deadlock when we try to issue 1126 * the read from the root filesystem. Instead we "prefetch" 1127 * the associated vnodes that we need prior to opening the 1128 * underlying devices and cache them so that we can prevent 1129 * any I/O when we are doing the actual open. 1130 */ 1131 if (spa_is_root(spa)) { 1132 int low = locks & ~(SCL_ZIO - 1); 1133 int high = locks & ~low; 1134 1135 spa_config_enter(spa, high, spa, RW_WRITER); 1136 vdev_hold(spa->spa_root_vdev); 1137 spa_config_enter(spa, low, spa, RW_WRITER); 1138 } else { 1139 spa_config_enter(spa, locks, spa, RW_WRITER); 1140 } 1141 spa->spa_vdev_locks = locks; 1142} 1143 1144int 1145spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) 1146{ 1147 boolean_t config_changed = B_FALSE; 1148 1149 if (vd != NULL || error == 0) 1150 vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev, 1151 0, 0, B_FALSE); 1152 1153 if (vd != NULL) { 1154 vdev_state_dirty(vd->vdev_top); 1155 config_changed = B_TRUE; 1156 spa->spa_config_generation++; 1157 } 1158 1159 if (spa_is_root(spa)) 1160 vdev_rele(spa->spa_root_vdev); 1161 1162 ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); 1163 spa_config_exit(spa, spa->spa_vdev_locks, spa); 1164 1165 /* 1166 * If anything changed, wait for it to sync. This ensures that, 1167 * from the system administrator's perspective, zpool(1M) commands 1168 * are synchronous. This is important for things like zpool offline: 1169 * when the command completes, you expect no further I/O from ZFS. 1170 */ 1171 if (vd != NULL) 1172 txg_wait_synced(spa->spa_dsl_pool, 0); 1173 1174 /* 1175 * If the config changed, update the config cache. 
1176 */ 1177 if (config_changed) { 1178 mutex_enter(&spa_namespace_lock); 1179 spa_config_sync(spa, B_FALSE, B_TRUE); 1180 mutex_exit(&spa_namespace_lock); 1181 } 1182 1183 return (error); 1184} 1185 1186/* 1187 * ========================================================================== 1188 * Miscellaneous functions 1189 * ========================================================================== 1190 */ 1191 1192void 1193spa_activate_mos_feature(spa_t *spa, const char *feature) 1194{ 1195 (void) nvlist_add_boolean(spa->spa_label_features, feature); 1196 vdev_config_dirty(spa->spa_root_vdev); 1197} 1198 1199void 1200spa_deactivate_mos_feature(spa_t *spa, const char *feature) 1201{ 1202 (void) nvlist_remove_all(spa->spa_label_features, feature); 1203 vdev_config_dirty(spa->spa_root_vdev); 1204} 1205 1206/* 1207 * Rename a spa_t. 1208 */ 1209int 1210spa_rename(const char *name, const char *newname) 1211{ 1212 spa_t *spa; 1213 int err; 1214 1215 /* 1216 * Lookup the spa_t and grab the config lock for writing. We need to 1217 * actually open the pool so that we can sync out the necessary labels. 1218 * It's OK to call spa_open() with the namespace lock held because we 1219 * allow recursive calls for other reasons. 1220 */ 1221 mutex_enter(&spa_namespace_lock); 1222 if ((err = spa_open(name, &spa, FTAG)) != 0) { 1223 mutex_exit(&spa_namespace_lock); 1224 return (err); 1225 } 1226 1227 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1228 1229 avl_remove(&spa_namespace_avl, spa); 1230 (void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name)); 1231 avl_add(&spa_namespace_avl, spa); 1232 1233 /* 1234 * Sync all labels to disk with the new names by marking the root vdev 1235 * dirty and waiting for it to sync. It will pick up the new pool name 1236 * during the sync. 1237 */ 1238 vdev_config_dirty(spa->spa_root_vdev); 1239 1240 spa_config_exit(spa, SCL_ALL, FTAG); 1241 1242 txg_wait_synced(spa->spa_dsl_pool, 0); 1243 1244 /* 1245 * Sync the updated config cache. 1246 */ 1247 spa_config_sync(spa, B_FALSE, B_TRUE); 1248 1249 spa_close(spa, FTAG); 1250 1251 mutex_exit(&spa_namespace_lock); 1252 1253 return (0); 1254} 1255 1256/* 1257 * Return the spa_t associated with given pool_guid, if it exists. If 1258 * device_guid is non-zero, determine whether the pool exists *and* contains 1259 * a device with the specified device_guid. 1260 */ 1261spa_t * 1262spa_by_guid(uint64_t pool_guid, uint64_t device_guid) 1263{ 1264 spa_t *spa; 1265 avl_tree_t *t = &spa_namespace_avl; 1266 1267 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1268 1269 for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { 1270 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 1271 continue; 1272 if (spa->spa_root_vdev == NULL) 1273 continue; 1274 if (spa_guid(spa) == pool_guid) { 1275 if (device_guid == 0) 1276 break; 1277 1278 if (vdev_lookup_by_guid(spa->spa_root_vdev, 1279 device_guid) != NULL) 1280 break; 1281 1282 /* 1283 * Check any devices we may be in the process of adding. 1284 */ 1285 if (spa->spa_pending_vdev) { 1286 if (vdev_lookup_by_guid(spa->spa_pending_vdev, 1287 device_guid) != NULL) 1288 break; 1289 } 1290 } 1291 } 1292 1293 return (spa); 1294} 1295 1296/* 1297 * Determine whether a pool with the given pool_guid exists. 
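 * For example, spa_guid_exists(pool_guid, 0) only asks whether the pool
 * itself is present, while a non-zero device_guid additionally requires that
 * the device (or a vdev currently being added) be found in that pool.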
1298 */ 1299boolean_t 1300spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) 1301{ 1302 return (spa_by_guid(pool_guid, device_guid) != NULL); 1303} 1304 1305char * 1306spa_strdup(const char *s) 1307{ 1308 size_t len; 1309 char *new; 1310 1311 len = strlen(s); 1312 new = kmem_alloc(len + 1, KM_SLEEP); 1313 bcopy(s, new, len); 1314 new[len] = '\0'; 1315 1316 return (new); 1317} 1318 1319void 1320spa_strfree(char *s) 1321{ 1322 kmem_free(s, strlen(s) + 1); 1323} 1324 1325uint64_t 1326spa_get_random(uint64_t range) 1327{ 1328 uint64_t r; 1329 1330 ASSERT(range != 0); 1331 1332 (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); 1333 1334 return (r % range); 1335} 1336 1337uint64_t 1338spa_generate_guid(spa_t *spa) 1339{ 1340 uint64_t guid = spa_get_random(-1ULL); 1341 1342 if (spa != NULL) { 1343 while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) 1344 guid = spa_get_random(-1ULL); 1345 } else { 1346 while (guid == 0 || spa_guid_exists(guid, 0)) 1347 guid = spa_get_random(-1ULL); 1348 } 1349 1350 return (guid); 1351} 1352 1353void 1354sprintf_blkptr(char *buf, const blkptr_t *bp) 1355{ 1356 char type[256]; 1357 char *checksum = NULL; 1358 char *compress = NULL; 1359 1360 if (bp != NULL) { 1361 if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { 1362 dmu_object_byteswap_t bswap = 1363 DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); 1364 (void) snprintf(type, sizeof (type), "bswap %s %s", 1365 DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? 1366 "metadata" : "data", 1367 dmu_ot_byteswap[bswap].ob_name); 1368 } else { 1369 (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, 1370 sizeof (type)); 1371 } 1372 checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; 1373 compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; 1374 } 1375 1376 SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress); 1377} 1378 1379void 1380spa_freeze(spa_t *spa) 1381{ 1382 uint64_t freeze_txg = 0; 1383 1384 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1385 if (spa->spa_freeze_txg == UINT64_MAX) { 1386 freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; 1387 spa->spa_freeze_txg = freeze_txg; 1388 } 1389 spa_config_exit(spa, SCL_ALL, FTAG); 1390 if (freeze_txg != 0) 1391 txg_wait_synced(spa_get_dsl(spa), freeze_txg); 1392} 1393 1394void 1395zfs_panic_recover(const char *fmt, ...) 1396{ 1397 va_list adx; 1398 1399 va_start(adx, fmt); 1400 vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); 1401 va_end(adx); 1402} 1403 1404/* 1405 * This is a stripped-down version of strtoull, suitable only for converting 1406 * lowercase hexadecimal numbers that don't overflow. 
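 *
 * For example, zfs_strtonum("1a2b", &end) returns 0x1a2b and leaves 'end'
 * pointing at the terminating NUL, while zfs_strtonum("10x", &end) returns
 * 0x10 and leaves 'end' pointing at the 'x'.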
1407 */ 1408uint64_t 1409zfs_strtonum(const char *str, char **nptr) 1410{ 1411 uint64_t val = 0; 1412 char c; 1413 int digit; 1414 1415 while ((c = *str) != '\0') { 1416 if (c >= '0' && c <= '9') 1417 digit = c - '0'; 1418 else if (c >= 'a' && c <= 'f') 1419 digit = 10 + c - 'a'; 1420 else 1421 break; 1422 1423 val *= 16; 1424 val += digit; 1425 1426 str++; 1427 } 1428 1429 if (nptr) 1430 *nptr = (char *)str; 1431 1432 return (val); 1433} 1434 1435/* 1436 * ========================================================================== 1437 * Accessor functions 1438 * ========================================================================== 1439 */ 1440 1441boolean_t 1442spa_shutting_down(spa_t *spa) 1443{ 1444 return (spa->spa_async_suspended); 1445} 1446 1447dsl_pool_t * 1448spa_get_dsl(spa_t *spa) 1449{ 1450 return (spa->spa_dsl_pool); 1451} 1452 1453boolean_t 1454spa_is_initializing(spa_t *spa) 1455{ 1456 return (spa->spa_is_initializing); 1457} 1458 1459blkptr_t * 1460spa_get_rootblkptr(spa_t *spa) 1461{ 1462 return (&spa->spa_ubsync.ub_rootbp); 1463} 1464 1465void 1466spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) 1467{ 1468 spa->spa_uberblock.ub_rootbp = *bp; 1469} 1470 1471void 1472spa_altroot(spa_t *spa, char *buf, size_t buflen) 1473{ 1474 if (spa->spa_root == NULL) 1475 buf[0] = '\0'; 1476 else 1477 (void) strncpy(buf, spa->spa_root, buflen); 1478} 1479 1480int 1481spa_sync_pass(spa_t *spa) 1482{ 1483 return (spa->spa_sync_pass); 1484} 1485 1486char * 1487spa_name(spa_t *spa) 1488{ 1489 return (spa->spa_name); 1490} 1491 1492uint64_t 1493spa_guid(spa_t *spa) 1494{ 1495 dsl_pool_t *dp = spa_get_dsl(spa); 1496 uint64_t guid; 1497 1498 /* 1499 * If we fail to parse the config during spa_load(), we can go through 1500 * the error path (which posts an ereport) and end up here with no root 1501 * vdev. We stash the original pool guid in 'spa_config_guid' to handle 1502 * this case. 1503 */ 1504 if (spa->spa_root_vdev == NULL) 1505 return (spa->spa_config_guid); 1506 1507 guid = spa->spa_last_synced_guid != 0 ? 1508 spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; 1509 1510 /* 1511 * Return the most recently synced out guid unless we're 1512 * in syncing context. 1513 */ 1514 if (dp && dsl_pool_sync_context(dp)) 1515 return (spa->spa_root_vdev->vdev_guid); 1516 else 1517 return (guid); 1518} 1519 1520uint64_t 1521spa_load_guid(spa_t *spa) 1522{ 1523 /* 1524 * This is a GUID that exists solely as a reference for the 1525 * purposes of the arc. It is generated at load time, and 1526 * is never written to persistent storage. 
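 * Unlike the persistent pool guid returned by spa_guid(), it is therefore
 * generated anew each time the pool is loaded.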
1527 */ 1528 return (spa->spa_load_guid); 1529} 1530 1531uint64_t 1532spa_last_synced_txg(spa_t *spa) 1533{ 1534 return (spa->spa_ubsync.ub_txg); 1535} 1536 1537uint64_t 1538spa_first_txg(spa_t *spa) 1539{ 1540 return (spa->spa_first_txg); 1541} 1542 1543uint64_t 1544spa_syncing_txg(spa_t *spa) 1545{ 1546 return (spa->spa_syncing_txg); 1547} 1548 1549pool_state_t 1550spa_state(spa_t *spa) 1551{ 1552 return (spa->spa_state); 1553} 1554 1555spa_load_state_t 1556spa_load_state(spa_t *spa) 1557{ 1558 return (spa->spa_load_state); 1559} 1560 1561uint64_t 1562spa_freeze_txg(spa_t *spa) 1563{ 1564 return (spa->spa_freeze_txg); 1565} 1566 1567/* ARGSUSED */ 1568uint64_t 1569spa_get_asize(spa_t *spa, uint64_t lsize) 1570{ 1571 return (lsize * spa_asize_inflation); 1572} 1573 1574uint64_t 1575spa_get_dspace(spa_t *spa) 1576{ 1577 return (spa->spa_dspace); 1578} 1579 1580void 1581spa_update_dspace(spa_t *spa) 1582{ 1583 spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + 1584 ddt_get_dedup_dspace(spa); 1585} 1586 1587/* 1588 * Return the failure mode that has been set to this pool. The default 1589 * behavior will be to block all I/Os when a complete failure occurs. 1590 */ 1591uint8_t 1592spa_get_failmode(spa_t *spa) 1593{ 1594 return (spa->spa_failmode); 1595} 1596 1597boolean_t 1598spa_suspended(spa_t *spa) 1599{ 1600 return (spa->spa_suspended); 1601} 1602 1603uint64_t 1604spa_version(spa_t *spa) 1605{ 1606 return (spa->spa_ubsync.ub_version); 1607} 1608 1609boolean_t 1610spa_deflate(spa_t *spa) 1611{ 1612 return (spa->spa_deflate); 1613} 1614 1615metaslab_class_t * 1616spa_normal_class(spa_t *spa) 1617{ 1618 return (spa->spa_normal_class); 1619} 1620 1621metaslab_class_t * 1622spa_log_class(spa_t *spa) 1623{ 1624 return (spa->spa_log_class); 1625} 1626 1627int 1628spa_max_replication(spa_t *spa) 1629{ 1630 /* 1631 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to 1632 * handle BPs with more than one DVA allocated. Set our max 1633 * replication level accordingly. 
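 * For example, a pool whose on-disk version predates SPA_VERSION_DITTO_BLOCKS
 * is limited to a single copy, while newer pools get
 * MIN(SPA_DVAS_PER_BP, spa_max_replication_override), i.e. three copies
 * unless the override tunable lowers it.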
1634 */ 1635 if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) 1636 return (1); 1637 return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); 1638} 1639 1640int 1641spa_prev_software_version(spa_t *spa) 1642{ 1643 return (spa->spa_prev_software_version); 1644} 1645 1646uint64_t 1647spa_deadman_synctime(spa_t *spa) 1648{ 1649 return (spa->spa_deadman_synctime); 1650} 1651 1652uint64_t 1653dva_get_dsize_sync(spa_t *spa, const dva_t *dva) 1654{ 1655 uint64_t asize = DVA_GET_ASIZE(dva); 1656 uint64_t dsize = asize; 1657 1658 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 1659 1660 if (asize != 0 && spa->spa_deflate) { 1661 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); 1662 dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; 1663 } 1664 1665 return (dsize); 1666} 1667 1668uint64_t 1669bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) 1670{ 1671 uint64_t dsize = 0; 1672 1673 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 1674 dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); 1675 1676 return (dsize); 1677} 1678 1679uint64_t 1680bp_get_dsize(spa_t *spa, const blkptr_t *bp) 1681{ 1682 uint64_t dsize = 0; 1683 1684 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 1685 1686 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 1687 dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); 1688 1689 spa_config_exit(spa, SCL_VDEV, FTAG); 1690 1691 return (dsize); 1692} 1693 1694/* 1695 * ========================================================================== 1696 * Initialization and Termination 1697 * ========================================================================== 1698 */ 1699 1700static int 1701spa_name_compare(const void *a1, const void *a2) 1702{ 1703 const spa_t *s1 = a1; 1704 const spa_t *s2 = a2; 1705 int s; 1706 1707 s = strcmp(s1->spa_name, s2->spa_name); 1708 if (s > 0) 1709 return (1); 1710 if (s < 0) 1711 return (-1); 1712 return (0); 1713} 1714 1715int 1716spa_busy(void) 1717{ 1718 return (spa_active_count); 1719} 1720 1721void 1722spa_boot_init() 1723{ 1724 spa_config_load(); 1725} 1726 1727#ifdef _KERNEL 1728EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); 1729#endif 1730 1731void 1732spa_init(int mode) 1733{ 1734 mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); 1735 mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); 1736 mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); 1737 cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); 1738 1739 avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), 1740 offsetof(spa_t, spa_avl)); 1741 1742 avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), 1743 offsetof(spa_aux_t, aux_avl)); 1744 1745 avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), 1746 offsetof(spa_aux_t, aux_avl)); 1747 1748 spa_mode_global = mode; 1749 1750#ifdef illumos 1751#ifdef _KERNEL 1752 spa_arch_init(); 1753#else 1754 if (spa_mode_global != FREAD && dprintf_find_string("watch")) { 1755 arc_procfd = open("/proc/self/ctl", O_WRONLY); 1756 if (arc_procfd == -1) { 1757 perror("could not enable watchpoints: " 1758 "opening /proc/self/ctl failed: "); 1759 } else { 1760 arc_watch = B_TRUE; 1761 } 1762 } 1763#endif 1764#endif /* illumos */ 1765 refcount_sysinit(); 1766 unique_init(); 1767 range_tree_init(); 1768 zio_init(); 1769 lz4_init(); 1770 dmu_init(); 1771 zil_init(); 1772 vdev_cache_stat_init(); 1773 zfs_prop_init(); 1774 zpool_prop_init(); 1775 zpool_feature_init(); 1776 spa_config_load(); 1777 l2arc_start(); 1778#ifndef illumos 1779#ifdef _KERNEL 1780 zfs_deadman_init(); 1781#endif 
1782#endif /* !illumos */ 1783} 1784 1785void 1786spa_fini(void) 1787{ 1788 l2arc_stop(); 1789 1790 spa_evict_all(); 1791 1792 vdev_cache_stat_fini(); 1793 zil_fini(); 1794 dmu_fini(); 1795 lz4_fini(); 1796 zio_fini(); 1797 range_tree_fini(); 1798 unique_fini(); 1799 refcount_fini(); 1800 1801 avl_destroy(&spa_namespace_avl); 1802 avl_destroy(&spa_spare_avl); 1803 avl_destroy(&spa_l2cache_avl); 1804 1805 cv_destroy(&spa_namespace_cv); 1806 mutex_destroy(&spa_namespace_lock); 1807 mutex_destroy(&spa_spare_lock); 1808 mutex_destroy(&spa_l2cache_lock); 1809} 1810 1811/* 1812 * Return whether this pool has slogs. No locking needed. 1813 * It's not a problem if the wrong answer is returned as it's only for 1814 * performance and not correctness 1815 */ 1816boolean_t 1817spa_has_slogs(spa_t *spa) 1818{ 1819 return (spa->spa_log_class->mc_rotor != NULL); 1820} 1821 1822spa_log_state_t 1823spa_get_log_state(spa_t *spa) 1824{ 1825 return (spa->spa_log_state); 1826} 1827 1828void 1829spa_set_log_state(spa_t *spa, spa_log_state_t state) 1830{ 1831 spa->spa_log_state = state; 1832} 1833 1834boolean_t 1835spa_is_root(spa_t *spa) 1836{ 1837 return (spa->spa_is_root); 1838} 1839 1840boolean_t 1841spa_writeable(spa_t *spa) 1842{ 1843 return (!!(spa->spa_mode & FWRITE)); 1844} 1845 1846int 1847spa_mode(spa_t *spa) 1848{ 1849 return (spa->spa_mode); 1850} 1851 1852uint64_t 1853spa_bootfs(spa_t *spa) 1854{ 1855 return (spa->spa_bootfs); 1856} 1857 1858uint64_t 1859spa_delegation(spa_t *spa) 1860{ 1861 return (spa->spa_delegation); 1862} 1863 1864objset_t * 1865spa_meta_objset(spa_t *spa) 1866{ 1867 return (spa->spa_meta_objset); 1868} 1869 1870enum zio_checksum 1871spa_dedup_checksum(spa_t *spa) 1872{ 1873 return (spa->spa_dedup_checksum); 1874} 1875 1876/* 1877 * Reset pool scan stat per scan pass (or reboot). 1878 */ 1879void 1880spa_scan_stat_init(spa_t *spa) 1881{ 1882 /* data not stored on disk */ 1883 spa->spa_scan_pass_start = gethrestime_sec(); 1884 spa->spa_scan_pass_exam = 0; 1885 vdev_scan_stat_init(spa->spa_root_vdev); 1886} 1887 1888/* 1889 * Get scan stats for zpool status reports 1890 */ 1891int 1892spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) 1893{ 1894 dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; 1895 1896 if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) 1897 return (SET_ERROR(ENOENT)); 1898 bzero(ps, sizeof (pool_scan_stat_t)); 1899 1900 /* data stored on disk */ 1901 ps->pss_func = scn->scn_phys.scn_func; 1902 ps->pss_start_time = scn->scn_phys.scn_start_time; 1903 ps->pss_end_time = scn->scn_phys.scn_end_time; 1904 ps->pss_to_examine = scn->scn_phys.scn_to_examine; 1905 ps->pss_examined = scn->scn_phys.scn_examined; 1906 ps->pss_to_process = scn->scn_phys.scn_to_process; 1907 ps->pss_processed = scn->scn_phys.scn_processed; 1908 ps->pss_errors = scn->scn_phys.scn_errors; 1909 ps->pss_state = scn->scn_phys.scn_state; 1910 1911 /* data not stored on disk */ 1912 ps->pss_pass_start = spa->spa_scan_pass_start; 1913 ps->pss_pass_exam = spa->spa_scan_pass_exam; 1914 1915 return (0); 1916} 1917 1918boolean_t 1919spa_debug_enabled(spa_t *spa) 1920{ 1921 return (spa->spa_debug); 1922} 1923
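/*
 * Editor's note: an illustrative sketch (not part of this revision) of how a
 * status consumer is expected to read the scan statistics gathered above,
 * treating ENOENT as "no scan has ever been run on this pool":
 *
 *	pool_scan_stat_t ps;
 *
 *	if (spa_scan_get_stats(spa, &ps) == 0)
 *		... report ps.pss_func, ps.pss_examined, ps.pss_errors ...
 */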