spa_misc.c revision 268650
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/spa_boot.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_scan.h>
#include <sys/fs/zfs.h>
#include <sys/metaslab_impl.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include "zfs_prop.h"
#include "zfeature_common.h"

/*
 * SPA locking
 *
 * There are four basic locks for managing spa_t structures:
 *
 * spa_namespace_lock (global mutex)
 *
 *	This lock must be acquired to do any of the following:
 *
 *	- Lookup a spa_t by name
 *	- Add or remove a spa_t from the namespace
 *	- Increase spa_refcount from non-zero
 *	- Check if spa_refcount is zero
 *	- Rename a spa_t
 *	- add/remove/attach/detach devices
 *	- Held for the duration of create/destroy/import/export
 *
 *	It does not need to handle recursion.  A create or destroy may
 *	reference objects (files or zvols) in other pools, but by
 *	definition they must have an existing reference, and will never need
 *	to lookup a spa_t by name.
 *
 * spa_refcount (per-spa refcount_t protected by mutex)
 *
 *	This reference count keeps track of any active users of the spa_t.
 *	The spa_t cannot be destroyed or freed while this is non-zero.
 *	Internally, the refcount is never really 'zero' - opening a pool
 *	implicitly keeps some references in the DMU.  Internally we check
 *	against spa_minref, but present the image of a zero/non-zero value
 *	to consumers.
 *
 * spa_config_lock[] (per-spa array of rwlocks)
 *
 *	This protects the spa_t from config changes, and must be held in
 *	the following circumstances:
 *
 *	- RW_READER to perform I/O to the spa
 *	- RW_WRITER to change the vdev config
 *
 * The locking order is fairly straightforward:
 *
 *		spa_namespace_lock	->	spa_refcount
 *
 *	The namespace lock must be acquired to increase the refcount from 0
 *	or to check if it is zero.
 *
 *		spa_refcount		->	spa_config_lock[]
 *
 *	There must be at least one valid reference on the spa_t to acquire
 *	the config lock.
 *
 *		spa_namespace_lock	->	spa_config_lock[]
 *
 *	The namespace lock must always be taken before the config lock.
 *
 *
 * The spa_namespace_lock can be acquired directly and is globally visible.
 *
 * The namespace is manipulated using the following functions, all of which
 * require the spa_namespace_lock to be held.
 *
 *	spa_lookup()		Lookup a spa_t by name.
 *
 *	spa_add()		Create a new spa_t in the namespace.
 *
 *	spa_remove()		Remove a spa_t from the namespace.  This also
 *				frees up any memory associated with the spa_t.
 *
 *	spa_next()		Returns the next spa_t in the system, or the
 *				first if NULL is passed.
 *
 *	spa_evict_all()		Shutdown and remove all spa_t structures in
 *				the system.
 *
 *	spa_guid_exists()	Determine whether a pool/device guid exists.
 *
 * The spa_refcount is manipulated using the following functions:
 *
 *	spa_open_ref()		Adds a reference to the given spa_t.  Must be
 *				called with spa_namespace_lock held if the
 *				refcount is currently zero.
 *
 *	spa_close()		Remove a reference from the spa_t.  This will
 *				not free the spa_t or remove it from the
 *				namespace.  No locking is required.
 *
 *	spa_refcount_zero()	Returns true if the refcount is currently
 *				zero.  Must be called with spa_namespace_lock
 *				held.
 *
 * The spa_config_lock[] is an array of rwlocks, ordered as follows:
 * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
 * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
 *
 * To read the configuration, it suffices to hold one of these locks as reader.
 * To modify the configuration, you must hold all locks as writer.  To modify
 * vdev state without altering the vdev tree's topology (e.g. online/offline),
 * you must hold SCL_STATE and SCL_ZIO as writer.
 *
 * We use these distinct config locks to avoid recursive lock entry.
 * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
 * block allocations (SCL_ALLOC), which may require reading space maps
 * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
 *
 * The spa config locks cannot be normal rwlocks because we need the
 * ability to hand off ownership.  For example, SCL_ZIO is acquired
 * by the issuing thread and later released by an interrupt thread.
 * They do, however, obey the usual write-wanted semantics to prevent
 * writer (i.e. system administrator) starvation.
 *
 * The lock acquisition rules are as follows:
 *
 * SCL_CONFIG
 *	Protects changes to the vdev tree topology, such as vdev
 *	add/remove/attach/detach.  Protects the dirty config list
 *	(spa_config_dirty_list) and the set of spares and l2arc devices.
 *
 * SCL_STATE
 *	Protects changes to pool state and vdev state, such as vdev
 *	online/offline/fault/degrade/clear.  Protects the dirty state list
 *	(spa_state_dirty_list) and global pool state (spa_state).
 *
 * SCL_ALLOC
 *	Protects changes to metaslab groups and classes.
 *	Held as reader by metaslab_alloc() and metaslab_claim().
 *
 * SCL_ZIO
 *	Held by bp-level zios (those which have no io_vd upon entry)
 *	to prevent changes to the vdev tree.  The bp-level zio implicitly
 *	protects all of its vdev child zios, which do not hold SCL_ZIO.
 *
 * SCL_FREE
 *	Protects changes to metaslab groups and classes.
 *	Held as reader by metaslab_free().  SCL_FREE is distinct from
 *	SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
 *	blocks in zio_done() while another i/o that holds either
 *	SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
 *
 * SCL_VDEV
 *	Held as reader to prevent changes to the vdev tree during trivial
 *	inquiries such as bp_get_dsize().  SCL_VDEV is distinct from the
 *	other locks, and lower than all of them, to ensure that it's safe
 *	to acquire regardless of caller context.
 *
 * In addition, the following rules apply:
 *
 * (a)	spa_props_lock protects pool properties, spa_config and
 *	spa_config_list.  The lock ordering is SCL_CONFIG > spa_props_lock.
 *
 * (b)	I/O operations on leaf vdevs.  For any zio operation that takes
 *	an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
 *	or zio_write_phys() -- the caller must ensure that the config cannot
 *	change in the interim, and that the vdev cannot be reopened.
 *	SCL_STATE as reader suffices for both.
 *
 * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
 *
 *	spa_vdev_enter()	Acquire the namespace lock and the config lock
 *				for writing.
 *
 *	spa_vdev_exit()		Release the config lock, wait for all I/O
 *				to complete, sync the updated configs to the
 *				cache, and release the namespace lock.
 *
 * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
 * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
 * locking is, always, based on spa_namespace_lock and spa_config_lock[].
 *
 * spa_rename() is also implemented within this file since it requires
 * manipulation of the namespace.
 */
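
/*
 * Illustrative sketch (not part of the original file): a trivial reader
 * of the vdev tree brackets its work with one config lock as reader,
 * per the rules above.  The helper name below is hypothetical and the
 * block is compiled out; it only demonstrates the
 * spa_config_enter()/spa_config_exit() API.
 */
#if 0	/* example only */
static uint64_t
example_count_top_vdevs(spa_t *spa)
{
	uint64_t children;

	/* SCL_VDEV as reader suffices for trivial tree inquiries. */
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	children = spa->spa_root_vdev->vdev_children;
	spa_config_exit(spa, SCL_VDEV, FTAG);

	return (children);
}
#endif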

static avl_tree_t spa_namespace_avl;
kmutex_t spa_namespace_lock;
static kcondvar_t spa_namespace_cv;
static int spa_active_count;
int spa_max_replication_override = SPA_DVAS_PER_BP;

static kmutex_t spa_spare_lock;
static avl_tree_t spa_spare_avl;
static kmutex_t spa_l2cache_lock;
static avl_tree_t spa_l2cache_avl;

kmem_cache_t *spa_buffer_pool;
int spa_mode_global;

#ifdef ZFS_DEBUG
/* Everything except dprintf and spa is on by default in debug builds */
int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA);
#else
int zfs_flags = 0;
#endif
SYSCTL_DECL(_debug);
TUNABLE_INT("debug.zfs_flags", &zfs_flags);
SYSCTL_INT(_debug, OID_AUTO, zfs_flags, CTLFLAG_RWTUN, &zfs_flags, 0,
    "ZFS debug flags.");

/*
 * zfs_recover can be set to nonzero to attempt to recover from
 * otherwise-fatal errors, typically caused by on-disk corruption.  When
 * set, calls to zfs_panic_recover() will turn into warning messages.
 * This should only be used as a last resort, as it typically results
 * in leaked space, or worse.
 */
boolean_t zfs_recover = B_FALSE;
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.recover", &zfs_recover);
SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0,
    "Try to recover from otherwise-fatal errors.");

/*
 * If destroy encounters an EIO while reading metadata (e.g. indirect
 * blocks), space referenced by the missing metadata can not be freed.
 * Normally this causes the background destroy to become "stalled", as
 * it is unable to make forward progress.  While in this stalled state,
 * all remaining space to free from the error-encountering filesystem is
 * "temporarily leaked".  Set this flag to cause it to ignore the EIO,
 * permanently leak the space from indirect blocks that can not be read,
 * and continue to free everything else that it can.
 *
 * The default, "stalling" behavior is useful if the storage partially
 * fails (i.e. some but not all i/os fail), and then later recovers.  In
 * this case, we will be able to continue pool operations while it is
 * partially failed, and when it recovers, we can continue to free the
 * space, with no leaks.  However, note that this case is actually
 * fairly rare.
 *
 * Typically pools either (a) fail completely (but perhaps temporarily,
 * e.g. a top-level vdev going offline), or (b) have localized,
 * permanent errors (e.g. disk returns the wrong data due to bit flip or
 * firmware bug).  In case (a), this setting does not matter because the
 * pool will be suspended and the sync thread will not be able to make
 * forward progress regardless.  In case (b), because the error is
 * permanent, the best we can do is leak the minimum amount of space,
 * which is what setting this flag will do.  Therefore, it is reasonable
 * for this flag to normally be set, but we chose the more conservative
 * approach of not setting it, so that there is no possibility of
 * leaking space in the "partial temporary" failure case.
 */
boolean_t zfs_free_leak_on_eio = B_FALSE;

/*
 * Expiration time in milliseconds.  This value has two meanings.  First it is
 * used to determine when the spa_deadman() logic should fire.  By default the
 * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
 * Secondly, the value determines if an I/O is considered "hung".  Any I/O that
 * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
 * in a system panic.
 */
uint64_t zfs_deadman_synctime_ms = 1000000ULL;
TUNABLE_QUAD("vfs.zfs.deadman_synctime_ms", &zfs_deadman_synctime_ms);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN,
    &zfs_deadman_synctime_ms, 0,
    "Stalled ZFS I/O expiration time in milliseconds");

/*
 * Check time in milliseconds.  This defines the frequency at which we check
 * for hung I/O.
 */
uint64_t zfs_deadman_checktime_ms = 5000ULL;
TUNABLE_QUAD("vfs.zfs.deadman_checktime_ms", &zfs_deadman_checktime_ms);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN,
    &zfs_deadman_checktime_ms, 0,
    "Period of checks for stalled ZFS I/O in milliseconds");

/*
 * Default value of -1 for zfs_deadman_enabled is resolved in
 * zfs_deadman_init().
 */
int zfs_deadman_enabled = -1;
TUNABLE_INT("vfs.zfs.deadman_enabled", &zfs_deadman_enabled);
SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN,
    &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");

/*
 * The worst case is single-sector max-parity RAID-Z blocks, in which
 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
 * times the size; so just assume that.  Add to this the fact that
 * we can have up to 3 DVAs per bp, and one more factor of 2 because
 * the block may be dittoed with up to 3 DVAs by ddt_sync().  All together,
 * the worst case is:
 *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
 */
int spa_asize_inflation = 24;
TUNABLE_INT("vfs.zfs.spa_asize_inflation", &spa_asize_inflation);
SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN,
    &spa_asize_inflation, 0,
    "Worst case inflation factor for single sector writes");

#ifndef illumos
#ifdef _KERNEL
static void
zfs_deadman_init()
{
	/*
	 * Unless we are on i386 or amd64 hardware (and not inside a
	 * virtual machine), disable the ZFS deadman thread by default.
	 */
	if (zfs_deadman_enabled == -1) {
#if defined(__amd64__) || defined(__i386__)
		zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0;
#else
		zfs_deadman_enabled = 0;
#endif
	}
}
#endif	/* _KERNEL */
#endif	/* !illumos */

/*
 * ==========================================================================
 * SPA config locking
 * ==========================================================================
 */
static void
spa_config_lock_init(spa_t *spa)
{
	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
		refcount_create_untracked(&scl->scl_count);
		scl->scl_writer = NULL;
		scl->scl_write_wanted = 0;
	}
}

static void
spa_config_lock_destroy(spa_t *spa)
{
	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		mutex_destroy(&scl->scl_lock);
		cv_destroy(&scl->scl_cv);
		refcount_destroy(&scl->scl_count);
		ASSERT(scl->scl_writer == NULL);
		ASSERT(scl->scl_write_wanted == 0);
	}
}

int
spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
{
	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (!(locks & (1 << i)))
			continue;
		mutex_enter(&scl->scl_lock);
		if (rw == RW_READER) {
			if (scl->scl_writer || scl->scl_write_wanted) {
				mutex_exit(&scl->scl_lock);
				spa_config_exit(spa, locks ^ (1 << i), tag);
				return (0);
			}
		} else {
			ASSERT(scl->scl_writer != curthread);
			if (!refcount_is_zero(&scl->scl_count)) {
				mutex_exit(&scl->scl_lock);
				spa_config_exit(spa, locks ^ (1 << i), tag);
				return (0);
			}
			scl->scl_writer = curthread;
		}
		(void) refcount_add(&scl->scl_count, tag);
		mutex_exit(&scl->scl_lock);
	}
	return (1);
}

void
spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
{
	int wlocks_held = 0;

	ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);

	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (scl->scl_writer == curthread)
			wlocks_held |= (1 << i);
		if (!(locks & (1 << i)))
			continue;
		mutex_enter(&scl->scl_lock);
		if (rw == RW_READER) {
			while (scl->scl_writer || scl->scl_write_wanted) {
				cv_wait(&scl->scl_cv, &scl->scl_lock);
			}
		} else {
			ASSERT(scl->scl_writer != curthread);
			while (!refcount_is_zero(&scl->scl_count)) {
				scl->scl_write_wanted++;
				cv_wait(&scl->scl_cv, &scl->scl_lock);
				scl->scl_write_wanted--;
			}
			scl->scl_writer = curthread;
		}
		(void) refcount_add(&scl->scl_count, tag);
		mutex_exit(&scl->scl_lock);
	}
	ASSERT(wlocks_held <= locks);
}

void
spa_config_exit(spa_t *spa, int locks, void *tag)
{
	for (int i = SCL_LOCKS - 1; i >= 0; i--) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (!(locks & (1 << i)))
			continue;
		mutex_enter(&scl->scl_lock);
		ASSERT(!refcount_is_zero(&scl->scl_count));
		if (refcount_remove(&scl->scl_count, tag) == 0) {
			ASSERT(scl->scl_writer == NULL ||
			    scl->scl_writer == curthread);
			scl->scl_writer = NULL;	/* OK in either case */
			cv_broadcast(&scl->scl_cv);
		}
		mutex_exit(&scl->scl_lock);
	}
}

int
spa_config_held(spa_t *spa, int locks, krw_t rw)
{
	int locks_held = 0;

	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (!(locks & (1 << i)))
			continue;
		if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) ||
		    (rw == RW_WRITER && scl->scl_writer == curthread))
			locks_held |= 1 << i;
	}

	return (locks_held);
}

/*
 * ==========================================================================
 * SPA namespace functions
 * ==========================================================================
 */

/*
 * Lookup the named spa_t in the AVL tree.  The spa_namespace_lock must be
 * held.  Returns NULL if no matching spa_t is found.
 */
spa_t *
spa_lookup(const char *name)
{
	static spa_t search;	/* spa_t is large; don't allocate on stack */
	spa_t *spa;
	avl_index_t where;
	char *cp;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));

	/*
	 * If it's a full dataset name, figure out the pool name and
	 * just use that.
	 */
	cp = strpbrk(search.spa_name, "/@#");
	if (cp != NULL)
		*cp = '\0';

	spa = avl_find(&spa_namespace_avl, &search, &where);

	return (spa);
}
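
/*
 * Illustrative sketch (not part of the original file): spa_lookup()
 * requires the namespace lock, as the big theory statement above
 * describes.  The function name is hypothetical and the block is
 * compiled out.
 */
#if 0	/* example only */
static boolean_t
example_pool_exists(const char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	spa = spa_lookup(name);
	mutex_exit(&spa_namespace_lock);

	return (spa != NULL);
}
#endif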

/*
 * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
 * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
 * looking for potentially hung I/Os.
 */
void
spa_deadman(void *arg)
{
	spa_t *spa = arg;

	/*
	 * Disable the deadman timer if the pool is suspended.
	 */
	if (spa_suspended(spa)) {
#ifdef illumos
		VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
#else
		/* Nothing; just don't schedule any future callouts. */
#endif
		return;
	}

	zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
	    (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
	    ++spa->spa_deadman_calls);
	if (zfs_deadman_enabled)
		vdev_deadman(spa->spa_root_vdev);
}

/*
 * Create an uninitialized spa_t with the given name.  Requires
 * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
 * exist by calling spa_lookup() first.
 */
spa_t *
spa_add(const char *name, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	spa_config_dirent_t *dp;
#ifdef illumos
	cyc_handler_t hdlr;
	cyc_time_t when;
#endif

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);

	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);

	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);

	for (int t = 0; t < TXG_SIZE; t++)
		bplist_create(&spa->spa_free_bplist[t]);

	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
	spa->spa_state = POOL_STATE_UNINITIALIZED;
	spa->spa_freeze_txg = UINT64_MAX;
	spa->spa_final_txg = UINT64_MAX;
	spa->spa_load_max_txg = UINT64_MAX;
	spa->spa_proc = &p0;
	spa->spa_proc_state = SPA_PROC_NONE;

#ifdef illumos
	hdlr.cyh_func = spa_deadman;
	hdlr.cyh_arg = spa;
	hdlr.cyh_level = CY_LOW_LEVEL;
#endif

	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);

#ifdef illumos
	/*
	 * This determines how often we need to check for hung I/Os after
	 * the cyclic has already fired.  Since checking for hung I/Os is
	 * an expensive operation we don't want to check too frequently.
	 * Instead wait for 5 seconds before checking again.
	 */
	when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
	when.cyt_when = CY_INFINITY;
	mutex_enter(&cpu_lock);
	spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
	mutex_exit(&cpu_lock);
#else	/* !illumos */
#ifdef _KERNEL
	callout_init(&spa->spa_deadman_cycid, CALLOUT_MPSAFE);
#endif
#endif
	refcount_create(&spa->spa_refcount);
	spa_config_lock_init(spa);

	avl_add(&spa_namespace_avl, spa);

	/*
	 * Set the alternate root, if there is one.
	 */
	if (altroot) {
		spa->spa_root = spa_strdup(altroot);
		spa_active_count++;
	}

	/*
	 * Every pool starts with the default cachefile.
	 */
	list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
	    offsetof(spa_config_dirent_t, scd_link));

	dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
	dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
	list_insert_head(&spa->spa_config_list, dp);

	VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
	    KM_SLEEP) == 0);

	if (config != NULL) {
		nvlist_t *features;

		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
		    &features) == 0) {
			VERIFY(nvlist_dup(features, &spa->spa_label_features,
			    0) == 0);
		}

		VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
	}

	if (spa->spa_label_features == NULL) {
		VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
	}

	spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);

	/*
	 * As a pool is being created, treat all features as disabled by
	 * setting SPA_FEATURE_DISABLED for all entries in the feature
	 * refcount cache.
	 */
	for (int i = 0; i < SPA_FEATURES; i++) {
		spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
	}

	return (spa);
}

/*
 * Removes a spa_t from the namespace, freeing up any memory used.  Requires
 * spa_namespace_lock.  This is called only after the spa_t has been closed
 * and deactivated.
 */
void
spa_remove(spa_t *spa)
{
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	nvlist_free(spa->spa_config_splitting);

	avl_remove(&spa_namespace_avl, spa);
	cv_broadcast(&spa_namespace_cv);

	if (spa->spa_root) {
		spa_strfree(spa->spa_root);
		spa_active_count--;
	}

	while ((dp = list_head(&spa->spa_config_list)) != NULL) {
		list_remove(&spa->spa_config_list, dp);
		if (dp->scd_path != NULL)
			spa_strfree(dp->scd_path);
		kmem_free(dp, sizeof (spa_config_dirent_t));
	}

	list_destroy(&spa->spa_config_list);

	nvlist_free(spa->spa_label_features);
	nvlist_free(spa->spa_load_info);
	spa_config_set(spa, NULL);

#ifdef illumos
	mutex_enter(&cpu_lock);
	if (spa->spa_deadman_cycid != CYCLIC_NONE)
		cyclic_remove(spa->spa_deadman_cycid);
	mutex_exit(&cpu_lock);
	spa->spa_deadman_cycid = CYCLIC_NONE;
#else	/* !illumos */
#ifdef _KERNEL
	callout_drain(&spa->spa_deadman_cycid);
#endif
#endif

	refcount_destroy(&spa->spa_refcount);

	spa_config_lock_destroy(spa);

	for (int t = 0; t < TXG_SIZE; t++)
		bplist_destroy(&spa->spa_free_bplist[t]);

	cv_destroy(&spa->spa_async_cv);
	cv_destroy(&spa->spa_proc_cv);
	cv_destroy(&spa->spa_scrub_io_cv);
	cv_destroy(&spa->spa_suspend_cv);

	mutex_destroy(&spa->spa_async_lock);
	mutex_destroy(&spa->spa_errlist_lock);
	mutex_destroy(&spa->spa_errlog_lock);
	mutex_destroy(&spa->spa_history_lock);
	mutex_destroy(&spa->spa_proc_lock);
	mutex_destroy(&spa->spa_props_lock);
	mutex_destroy(&spa->spa_scrub_lock);
	mutex_destroy(&spa->spa_suspend_lock);
	mutex_destroy(&spa->spa_vdev_top_lock);

	kmem_free(spa, sizeof (spa_t));
}

/*
 * Given a pool, return the next pool in the namespace, or NULL if there is
 * none.  If 'prev' is NULL, return the first pool.
 */
spa_t *
spa_next(spa_t *prev)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	if (prev)
		return (AVL_NEXT(&spa_namespace_avl, prev));
	else
		return (avl_first(&spa_namespace_avl));
}

/*
 * ==========================================================================
 * SPA refcount functions
 * ==========================================================================
 */

/*
 * Add a reference to the given spa_t.  Must have at least one reference, or
 * have the namespace lock held.
 */
void
spa_open_ref(spa_t *spa, void *tag)
{
	ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
	    MUTEX_HELD(&spa_namespace_lock));
	(void) refcount_add(&spa->spa_refcount, tag);
}

/*
 * Remove a reference to the given spa_t.  Must have at least one reference,
 * or have the namespace lock held.
 */
void
spa_close(spa_t *spa, void *tag)
{
	ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref ||
	    MUTEX_HELD(&spa_namespace_lock));
	(void) refcount_remove(&spa->spa_refcount, tag);
}

/*
 * Check to see if the spa refcount is zero.  Must be called with
 * spa_namespace_lock held.  We really compare against spa_minref, which is
 * the number of references acquired when opening a pool.
 */
boolean_t
spa_refcount_zero(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	return (refcount_count(&spa->spa_refcount) == spa->spa_minref);
}
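
/*
 * Illustrative sketch (not part of the original file): a caller that
 * already holds a reference (e.g. from spa_open()) may take and drop
 * additional references without the namespace lock.  The tag is
 * whatever value the caller uses to track the hold; FTAG is the usual
 * choice.  The function name is hypothetical and the block is compiled
 * out.
 */
#if 0	/* example only */
static void
example_hold_and_release(spa_t *spa)
{
	spa_open_ref(spa, FTAG);	/* spa already has >= spa_minref holds */
	/* ... use the spa_t ... */
	spa_close(spa, FTAG);
}
#endif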

/*
 * ==========================================================================
 * SPA spare and l2cache tracking
 * ==========================================================================
 */

/*
 * Hot spares and cache devices are tracked using the same code below,
 * for 'auxiliary' devices.
 */

typedef struct spa_aux {
	uint64_t	aux_guid;
	uint64_t	aux_pool;
	avl_node_t	aux_avl;
	int		aux_count;
} spa_aux_t;

static int
spa_aux_compare(const void *a, const void *b)
{
	const spa_aux_t *sa = a;
	const spa_aux_t *sb = b;

	if (sa->aux_guid < sb->aux_guid)
		return (-1);
	else if (sa->aux_guid > sb->aux_guid)
		return (1);
	else
		return (0);
}

void
spa_aux_add(vdev_t *vd, avl_tree_t *avl)
{
	avl_index_t where;
	spa_aux_t search;
	spa_aux_t *aux;

	search.aux_guid = vd->vdev_guid;
	if ((aux = avl_find(avl, &search, &where)) != NULL) {
		aux->aux_count++;
	} else {
		aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
		aux->aux_guid = vd->vdev_guid;
		aux->aux_count = 1;
		avl_insert(avl, aux, where);
	}
}

void
spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
{
	spa_aux_t search;
	spa_aux_t *aux;
	avl_index_t where;

	search.aux_guid = vd->vdev_guid;
	aux = avl_find(avl, &search, &where);

	ASSERT(aux != NULL);

	if (--aux->aux_count == 0) {
		avl_remove(avl, aux);
		kmem_free(aux, sizeof (spa_aux_t));
	} else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
		aux->aux_pool = 0ULL;
	}
}

boolean_t
spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
{
	spa_aux_t search, *found;

	search.aux_guid = guid;
	found = avl_find(avl, &search, NULL);

	if (pool) {
		if (found)
			*pool = found->aux_pool;
		else
			*pool = 0ULL;
	}

	if (refcnt) {
		if (found)
			*refcnt = found->aux_count;
		else
			*refcnt = 0;
	}

	return (found != NULL);
}

void
spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
{
	spa_aux_t search, *found;
	avl_index_t where;

	search.aux_guid = vd->vdev_guid;
	found = avl_find(avl, &search, &where);
	ASSERT(found != NULL);
	ASSERT(found->aux_pool == 0ULL);

	found->aux_pool = spa_guid(vd->vdev_spa);
}

/*
 * Spares are tracked globally due to the following constraints:
 *
 *	- A spare may be part of multiple pools.
 *	- A spare may be added to a pool even if it's actively in use within
 *	  another pool.
 *	- A spare in use in any pool can only be the source of a replacement
 *	  if the target is a spare in the same pool.
 *
 * We keep track of all spares on the system through the use of a reference
 * counted AVL tree.  When a vdev is added as a spare, or used as a
 * replacement spare, then we bump the reference count in the AVL tree.  In
 * addition, we set the 'vdev_isspare' member to indicate that the device is
 * a spare (active or inactive).  When a spare is made active (used to
 * replace a device in the pool), we also keep track of which pool it's been
 * made a part of.
 *
 * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
 * called under the spa_namespace lock as part of vdev reconfiguration.  The
 * separate spare lock exists for the status query path, which does not need
 * to be completely consistent with respect to other vdev configuration
 * changes.
 */

static int
spa_spare_compare(const void *a, const void *b)
{
	return (spa_aux_compare(a, b));
}

void
spa_spare_add(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(!vd->vdev_isspare);
	spa_aux_add(vd, &spa_spare_avl);
	vd->vdev_isspare = B_TRUE;
	mutex_exit(&spa_spare_lock);
}

void
spa_spare_remove(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(vd->vdev_isspare);
	spa_aux_remove(vd, &spa_spare_avl);
	vd->vdev_isspare = B_FALSE;
	mutex_exit(&spa_spare_lock);
}

boolean_t
spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
{
	boolean_t found;

	mutex_enter(&spa_spare_lock);
	found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
	mutex_exit(&spa_spare_lock);

	return (found);
}

void
spa_spare_activate(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(vd->vdev_isspare);
	spa_aux_activate(vd, &spa_spare_avl);
	mutex_exit(&spa_spare_lock);
}

/*
 * Level 2 ARC devices are tracked globally for the same reasons as spares.
 * Cache devices currently only support one pool per cache device, and so
 * for these devices the aux reference count is currently unused beyond 1.
 */

static int
spa_l2cache_compare(const void *a, const void *b)
{
	return (spa_aux_compare(a, b));
}

void
spa_l2cache_add(vdev_t *vd)
{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(!vd->vdev_isl2cache);
	spa_aux_add(vd, &spa_l2cache_avl);
	vd->vdev_isl2cache = B_TRUE;
	mutex_exit(&spa_l2cache_lock);
}

void
spa_l2cache_remove(vdev_t *vd)
{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(vd->vdev_isl2cache);
	spa_aux_remove(vd, &spa_l2cache_avl);
	vd->vdev_isl2cache = B_FALSE;
	mutex_exit(&spa_l2cache_lock);
}

boolean_t
spa_l2cache_exists(uint64_t guid, uint64_t *pool)
{
	boolean_t found;

	mutex_enter(&spa_l2cache_lock);
	found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
	mutex_exit(&spa_l2cache_lock);

	return (found);
}

void
spa_l2cache_activate(vdev_t *vd)
{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(vd->vdev_isl2cache);
	spa_aux_activate(vd, &spa_l2cache_avl);
	mutex_exit(&spa_l2cache_lock);
}
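
/*
 * Illustrative sketch (not part of the original file): the status query
 * path can ask about auxiliary devices without the namespace lock,
 * since spa_spare_exists()/spa_l2cache_exists() take the dedicated aux
 * locks internally.  The function and variable names are hypothetical
 * and the block is compiled out.
 */
#if 0	/* example only */
static void
example_query_aux(uint64_t guid)
{
	uint64_t pool;
	int refcnt;

	if (spa_spare_exists(guid, &pool, &refcnt)) {
		/*
		 * 'pool' is the guid of the pool the spare is active in
		 * (0 if inactive); 'refcnt' counts the pools referencing it.
		 */
	} else if (spa_l2cache_exists(guid, &pool)) {
		/* Cache devices belong to at most one pool. */
	}
}
#endif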

/*
 * ==========================================================================
 * SPA vdev locking
 * ==========================================================================
 */

/*
 * Lock the given spa_t for the purpose of adding or removing a vdev.
 * Grabs the global spa_namespace_lock plus the spa config lock for writing.
 * It returns the next transaction group for the spa_t.
 */
uint64_t
spa_vdev_enter(spa_t *spa)
{
	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	return (spa_vdev_config_enter(spa));
}

/*
 * Internal implementation for spa_vdev_enter().  Used when a vdev
 * operation requires multiple syncs (i.e. removing a device) while
 * keeping the spa_namespace_lock held.
 */
uint64_t
spa_vdev_config_enter(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);

	return (spa_last_synced_txg(spa) + 1);
}

/*
 * Used in combination with spa_vdev_config_enter() to allow the syncing
 * of multiple transactions without releasing the spa_namespace_lock.
 */
void
spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	int config_changed = B_FALSE;

	ASSERT(txg > spa_last_synced_txg(spa));

	spa->spa_pending_vdev = NULL;

	/*
	 * Reassess the DTLs.
	 */
	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);

	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
		config_changed = B_TRUE;
		spa->spa_config_generation++;
	}

	/*
	 * Verify the metaslab classes.
	 */
	ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
	ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);

	spa_config_exit(spa, SCL_ALL, spa);

	/*
	 * Panic the system if the specified tag requires it.  This
	 * is useful for ensuring that configurations are updated
	 * transactionally.
	 */
	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, tag, 0);

	/*
	 * Note: this txg_wait_synced() is important because it ensures
	 * that there won't be more than one config change per txg.
	 * This allows us to use the txg as the generation number.
	 */
	if (error == 0)
		txg_wait_synced(spa->spa_dsl_pool, txg);

	if (vd != NULL) {
		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
		vdev_free(vd);
		spa_config_exit(spa, SCL_ALL, spa);
	}

	/*
	 * If the config changed, update the config cache.
	 */
	if (config_changed)
		spa_config_sync(spa, B_FALSE, B_TRUE);
}

/*
 * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
 * locking of spa_vdev_enter(), we also want to make sure the transactions
 * have synced to disk, and then update the global configuration cache with
 * the new information.
 */
int
spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
{
	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}
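
/*
 * Illustrative sketch (not part of the original file): a vdev
 * configuration change is bracketed by spa_vdev_enter() and
 * spa_vdev_exit(); spa_vdev_exit() waits for 'txg' to sync and updates
 * the config cache on success.  The function body is hypothetical and
 * the block is compiled out.
 */
#if 0	/* example only */
static int
example_vdev_config_change(spa_t *spa)
{
	uint64_t txg;
	int error = 0;

	txg = spa_vdev_enter(spa);

	/* ... modify the vdev tree here, setting 'error' on failure ... */

	return (spa_vdev_exit(spa, NULL, txg, error));
}
#endif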

/*
 * Lock the given spa_t for the purpose of changing vdev state.
 */
void
spa_vdev_state_enter(spa_t *spa, int oplocks)
{
	int locks = SCL_STATE_ALL | oplocks;

	/*
	 * Root pools may need to read from the underlying devfs filesystem
	 * when opening up a vdev.  Unfortunately if we're holding the
	 * SCL_ZIO lock it will result in a deadlock when we try to issue
	 * the read from the root filesystem.  Instead we "prefetch"
	 * the associated vnodes that we need prior to opening the
	 * underlying devices and cache them so that we can prevent
	 * any I/O when we are doing the actual open.
	 */
	if (spa_is_root(spa)) {
		int low = locks & ~(SCL_ZIO - 1);
		int high = locks & ~low;

		spa_config_enter(spa, high, spa, RW_WRITER);
		vdev_hold(spa->spa_root_vdev);
		spa_config_enter(spa, low, spa, RW_WRITER);
	} else {
		spa_config_enter(spa, locks, spa, RW_WRITER);
	}
	spa->spa_vdev_locks = locks;
}

int
spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
{
	boolean_t config_changed = B_FALSE;

	if (vd != NULL || error == 0)
		vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
		    0, 0, B_FALSE);

	if (vd != NULL) {
		vdev_state_dirty(vd->vdev_top);
		config_changed = B_TRUE;
		spa->spa_config_generation++;
	}

	if (spa_is_root(spa))
		vdev_rele(spa->spa_root_vdev);

	ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
	spa_config_exit(spa, spa->spa_vdev_locks, spa);

	/*
	 * If anything changed, wait for it to sync.  This ensures that,
	 * from the system administrator's perspective, zpool(1M) commands
	 * are synchronous.  This is important for things like zpool offline:
	 * when the command completes, you expect no further I/O from ZFS.
	 */
	if (vd != NULL)
		txg_wait_synced(spa->spa_dsl_pool, 0);

	/*
	 * If the config changed, update the config cache.
	 */
	if (config_changed) {
		mutex_enter(&spa_namespace_lock);
		spa_config_sync(spa, B_FALSE, B_TRUE);
		mutex_exit(&spa_namespace_lock);
	}

	return (error);
}

/*
 * ==========================================================================
 * Miscellaneous functions
 * ==========================================================================
 */

void
spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
{
	if (!nvlist_exists(spa->spa_label_features, feature)) {
		fnvlist_add_boolean(spa->spa_label_features, feature);
		/*
		 * When we are creating the pool (tx_txg==TXG_INITIAL), we
		 * can't dirty the vdev config because lock SCL_CONFIG is
		 * not held.  Thankfully, in this case we don't need to
		 * dirty the config because it will be written out anyway
		 * when we finish creating the pool.
		 */
		if (tx->tx_txg != TXG_INITIAL)
			vdev_config_dirty(spa->spa_root_vdev);
	}
}

void
spa_deactivate_mos_feature(spa_t *spa, const char *feature)
{
	if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
		vdev_config_dirty(spa->spa_root_vdev);
}

/*
 * Rename a spa_t.
 */
int
spa_rename(const char *name, const char *newname)
{
	spa_t *spa;
	int err;

	/*
	 * Lookup the spa_t and grab the config lock for writing.  We need to
	 * actually open the pool so that we can sync out the necessary
	 * labels.  It's OK to call spa_open() with the namespace lock held
	 * because we allow recursive calls for other reasons.
	 */
	mutex_enter(&spa_namespace_lock);
	if ((err = spa_open(name, &spa, FTAG)) != 0) {
		mutex_exit(&spa_namespace_lock);
		return (err);
	}

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	avl_remove(&spa_namespace_avl, spa);
	(void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
	avl_add(&spa_namespace_avl, spa);

	/*
	 * Sync all labels to disk with the new names by marking the root
	 * vdev dirty and waiting for it to sync.  It will pick up the new
	 * pool name during the sync.
	 */
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa->spa_dsl_pool, 0);

	/*
	 * Sync the updated config cache.
	 */
	spa_config_sync(spa, B_FALSE, B_TRUE);

	spa_close(spa, FTAG);

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Return the spa_t associated with given pool_guid, if it exists.  If
 * device_guid is non-zero, determine whether the pool exists *and* contains
 * a device with the specified device_guid.
 */
spa_t *
spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
{
	spa_t *spa;
	avl_tree_t *t = &spa_namespace_avl;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
		if (spa->spa_state == POOL_STATE_UNINITIALIZED)
			continue;
		if (spa->spa_root_vdev == NULL)
			continue;
		if (spa_guid(spa) == pool_guid) {
			if (device_guid == 0)
				break;

			if (vdev_lookup_by_guid(spa->spa_root_vdev,
			    device_guid) != NULL)
				break;

			/*
			 * Check any devices we may be in the process of
			 * adding.
			 */
			if (spa->spa_pending_vdev) {
				if (vdev_lookup_by_guid(spa->spa_pending_vdev,
				    device_guid) != NULL)
					break;
			}
		}
	}

	return (spa);
}

/*
 * Determine whether a pool with the given pool_guid exists.
 */
boolean_t
spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
{
	return (spa_by_guid(pool_guid, device_guid) != NULL);
}

char *
spa_strdup(const char *s)
{
	size_t len;
	char *new;

	len = strlen(s);
	new = kmem_alloc(len + 1, KM_SLEEP);
	bcopy(s, new, len);
	new[len] = '\0';

	return (new);
}

void
spa_strfree(char *s)
{
	kmem_free(s, strlen(s) + 1);
}

uint64_t
spa_get_random(uint64_t range)
{
	uint64_t r;

	ASSERT(range != 0);

	(void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));

	return (r % range);
}

uint64_t
spa_generate_guid(spa_t *spa)
{
	uint64_t guid = spa_get_random(-1ULL);

	if (spa != NULL) {
		while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
			guid = spa_get_random(-1ULL);
	} else {
		while (guid == 0 || spa_guid_exists(guid, 0))
			guid = spa_get_random(-1ULL);
	}

	return (guid);
}

void
snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
{
	char type[256];
	char *checksum = NULL;
	char *compress = NULL;

	if (bp != NULL) {
		if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
			dmu_object_byteswap_t bswap =
			    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
			(void) snprintf(type, sizeof (type), "bswap %s %s",
			    DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
			    "metadata" : "data",
			    dmu_ot_byteswap[bswap].ob_name);
		} else {
			(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
			    sizeof (type));
		}
		if (!BP_IS_EMBEDDED(bp)) {
			checksum =
			    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
		}
		compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
	}

	SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
	    compress);
}

void
spa_freeze(spa_t *spa)
{
	uint64_t freeze_txg = 0;

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	if (spa->spa_freeze_txg == UINT64_MAX) {
		freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
		spa->spa_freeze_txg = freeze_txg;
	}
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (freeze_txg != 0)
		txg_wait_synced(spa_get_dsl(spa), freeze_txg);
}

void
zfs_panic_recover(const char *fmt, ...)
{
	va_list adx;

	va_start(adx, fmt);
	vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
	va_end(adx);
}

/*
 * This is a stripped-down version of strtoull, suitable only for converting
 * lowercase hexadecimal numbers that don't overflow.
 */
uint64_t
zfs_strtonum(const char *str, char **nptr)
{
	uint64_t val = 0;
	char c;
	int digit;

	while ((c = *str) != '\0') {
		if (c >= '0' && c <= '9')
			digit = c - '0';
		else if (c >= 'a' && c <= 'f')
			digit = 10 + c - 'a';
		else
			break;

		val *= 16;
		val += digit;

		str++;
	}

	if (nptr)
		*nptr = (char *)str;

	return (val);
}
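
/*
 * Illustrative sketch (not part of the original file): zfs_strtonum()
 * stops at the first character that is not a lowercase hex digit and
 * optionally reports where it stopped, which is useful when parsing a
 * hex field out of a larger string.  The values below are hypothetical
 * and the block is compiled out.
 */
#if 0	/* example only */
static void
example_strtonum(void)
{
	char *end;
	uint64_t val;

	val = zfs_strtonum("1a2b:rest", &end);
	/* Now val == 0x1a2b and end points at ":rest". */
}
#endif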

/*
 * ==========================================================================
 * Accessor functions
 * ==========================================================================
 */

boolean_t
spa_shutting_down(spa_t *spa)
{
	return (spa->spa_async_suspended);
}

dsl_pool_t *
spa_get_dsl(spa_t *spa)
{
	return (spa->spa_dsl_pool);
}

boolean_t
spa_is_initializing(spa_t *spa)
{
	return (spa->spa_is_initializing);
}

blkptr_t *
spa_get_rootblkptr(spa_t *spa)
{
	return (&spa->spa_ubsync.ub_rootbp);
}

void
spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
{
	spa->spa_uberblock.ub_rootbp = *bp;
}

void
spa_altroot(spa_t *spa, char *buf, size_t buflen)
{
	if (spa->spa_root == NULL)
		buf[0] = '\0';
	else
		(void) strncpy(buf, spa->spa_root, buflen);
}

int
spa_sync_pass(spa_t *spa)
{
	return (spa->spa_sync_pass);
}

char *
spa_name(spa_t *spa)
{
	return (spa->spa_name);
}

uint64_t
spa_guid(spa_t *spa)
{
	dsl_pool_t *dp = spa_get_dsl(spa);
	uint64_t guid;

	/*
	 * If we fail to parse the config during spa_load(), we can go
	 * through the error path (which posts an ereport) and end up here
	 * with no root vdev.  We stash the original pool guid in
	 * 'spa_config_guid' to handle this case.
	 */
	if (spa->spa_root_vdev == NULL)
		return (spa->spa_config_guid);

	guid = spa->spa_last_synced_guid != 0 ?
	    spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;

	/*
	 * Return the most recently synced out guid unless we're
	 * in syncing context.
	 */
	if (dp && dsl_pool_sync_context(dp))
		return (spa->spa_root_vdev->vdev_guid);
	else
		return (guid);
}

uint64_t
spa_load_guid(spa_t *spa)
{
	/*
	 * This is a GUID that exists solely as a reference for the
	 * purposes of the arc.  It is generated at load time, and
	 * is never written to persistent storage.
	 */
	return (spa->spa_load_guid);
}

uint64_t
spa_last_synced_txg(spa_t *spa)
{
	return (spa->spa_ubsync.ub_txg);
}

uint64_t
spa_first_txg(spa_t *spa)
{
	return (spa->spa_first_txg);
}

uint64_t
spa_syncing_txg(spa_t *spa)
{
	return (spa->spa_syncing_txg);
}

pool_state_t
spa_state(spa_t *spa)
{
	return (spa->spa_state);
}

spa_load_state_t
spa_load_state(spa_t *spa)
{
	return (spa->spa_load_state);
}

uint64_t
spa_freeze_txg(spa_t *spa)
{
	return (spa->spa_freeze_txg);
}

/* ARGSUSED */
uint64_t
spa_get_asize(spa_t *spa, uint64_t lsize)
{
	return (lsize * spa_asize_inflation);
}

uint64_t
spa_get_dspace(spa_t *spa)
{
	return (spa->spa_dspace);
}

void
spa_update_dspace(spa_t *spa)
{
	spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
	    ddt_get_dedup_dspace(spa);
}

/*
 * Return the failure mode that has been set to this pool.  The default
 * behavior will be to block all I/Os when a complete failure occurs.
 */
uint8_t
spa_get_failmode(spa_t *spa)
{
	return (spa->spa_failmode);
}

boolean_t
spa_suspended(spa_t *spa)
{
	return (spa->spa_suspended);
}

uint64_t
spa_version(spa_t *spa)
{
	return (spa->spa_ubsync.ub_version);
}

boolean_t
spa_deflate(spa_t *spa)
{
	return (spa->spa_deflate);
}

metaslab_class_t *
spa_normal_class(spa_t *spa)
{
	return (spa->spa_normal_class);
}

metaslab_class_t *
spa_log_class(spa_t *spa)
{
	return (spa->spa_log_class);
}

int
spa_max_replication(spa_t *spa)
{
	/*
	 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
	 * handle BPs with more than one DVA allocated.  Set our max
	 * replication level accordingly.
	 */
	if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
		return (1);
	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
}

int
spa_prev_software_version(spa_t *spa)
{
	return (spa->spa_prev_software_version);
}

uint64_t
spa_deadman_synctime(spa_t *spa)
{
	return (spa->spa_deadman_synctime);
}

uint64_t
dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
{
	uint64_t asize = DVA_GET_ASIZE(dva);
	uint64_t dsize = asize;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	if (asize != 0 && spa->spa_deflate) {
		vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
		dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
	}

	return (dsize);
}

uint64_t
bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
{
	uint64_t dsize = 0;

	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);

	return (dsize);
}

uint64_t
bp_get_dsize(spa_t *spa, const blkptr_t *bp)
{
	uint64_t dsize = 0;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);

	spa_config_exit(spa, SCL_VDEV, FTAG);

	return (dsize);
}

/*
 * ==========================================================================
 * Initialization and Termination
 * ==========================================================================
 */

static int
spa_name_compare(const void *a1, const void *a2)
{
	const spa_t *s1 = a1;
	const spa_t *s2 = a2;
	int s;

	s = strcmp(s1->spa_name, s2->spa_name);
	if (s > 0)
		return (1);
	if (s < 0)
		return (-1);
	return (0);
}

int
spa_busy(void)
{
	return (spa_active_count);
}

void
spa_boot_init()
{
	spa_config_load();
}

#ifdef _KERNEL
EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
#endif

void
spa_init(int mode)
{
	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);

	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
	    offsetof(spa_t, spa_avl));

	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
	    offsetof(spa_aux_t, aux_avl));

	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
	    offsetof(spa_aux_t, aux_avl));

	spa_mode_global = mode;

#ifdef illumos
#ifdef _KERNEL
	spa_arch_init();
#else
	if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
		arc_procfd = open("/proc/self/ctl", O_WRONLY);
		if (arc_procfd == -1) {
			perror("could not enable watchpoints: "
			    "opening /proc/self/ctl failed: ");
		} else {
			arc_watch = B_TRUE;
		}
	}
#endif
#endif	/* illumos */
	refcount_sysinit();
	unique_init();
	range_tree_init();
	zio_init();
	lz4_init();
	dmu_init();
	zil_init();
	vdev_cache_stat_init();
	zfs_prop_init();
	zpool_prop_init();
	zpool_feature_init();
	spa_config_load();
	l2arc_start();
#ifndef illumos
#ifdef _KERNEL
	zfs_deadman_init();
#endif
#endif	/* !illumos */
}

void
spa_fini(void)
{
	l2arc_stop();

	spa_evict_all();

	vdev_cache_stat_fini();
	zil_fini();
	dmu_fini();
	lz4_fini();
	zio_fini();
	range_tree_fini();
	unique_fini();
	refcount_fini();

	avl_destroy(&spa_namespace_avl);
	avl_destroy(&spa_spare_avl);
	avl_destroy(&spa_l2cache_avl);

	cv_destroy(&spa_namespace_cv);
	mutex_destroy(&spa_namespace_lock);
	mutex_destroy(&spa_spare_lock);
	mutex_destroy(&spa_l2cache_lock);
}

/*
 * Return whether this pool has slogs.  No locking needed.
 * It's not a problem if the wrong answer is returned, as it's only for
 * performance and not correctness.
 */
boolean_t
spa_has_slogs(spa_t *spa)
{
	return (spa->spa_log_class->mc_rotor != NULL);
}

spa_log_state_t
spa_get_log_state(spa_t *spa)
{
	return (spa->spa_log_state);
}

void
spa_set_log_state(spa_t *spa, spa_log_state_t state)
{
	spa->spa_log_state = state;
}

boolean_t
spa_is_root(spa_t *spa)
{
	return (spa->spa_is_root);
}

boolean_t
spa_writeable(spa_t *spa)
{
	return (!!(spa->spa_mode & FWRITE));
}

int
spa_mode(spa_t *spa)
{
	return (spa->spa_mode);
}

uint64_t
spa_bootfs(spa_t *spa)
{
	return (spa->spa_bootfs);
}

uint64_t
spa_delegation(spa_t *spa)
{
	return (spa->spa_delegation);
}

objset_t *
spa_meta_objset(spa_t *spa)
{
	return (spa->spa_meta_objset);
}

enum zio_checksum
spa_dedup_checksum(spa_t *spa)
{
	return (spa->spa_dedup_checksum);
}

/*
 * Reset pool scan stats per scan pass (or reboot).
 */
void
spa_scan_stat_init(spa_t *spa)
{
	/* data not stored on disk */
	spa->spa_scan_pass_start = gethrestime_sec();
	spa->spa_scan_pass_exam = 0;
	vdev_scan_stat_init(spa->spa_root_vdev);
}

/*
 * Get scan stats for zpool status reports.
 */
int
spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
{
	dsl_scan_t *scn = spa->spa_dsl_pool ?
	    spa->spa_dsl_pool->dp_scan : NULL;

	if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
		return (SET_ERROR(ENOENT));
	bzero(ps, sizeof (pool_scan_stat_t));

	/* data stored on disk */
	ps->pss_func = scn->scn_phys.scn_func;
	ps->pss_start_time = scn->scn_phys.scn_start_time;
	ps->pss_end_time = scn->scn_phys.scn_end_time;
	ps->pss_to_examine = scn->scn_phys.scn_to_examine;
	ps->pss_examined = scn->scn_phys.scn_examined;
	ps->pss_to_process = scn->scn_phys.scn_to_process;
	ps->pss_processed = scn->scn_phys.scn_processed;
	ps->pss_errors = scn->scn_phys.scn_errors;
	ps->pss_state = scn->scn_phys.scn_state;

	/* data not stored on disk */
	ps->pss_pass_start = spa->spa_scan_pass_start;
	ps->pss_pass_exam = spa->spa_scan_pass_exam;

	return (0);
}

boolean_t
spa_debug_enabled(spa_t *spa)
{
	return (spa->spa_debug);
}