/* spa_misc.c, revision 260731 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/spa_boot.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_scan.h>
#include <sys/fs/zfs.h>
#include <sys/metaslab_impl.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include "zfs_prop.h"
#include "zfeature_common.h"

/*
 * SPA locking
 *
 * There are four basic locks for managing spa_t structures:
 *
 * spa_namespace_lock (global mutex)
 *
 *	This lock must be acquired to do any of the following:
 *
 *		- Lookup a spa_t by name
 *		- Add or remove a spa_t from the namespace
 *		- Increase spa_refcount from non-zero
 *		- Check if spa_refcount is zero
 *		- Rename a spa_t
 *		- add/remove/attach/detach devices
 *		- Held for the duration of create/destroy/import/export
 *
 *	It does not need to handle recursion.  A create or destroy may
 *	reference objects (files or zvols) in other pools, but by
 *	definition they must have an existing reference, and will never need
 *	to lookup a spa_t by name.
 *
 * spa_refcount (per-spa refcount_t protected by mutex)
 *
 *	This reference count keeps track of any active users of the spa_t.  The
 *	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
 *	the refcount is never really 'zero' - opening a pool implicitly keeps
 *	some references in the DMU.  Internally we check against spa_minref, but
 *	present the image of a zero/non-zero value to consumers.
 *
 * spa_config_lock[] (per-spa array of rwlocks)
 *
 *	This protects the spa_t from config changes, and must be held in
 *	the following circumstances:
 *
 *		- RW_READER to perform I/O to the spa
 *		- RW_WRITER to change the vdev config
 *
 * The locking order is fairly straightforward:
 *
 *		spa_namespace_lock	->	spa_refcount
 *
 *	The namespace lock must be acquired to increase the refcount from 0
 *	or to check if it is zero.
 *
 *		spa_refcount		->	spa_config_lock[]
 *
 *	There must be at least one valid reference on the spa_t to acquire
 *	the config lock.
 *
 *		spa_namespace_lock	->	spa_config_lock[]
 *
 *	The namespace lock must always be taken before the config lock.
 *
 *
 * The spa_namespace_lock can be acquired directly and is globally visible.
 *
 * The namespace is manipulated using the following functions, all of which
 * require the spa_namespace_lock to be held.
 *
 *	spa_lookup()		Lookup a spa_t by name.
 *
 *	spa_add()		Create a new spa_t in the namespace.
 *
 *	spa_remove()		Remove a spa_t from the namespace.  This also
 *				frees up any memory associated with the spa_t.
 *
 *	spa_next()		Returns the next spa_t in the system, or the
 *				first if NULL is passed.
 *
 *	spa_evict_all()		Shutdown and remove all spa_t structures in
 *				the system.
 *
 *	spa_guid_exists()	Determine whether a pool/device guid exists.
 *
 * The spa_refcount is manipulated using the following functions:
 *
 *	spa_open_ref()		Adds a reference to the given spa_t.  Must be
 *				called with spa_namespace_lock held if the
 *				refcount is currently zero.
 *
 *	spa_close()		Remove a reference from the spa_t.  This will
 *				not free the spa_t or remove it from the
 *				namespace.  No locking is required.
 *
 *	spa_refcount_zero()	Returns true if the refcount is currently
 *				zero.  Must be called with spa_namespace_lock
 *				held.
 *
 * The spa_config_lock[] is an array of rwlocks, ordered as follows:
 * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
 * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
 *
 * To read the configuration, it suffices to hold one of these locks as reader.
 * To modify the configuration, you must hold all locks as writer.  To modify
 * vdev state without altering the vdev tree's topology (e.g. online/offline),
 * you must hold SCL_STATE and SCL_ZIO as writer.
 *
 * We use these distinct config locks to avoid recursive lock entry.
 * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
 * block allocations (SCL_ALLOC), which may require reading space maps
 * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
 *
 * The spa config locks cannot be normal rwlocks because we need the
 * ability to hand off ownership.  For example, SCL_ZIO is acquired
 * by the issuing thread and later released by an interrupt thread.
 * They do, however, obey the usual write-wanted semantics to prevent
 * writer (i.e. system administrator) starvation.
 *
 * The lock acquisition rules are as follows:
 *
 * SCL_CONFIG
 *	Protects changes to the vdev tree topology, such as vdev
 *	add/remove/attach/detach.  Protects the dirty config list
 *	(spa_config_dirty_list) and the set of spares and l2arc devices.
 *
 * SCL_STATE
 *	Protects changes to pool state and vdev state, such as vdev
 *	online/offline/fault/degrade/clear.  Protects the dirty state list
 *	(spa_state_dirty_list) and global pool state (spa_state).
 *
 * SCL_ALLOC
 *	Protects changes to metaslab groups and classes.
 *	Held as reader by metaslab_alloc() and metaslab_claim().
 *
 * SCL_ZIO
 *	Held by bp-level zios (those which have no io_vd upon entry)
 *	to prevent changes to the vdev tree.  The bp-level zio implicitly
 *	protects all of its vdev child zios, which do not hold SCL_ZIO.
 *
 * SCL_FREE
 *	Protects changes to metaslab groups and classes.
 *	Held as reader by metaslab_free().  SCL_FREE is distinct from
 *	SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
 *	blocks in zio_done() while another i/o that holds either
 *	SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
 *
 * SCL_VDEV
 *	Held as reader to prevent changes to the vdev tree during trivial
 *	inquiries such as bp_get_dsize().  SCL_VDEV is distinct from the
 *	other locks, and lower than all of them, to ensure that it's safe
 *	to acquire regardless of caller context.
 *
 * In addition, the following rules apply:
 *
 * (a)	spa_props_lock protects pool properties, spa_config and spa_config_list.
 *	The lock ordering is SCL_CONFIG > spa_props_lock.
 *
 * (b)	I/O operations on leaf vdevs.  For any zio operation that takes
 *	an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
 *	or zio_write_phys() -- the caller must ensure that the config cannot
 *	change in the interim, and that the vdev cannot be reopened.
 *	SCL_STATE as reader suffices for both.
 *
 * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
 *
 *	spa_vdev_enter()	Acquire the namespace lock and the config lock
 *				for writing.
 *
 *	spa_vdev_exit()		Release the config lock, wait for all I/O
 *				to complete, sync the updated configs to the
 *				cache, and release the namespace lock.
 *
 * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
 * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
 * locking is, always, based on spa_namespace_lock and spa_config_lock[].
 *
 * spa_rename() is also implemented within this file since it requires
 * manipulation of the namespace.
 */
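/*
 * Illustrative sketch (example only, not part of the original file): the
 * canonical lock ordering described above -- spa_namespace_lock, then
 * spa_refcount, then spa_config_lock[] -- as seen from a hypothetical
 * read-side consumer.  The function name and flow are invented for
 * illustration; FTAG, SCL_CONFIG, and the spa_* calls are real.
 */
#if 0	/* example only */
static void
example_read_config(const char *name)
{
        spa_t *spa;

        mutex_enter(&spa_namespace_lock);	/* namespace lock first */
        if ((spa = spa_lookup(name)) == NULL) {
                mutex_exit(&spa_namespace_lock);
                return;
        }
        spa_open_ref(spa, FTAG);		/* then take a reference */
        mutex_exit(&spa_namespace_lock);

        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); /* config last */
        /* ... read the vdev tree here ... */
        spa_config_exit(spa, SCL_CONFIG, FTAG);

        spa_close(spa, FTAG);
}
#endif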

static avl_tree_t spa_namespace_avl;
kmutex_t spa_namespace_lock;
static kcondvar_t spa_namespace_cv;
static int spa_active_count;
int spa_max_replication_override = SPA_DVAS_PER_BP;

static kmutex_t spa_spare_lock;
static avl_tree_t spa_spare_avl;
static kmutex_t spa_l2cache_lock;
static avl_tree_t spa_l2cache_avl;

kmem_cache_t *spa_buffer_pool;
int spa_mode_global;

#ifdef ZFS_DEBUG
/* Everything except dprintf and spa is on by default in debug builds */
int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA);
#else
int zfs_flags = 0;
#endif
SYSCTL_DECL(_debug);
TUNABLE_INT("debug.zfs_flags", &zfs_flags);
SYSCTL_INT(_debug, OID_AUTO, zfs_flags, CTLFLAG_RWTUN, &zfs_flags, 0,
    "ZFS debug flags.");

/*
 * zfs_recover can be set to nonzero to attempt to recover from
 * otherwise-fatal errors, typically caused by on-disk corruption.  When
 * set, calls to zfs_panic_recover() will turn into warning messages.
 */
int zfs_recover = 0;
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.recover", &zfs_recover);
SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0,
    "Try to recover from otherwise-fatal errors.");

extern int zfs_txg_synctime_ms;

/*
 * Expiration time in units of zfs_txg_synctime_ms.  This value has two
 * meanings.  First, it is used to determine when the spa_deadman logic
 * should fire.  By default the spa_deadman will fire if spa_sync has
 * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
 * Secondly, the value determines if an I/O is considered "hung".
 * Any I/O that has not completed in zfs_deadman_synctime is considered
 * "hung", resulting in a system panic.
 */
uint64_t zfs_deadman_synctime = 1000ULL;
TUNABLE_QUAD("vfs.zfs.deadman_synctime", &zfs_deadman_synctime);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime, CTLFLAG_RDTUN,
    &zfs_deadman_synctime, 0,
    "Stalled ZFS I/O expiration time in units of vfs.zfs.txg.synctime_ms");

/*
 * Default value of -1 for zfs_deadman_enabled is resolved in
 * zfs_deadman_init().
 */
int zfs_deadman_enabled = -1;
TUNABLE_INT("vfs.zfs.deadman_enabled", &zfs_deadman_enabled);
SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN,
    &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");

#ifndef illumos
#ifdef _KERNEL
static void
zfs_deadman_init()
{
        /*
         * If we are not on i386 or amd64, or if we are running in a
         * virtual machine, disable the ZFS deadman thread by default.
         */
        if (zfs_deadman_enabled == -1) {
#if defined(__amd64__) || defined(__i386__)
                zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0;
#else
                zfs_deadman_enabled = 0;
#endif
        }
}
#endif	/* _KERNEL */
#endif	/* !illumos */

/*
 * ==========================================================================
 * SPA config locking
 * ==========================================================================
 */
static void
spa_config_lock_init(spa_t *spa)
{
        for (int i = 0; i < SCL_LOCKS; i++) {
                spa_config_lock_t *scl = &spa->spa_config_lock[i];
                mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
                cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
                refcount_create_untracked(&scl->scl_count);
                scl->scl_writer = NULL;
                scl->scl_write_wanted = 0;
        }
}

static void
spa_config_lock_destroy(spa_t *spa)
{
        for (int i = 0; i < SCL_LOCKS; i++) {
                spa_config_lock_t *scl = &spa->spa_config_lock[i];
                mutex_destroy(&scl->scl_lock);
                cv_destroy(&scl->scl_cv);
                refcount_destroy(&scl->scl_count);
                ASSERT(scl->scl_writer == NULL);
                ASSERT(scl->scl_write_wanted == 0);
        }
}

int
spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
{
        for (int i = 0; i < SCL_LOCKS; i++) {
                spa_config_lock_t *scl = &spa->spa_config_lock[i];
                if (!(locks & (1 << i)))
                        continue;
                mutex_enter(&scl->scl_lock);
                if (rw == RW_READER) {
                        if (scl->scl_writer || scl->scl_write_wanted) {
                                mutex_exit(&scl->scl_lock);
                                spa_config_exit(spa, locks ^ (1 << i), tag);
                                return (0);
                        }
                } else {
                        ASSERT(scl->scl_writer != curthread);
                        if (!refcount_is_zero(&scl->scl_count)) {
                                mutex_exit(&scl->scl_lock);
                                spa_config_exit(spa, locks ^ (1 << i), tag);
                                return (0);
                        }
                        scl->scl_writer = curthread;
                }
                (void) refcount_add(&scl->scl_count, tag);
                mutex_exit(&scl->scl_lock);
        }
        return (1);
}

void
spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
{
        int wlocks_held = 0;

        ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);

        for (int i = 0; i < SCL_LOCKS; i++) {
                spa_config_lock_t *scl = &spa->spa_config_lock[i];
                if (scl->scl_writer == curthread)
                        wlocks_held |= (1 << i);
                if (!(locks & (1 << i)))
                        continue;
                mutex_enter(&scl->scl_lock);
                if (rw == RW_READER) {
                        while (scl->scl_writer || scl->scl_write_wanted) {
                                cv_wait(&scl->scl_cv, &scl->scl_lock);
                        }
                } else {
                        ASSERT(scl->scl_writer != curthread);
                        while (!refcount_is_zero(&scl->scl_count)) {
                                scl->scl_write_wanted++;
                                cv_wait(&scl->scl_cv, &scl->scl_lock);
                                scl->scl_write_wanted--;
                        }
                        scl->scl_writer = curthread;
                }
                (void) refcount_add(&scl->scl_count, tag);
                mutex_exit(&scl->scl_lock);
        }
        ASSERT(wlocks_held <= locks);
}

void
spa_config_exit(spa_t *spa, int locks, void *tag)
{
        for (int i = SCL_LOCKS - 1; i >= 0; i--) {
                spa_config_lock_t *scl = &spa->spa_config_lock[i];
                if (!(locks & (1 << i)))
                        continue;
                mutex_enter(&scl->scl_lock);
                ASSERT(!refcount_is_zero(&scl->scl_count));
                if (refcount_remove(&scl->scl_count, tag) == 0) {
                        ASSERT(scl->scl_writer == NULL ||
                            scl->scl_writer == curthread);
                        scl->scl_writer = NULL;	/* OK in either case */
                        cv_broadcast(&scl->scl_cv);
                }
                mutex_exit(&scl->scl_lock);
        }
}

int
spa_config_held(spa_t *spa, int locks, krw_t rw)
{
        int locks_held = 0;

        for (int i = 0; i < SCL_LOCKS; i++) {
                spa_config_lock_t *scl = &spa->spa_config_lock[i];
                if (!(locks & (1 << i)))
                        continue;
                if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) ||
                    (rw == RW_WRITER && scl->scl_writer == curthread))
                        locks_held |= 1 << i;
        }

        return (locks_held);
}

/*
 * ==========================================================================
 * SPA namespace functions
 * ==========================================================================
 */

/*
 * Lookup the named spa_t in the AVL tree.  The spa_namespace_lock must be held.
 * Returns NULL if no matching spa_t is found.
 */
spa_t *
spa_lookup(const char *name)
{
        static spa_t search;	/* spa_t is large; don't allocate on stack */
        spa_t *spa;
        avl_index_t where;
        char *cp;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        (void) strlcpy(search.spa_name, name, sizeof (search.spa_name));

        /*
         * If it's a full dataset name, figure out the pool name and
         * just use that.
         */
        cp = strpbrk(search.spa_name, "/@");
        if (cp != NULL)
                *cp = '\0';

        spa = avl_find(&spa_namespace_avl, &search, &where);

        return (spa);
}

/*
 * Fires when spa_sync has not completed within zfs_deadman_synctime.
 * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
 * looking for potentially hung I/Os.
 */
void
spa_deadman(void *arg)
{
        spa_t *spa = arg;

        zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
            (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
            ++spa->spa_deadman_calls);
        if (zfs_deadman_enabled)
                vdev_deadman(spa->spa_root_vdev);
}
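/*
 * Illustrative sketch (example only, not part of the original file): the
 * non-blocking variant of the config-lock API above.  spa_config_tryenter()
 * backs out any locks it already took when one lock in the requested set is
 * unavailable, so on failure the caller holds nothing.  The function name
 * is invented for illustration.
 */
#if 0	/* example only */
static boolean_t
example_trivial_inquiry(spa_t *spa)
{
        if (!spa_config_tryenter(spa, SCL_VDEV, FTAG, RW_READER))
                return (B_FALSE);	/* a writer holds or wants SCL_VDEV */
        ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER));
        /* ... perform a trivial vdev tree inquiry here ... */
        spa_config_exit(spa, SCL_VDEV, FTAG);
        return (B_TRUE);
}
#endif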
/*
 * Create an uninitialized spa_t with the given name.  Requires
 * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
 * exist by calling spa_lookup() first.
 */
spa_t *
spa_add(const char *name, nvlist_t *config, const char *altroot)
{
        spa_t *spa;
        spa_config_dirent_t *dp;
#ifdef illumos
        cyc_handler_t hdlr;
        cyc_time_t when;
#endif

        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);

        mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);

        cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);

        for (int t = 0; t < TXG_SIZE; t++)
                bplist_create(&spa->spa_free_bplist[t]);

        (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
        spa->spa_state = POOL_STATE_UNINITIALIZED;
        spa->spa_freeze_txg = UINT64_MAX;
        spa->spa_final_txg = UINT64_MAX;
        spa->spa_load_max_txg = UINT64_MAX;
        spa->spa_proc = &p0;
        spa->spa_proc_state = SPA_PROC_NONE;

#ifdef illumos
        hdlr.cyh_func = spa_deadman;
        hdlr.cyh_arg = spa;
        hdlr.cyh_level = CY_LOW_LEVEL;
#endif

        spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime *
            zfs_txg_synctime_ms);

#ifdef illumos
        /*
         * This determines how often we need to check for hung I/Os after
         * the cyclic has already fired.  Since checking for hung I/Os is
         * an expensive operation we don't want to check too frequently.
         * Instead wait for 5 synctimes before checking again.
         */
        when.cyt_interval = MSEC2NSEC(5 * zfs_txg_synctime_ms);
        when.cyt_when = CY_INFINITY;
        mutex_enter(&cpu_lock);
        spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
        mutex_exit(&cpu_lock);
#else	/* !illumos */
#ifdef _KERNEL
        callout_init(&spa->spa_deadman_cycid, CALLOUT_MPSAFE);
#endif
#endif
        refcount_create(&spa->spa_refcount);
        spa_config_lock_init(spa);

        avl_add(&spa_namespace_avl, spa);

        /*
         * Set the alternate root, if there is one.
         */
        if (altroot) {
                spa->spa_root = spa_strdup(altroot);
                spa_active_count++;
        }

        /*
         * Every pool starts with the default cachefile.
         */
        list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
            offsetof(spa_config_dirent_t, scd_link));

        dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
        dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
        list_insert_head(&spa->spa_config_list, dp);

        VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
            KM_SLEEP) == 0);

        if (config != NULL) {
                nvlist_t *features;

                if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
                    &features) == 0) {
                        VERIFY(nvlist_dup(features, &spa->spa_label_features,
                            0) == 0);
                }

                VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
        }

        if (spa->spa_label_features == NULL) {
                VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
                    KM_SLEEP) == 0);
        }

        spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);

        return (spa);
}

/*
 * Removes a spa_t from the namespace, freeing up any memory used.  Requires
 * spa_namespace_lock.  This is called only after the spa_t has been closed and
 * deactivated.
 */
void
spa_remove(spa_t *spa)
{
        spa_config_dirent_t *dp;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));
        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

        nvlist_free(spa->spa_config_splitting);

        avl_remove(&spa_namespace_avl, spa);
        cv_broadcast(&spa_namespace_cv);

        if (spa->spa_root) {
                spa_strfree(spa->spa_root);
                spa_active_count--;
        }

        while ((dp = list_head(&spa->spa_config_list)) != NULL) {
                list_remove(&spa->spa_config_list, dp);
                if (dp->scd_path != NULL)
                        spa_strfree(dp->scd_path);
                kmem_free(dp, sizeof (spa_config_dirent_t));
        }

        list_destroy(&spa->spa_config_list);

        nvlist_free(spa->spa_label_features);
        nvlist_free(spa->spa_load_info);
        spa_config_set(spa, NULL);

#ifdef illumos
        mutex_enter(&cpu_lock);
        if (spa->spa_deadman_cycid != CYCLIC_NONE)
                cyclic_remove(spa->spa_deadman_cycid);
        mutex_exit(&cpu_lock);
        spa->spa_deadman_cycid = CYCLIC_NONE;
#else	/* !illumos */
#ifdef _KERNEL
        callout_drain(&spa->spa_deadman_cycid);
#endif
#endif

        refcount_destroy(&spa->spa_refcount);

        spa_config_lock_destroy(spa);

        for (int t = 0; t < TXG_SIZE; t++)
                bplist_destroy(&spa->spa_free_bplist[t]);

        cv_destroy(&spa->spa_async_cv);
        cv_destroy(&spa->spa_proc_cv);
        cv_destroy(&spa->spa_scrub_io_cv);
        cv_destroy(&spa->spa_suspend_cv);

        mutex_destroy(&spa->spa_async_lock);
        mutex_destroy(&spa->spa_errlist_lock);
        mutex_destroy(&spa->spa_errlog_lock);
        mutex_destroy(&spa->spa_history_lock);
        mutex_destroy(&spa->spa_proc_lock);
        mutex_destroy(&spa->spa_props_lock);
        mutex_destroy(&spa->spa_scrub_lock);
        mutex_destroy(&spa->spa_suspend_lock);
        mutex_destroy(&spa->spa_vdev_top_lock);

        kmem_free(spa, sizeof (spa_t));
}

/*
 * Given a pool, return the next pool in the namespace, or NULL if there is
 * none.  If 'prev' is NULL, return the first pool.
 */
spa_t *
spa_next(spa_t *prev)
{
        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        if (prev)
                return (AVL_NEXT(&spa_namespace_avl, prev));
        else
                return (avl_first(&spa_namespace_avl));
}
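/*
 * Illustrative sketch (example only, not part of the original file):
 * walking the namespace with spa_next(), which requires holding the
 * spa_namespace_lock for the duration of the walk.  The function name
 * is invented for illustration.
 */
#if 0	/* example only */
static void
example_walk_namespace(void)
{
        spa_t *spa = NULL;

        mutex_enter(&spa_namespace_lock);
        while ((spa = spa_next(spa)) != NULL) {
                /* ... inspect each pool here ... */
        }
        mutex_exit(&spa_namespace_lock);
}
#endif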
/*
 * ==========================================================================
 * SPA refcount functions
 * ==========================================================================
 */

/*
 * Add a reference to the given spa_t.  Must have at least one reference, or
 * have the namespace lock held.
 */
void
spa_open_ref(spa_t *spa, void *tag)
{
        ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
            MUTEX_HELD(&spa_namespace_lock));
        (void) refcount_add(&spa->spa_refcount, tag);
}

/*
 * Remove a reference to the given spa_t.  Must have at least one reference, or
 * have the namespace lock held.
 */
void
spa_close(spa_t *spa, void *tag)
{
        ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref ||
            MUTEX_HELD(&spa_namespace_lock));
        (void) refcount_remove(&spa->spa_refcount, tag);
}

/*
 * Check to see if the spa refcount is zero.  Must be called with
 * spa_namespace_lock held.  We really compare against spa_minref, which is the
 * number of references acquired when opening a pool.
 */
boolean_t
spa_refcount_zero(spa_t *spa)
{
        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        return (refcount_count(&spa->spa_refcount) == spa->spa_minref);
}

/*
 * ==========================================================================
 * SPA spare and l2cache tracking
 * ==========================================================================
 */

/*
 * Hot spares and cache devices are tracked using the same code below,
 * for 'auxiliary' devices.
 */

typedef struct spa_aux {
        uint64_t	aux_guid;
        uint64_t	aux_pool;
        avl_node_t	aux_avl;
        int		aux_count;
} spa_aux_t;

static int
spa_aux_compare(const void *a, const void *b)
{
        const spa_aux_t *sa = a;
        const spa_aux_t *sb = b;

        if (sa->aux_guid < sb->aux_guid)
                return (-1);
        else if (sa->aux_guid > sb->aux_guid)
                return (1);
        else
                return (0);
}

void
spa_aux_add(vdev_t *vd, avl_tree_t *avl)
{
        avl_index_t where;
        spa_aux_t search;
        spa_aux_t *aux;

        search.aux_guid = vd->vdev_guid;
        if ((aux = avl_find(avl, &search, &where)) != NULL) {
                aux->aux_count++;
        } else {
                aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
                aux->aux_guid = vd->vdev_guid;
                aux->aux_count = 1;
                avl_insert(avl, aux, where);
        }
}

void
spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
{
        spa_aux_t search;
        spa_aux_t *aux;
        avl_index_t where;

        search.aux_guid = vd->vdev_guid;
        aux = avl_find(avl, &search, &where);

        ASSERT(aux != NULL);

        if (--aux->aux_count == 0) {
                avl_remove(avl, aux);
                kmem_free(aux, sizeof (spa_aux_t));
        } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
                aux->aux_pool = 0ULL;
        }
}

boolean_t
spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
{
        spa_aux_t search, *found;

        search.aux_guid = guid;
        found = avl_find(avl, &search, NULL);

        if (pool) {
                if (found)
                        *pool = found->aux_pool;
                else
                        *pool = 0ULL;
        }

        if (refcnt) {
                if (found)
                        *refcnt = found->aux_count;
                else
                        *refcnt = 0;
        }

        return (found != NULL);
}

void
spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
{
        spa_aux_t search, *found;
        avl_index_t where;

        search.aux_guid = vd->vdev_guid;
        found = avl_find(avl, &search, &where);
        ASSERT(found != NULL);
        ASSERT(found->aux_pool == 0ULL);

        found->aux_pool = spa_guid(vd->vdev_spa);
}
/*
 * Spares are tracked globally due to the following constraints:
 *
 *	- A spare may be part of multiple pools.
 *	- A spare may be added to a pool even if it's actively in use within
 *	  another pool.
 *	- A spare in use in any pool can only be the source of a replacement if
 *	  the target is a spare in the same pool.
 *
 * We keep track of all spares on the system through the use of a reference
 * counted AVL tree.  When a vdev is added as a spare, or used as a replacement
 * spare, then we bump the reference count in the AVL tree.  In addition, we set
 * the 'vdev_isspare' member to indicate that the device is a spare (active or
 * inactive).  When a spare is made active (used to replace a device in the
 * pool), we also keep track of which pool it's been made a part of.
 *
 * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
 * called under the spa_namespace lock as part of vdev reconfiguration.  The
 * separate spare lock exists for the status query path, which does not need to
 * be completely consistent with respect to other vdev configuration changes.
 */

static int
spa_spare_compare(const void *a, const void *b)
{
        return (spa_aux_compare(a, b));
}

void
spa_spare_add(vdev_t *vd)
{
        mutex_enter(&spa_spare_lock);
        ASSERT(!vd->vdev_isspare);
        spa_aux_add(vd, &spa_spare_avl);
        vd->vdev_isspare = B_TRUE;
        mutex_exit(&spa_spare_lock);
}

void
spa_spare_remove(vdev_t *vd)
{
        mutex_enter(&spa_spare_lock);
        ASSERT(vd->vdev_isspare);
        spa_aux_remove(vd, &spa_spare_avl);
        vd->vdev_isspare = B_FALSE;
        mutex_exit(&spa_spare_lock);
}

boolean_t
spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
{
        boolean_t found;

        mutex_enter(&spa_spare_lock);
        found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
        mutex_exit(&spa_spare_lock);

        return (found);
}

void
spa_spare_activate(vdev_t *vd)
{
        mutex_enter(&spa_spare_lock);
        ASSERT(vd->vdev_isspare);
        spa_aux_activate(vd, &spa_spare_avl);
        mutex_exit(&spa_spare_lock);
}

/*
 * Level 2 ARC devices are tracked globally for the same reasons as spares.
 * Cache devices currently only support one pool per cache device, and so
 * for these devices the aux reference count is currently unused beyond 1.
 */
static int
spa_l2cache_compare(const void *a, const void *b)
{
        return (spa_aux_compare(a, b));
}

void
spa_l2cache_add(vdev_t *vd)
{
        mutex_enter(&spa_l2cache_lock);
        ASSERT(!vd->vdev_isl2cache);
        spa_aux_add(vd, &spa_l2cache_avl);
        vd->vdev_isl2cache = B_TRUE;
        mutex_exit(&spa_l2cache_lock);
}

void
spa_l2cache_remove(vdev_t *vd)
{
        mutex_enter(&spa_l2cache_lock);
        ASSERT(vd->vdev_isl2cache);
        spa_aux_remove(vd, &spa_l2cache_avl);
        vd->vdev_isl2cache = B_FALSE;
        mutex_exit(&spa_l2cache_lock);
}

boolean_t
spa_l2cache_exists(uint64_t guid, uint64_t *pool)
{
        boolean_t found;

        mutex_enter(&spa_l2cache_lock);
        found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
        mutex_exit(&spa_l2cache_lock);

        return (found);
}

void
spa_l2cache_activate(vdev_t *vd)
{
        mutex_enter(&spa_l2cache_lock);
        ASSERT(vd->vdev_isl2cache);
        spa_aux_activate(vd, &spa_l2cache_avl);
        mutex_exit(&spa_l2cache_lock);
}

/*
 * ==========================================================================
 * SPA vdev locking
 * ==========================================================================
 */

/*
 * Lock the given spa_t for the purpose of adding or removing a vdev.
 * Grabs the global spa_namespace_lock plus the spa config lock for writing.
 * It returns the next transaction group for the spa_t.
 */
uint64_t
spa_vdev_enter(spa_t *spa)
{
        mutex_enter(&spa->spa_vdev_top_lock);
        mutex_enter(&spa_namespace_lock);
        return (spa_vdev_config_enter(spa));
}

/*
 * Internal implementation for spa_vdev_enter().  Used when a vdev
 * operation requires multiple syncs (i.e. removing a device) while
 * keeping the spa_namespace_lock held.
 */
uint64_t
spa_vdev_config_enter(spa_t *spa)
{
        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);

        return (spa_last_synced_txg(spa) + 1);
}

/*
 * Used in combination with spa_vdev_config_enter() to allow the syncing
 * of multiple transactions without releasing the spa_namespace_lock.
 */
void
spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
{
        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        int config_changed = B_FALSE;

        ASSERT(txg > spa_last_synced_txg(spa));

        spa->spa_pending_vdev = NULL;

        /*
         * Reassess the DTLs.
         */
        vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);

        if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
                config_changed = B_TRUE;
                spa->spa_config_generation++;
        }

        /*
         * Verify the metaslab classes.
         */
        ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
        ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);

        spa_config_exit(spa, SCL_ALL, spa);

        /*
         * Panic the system if the specified tag requires it.  This
         * is useful for ensuring that configurations are updated
         * transactionally.
         */
        if (zio_injection_enabled)
                zio_handle_panic_injection(spa, tag, 0);

        /*
         * Note: this txg_wait_synced() is important because it ensures
         * that there won't be more than one config change per txg.
         * This allows us to use the txg as the generation number.
         */
        if (error == 0)
                txg_wait_synced(spa->spa_dsl_pool, txg);

        if (vd != NULL) {
                ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0);
                spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
                vdev_free(vd);
                spa_config_exit(spa, SCL_ALL, spa);
        }

        /*
         * If the config changed, update the config cache.
         */
        if (config_changed)
                spa_config_sync(spa, B_FALSE, B_TRUE);
}

/*
 * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
 * locking of spa_vdev_enter(), we also want to make sure the transactions
 * have synced to disk, and then update the global configuration cache with
 * the new information.
 */
int
spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
{
        spa_vdev_config_exit(spa, vd, txg, error, FTAG);
        mutex_exit(&spa_namespace_lock);
        mutex_exit(&spa->spa_vdev_top_lock);

        return (error);
}
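/*
 * Illustrative sketch (example only, not part of the original file): the
 * canonical shape of a vdev-configuration operation using the wrappers
 * above.  Real callers such as spa_vdev_add() follow this pattern; the
 * function name and the elided body are invented for illustration.
 */
#if 0	/* example only */
static int
example_vdev_config_op(spa_t *spa)
{
        uint64_t txg;
        int error = 0;

        txg = spa_vdev_enter(spa);	/* namespace + SCL_ALL as writer */

        /* ... modify the vdev tree here, setting 'error' on failure ... */

        /* undoes the locking, waits for txg to sync, updates the cache */
        return (spa_vdev_exit(spa, NULL, txg, error));
}
#endif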
/*
 * Lock the given spa_t for the purpose of changing vdev state.
 */
void
spa_vdev_state_enter(spa_t *spa, int oplocks)
{
        int locks = SCL_STATE_ALL | oplocks;

        /*
         * Root pools may need to read from the underlying devfs filesystem
         * when opening up a vdev.  Unfortunately if we're holding the
         * SCL_ZIO lock it will result in a deadlock when we try to issue
         * the read from the root filesystem.  Instead we "prefetch"
         * the associated vnodes that we need prior to opening the
         * underlying devices and cache them so that we can prevent
         * any I/O when we are doing the actual open.
         */
        if (spa_is_root(spa)) {
                int low = locks & ~(SCL_ZIO - 1);
                int high = locks & ~low;

                spa_config_enter(spa, high, spa, RW_WRITER);
                vdev_hold(spa->spa_root_vdev);
                spa_config_enter(spa, low, spa, RW_WRITER);
        } else {
                spa_config_enter(spa, locks, spa, RW_WRITER);
        }
        spa->spa_vdev_locks = locks;
}

int
spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
{
        boolean_t config_changed = B_FALSE;

        if (vd != NULL || error == 0)
                vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
                    0, 0, B_FALSE);

        if (vd != NULL) {
                vdev_state_dirty(vd->vdev_top);
                config_changed = B_TRUE;
                spa->spa_config_generation++;
        }

        if (spa_is_root(spa))
                vdev_rele(spa->spa_root_vdev);

        ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
        spa_config_exit(spa, spa->spa_vdev_locks, spa);

        /*
         * If anything changed, wait for it to sync.  This ensures that,
         * from the system administrator's perspective, zpool(1M) commands
         * are synchronous.  This is important for things like zpool offline:
         * when the command completes, you expect no further I/O from ZFS.
         */
        if (vd != NULL)
                txg_wait_synced(spa->spa_dsl_pool, 0);

        /*
         * If the config changed, update the config cache.
         */
        if (config_changed) {
                mutex_enter(&spa_namespace_lock);
                spa_config_sync(spa, B_FALSE, B_TRUE);
                mutex_exit(&spa_namespace_lock);
        }

        return (error);
}
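/*
 * Illustrative sketch (example only, not part of the original file): the
 * matching pattern for vdev *state* changes, in the style of
 * vdev_online()/vdev_offline().  The function name is invented; the lookup
 * via spa_lookup_by_guid() is assumed to be available as in spa.c.
 */
#if 0	/* example only */
static int
example_vdev_state_op(spa_t *spa, uint64_t guid)
{
        vdev_t *vd;

        spa_vdev_state_enter(spa, SCL_NONE);

        if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
                return (spa_vdev_state_exit(spa, NULL, ENODEV));

        /* ... change the vdev's state here ... */

        return (spa_vdev_state_exit(spa, vd, 0));
}
#endif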
/*
 * ==========================================================================
 * Miscellaneous functions
 * ==========================================================================
 */

void
spa_activate_mos_feature(spa_t *spa, const char *feature)
{
        (void) nvlist_add_boolean(spa->spa_label_features, feature);
        vdev_config_dirty(spa->spa_root_vdev);
}

void
spa_deactivate_mos_feature(spa_t *spa, const char *feature)
{
        (void) nvlist_remove_all(spa->spa_label_features, feature);
        vdev_config_dirty(spa->spa_root_vdev);
}

/*
 * Rename a spa_t.
 */
int
spa_rename(const char *name, const char *newname)
{
        spa_t *spa;
        int err;

        /*
         * Lookup the spa_t and grab the config lock for writing.  We need to
         * actually open the pool so that we can sync out the necessary labels.
         * It's OK to call spa_open() with the namespace lock held because we
         * allow recursive calls for other reasons.
         */
        mutex_enter(&spa_namespace_lock);
        if ((err = spa_open(name, &spa, FTAG)) != 0) {
                mutex_exit(&spa_namespace_lock);
                return (err);
        }

        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

        avl_remove(&spa_namespace_avl, spa);
        (void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
        avl_add(&spa_namespace_avl, spa);

        /*
         * Sync all labels to disk with the new names by marking the root vdev
         * dirty and waiting for it to sync.  It will pick up the new pool name
         * during the sync.
         */
        vdev_config_dirty(spa->spa_root_vdev);

        spa_config_exit(spa, SCL_ALL, FTAG);

        txg_wait_synced(spa->spa_dsl_pool, 0);

        /*
         * Sync the updated config cache.
         */
        spa_config_sync(spa, B_FALSE, B_TRUE);

        spa_close(spa, FTAG);

        mutex_exit(&spa_namespace_lock);

        return (0);
}

/*
 * Return the spa_t associated with given pool_guid, if it exists.  If
 * device_guid is non-zero, determine whether the pool exists *and* contains
 * a device with the specified device_guid.
 */
spa_t *
spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
{
        spa_t *spa;
        avl_tree_t *t = &spa_namespace_avl;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
                if (spa->spa_state == POOL_STATE_UNINITIALIZED)
                        continue;
                if (spa->spa_root_vdev == NULL)
                        continue;
                if (spa_guid(spa) == pool_guid) {
                        if (device_guid == 0)
                                break;

                        if (vdev_lookup_by_guid(spa->spa_root_vdev,
                            device_guid) != NULL)
                                break;

                        /*
                         * Check any devices we may be in the process of adding.
                         */
                        if (spa->spa_pending_vdev) {
                                if (vdev_lookup_by_guid(spa->spa_pending_vdev,
                                    device_guid) != NULL)
                                        break;
                        }
                }
        }

        return (spa);
}
/*
 * Determine whether a pool with the given pool_guid exists.
 */
boolean_t
spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
{
        return (spa_by_guid(pool_guid, device_guid) != NULL);
}

char *
spa_strdup(const char *s)
{
        size_t len;
        char *new;

        len = strlen(s);
        new = kmem_alloc(len + 1, KM_SLEEP);
        bcopy(s, new, len);
        new[len] = '\0';

        return (new);
}

void
spa_strfree(char *s)
{
        kmem_free(s, strlen(s) + 1);
}

uint64_t
spa_get_random(uint64_t range)
{
        uint64_t r;

        ASSERT(range != 0);

        (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));

        return (r % range);
}

uint64_t
spa_generate_guid(spa_t *spa)
{
        uint64_t guid = spa_get_random(-1ULL);

        if (spa != NULL) {
                while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
                        guid = spa_get_random(-1ULL);
        } else {
                while (guid == 0 || spa_guid_exists(guid, 0))
                        guid = spa_get_random(-1ULL);
        }

        return (guid);
}

void
sprintf_blkptr(char *buf, const blkptr_t *bp)
{
        char type[256];
        char *checksum = NULL;
        char *compress = NULL;

        if (bp != NULL) {
                if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
                        dmu_object_byteswap_t bswap =
                            DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
                        (void) snprintf(type, sizeof (type), "bswap %s %s",
                            DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
                            "metadata" : "data",
                            dmu_ot_byteswap[bswap].ob_name);
                } else {
                        (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
                            sizeof (type));
                }
                checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
                compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
        }

        SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress);
}

void
spa_freeze(spa_t *spa)
{
        uint64_t freeze_txg = 0;

        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
        if (spa->spa_freeze_txg == UINT64_MAX) {
                freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
                spa->spa_freeze_txg = freeze_txg;
        }
        spa_config_exit(spa, SCL_ALL, FTAG);
        if (freeze_txg != 0)
                txg_wait_synced(spa_get_dsl(spa), freeze_txg);
}

void
zfs_panic_recover(const char *fmt, ...)
{
        va_list adx;

        va_start(adx, fmt);
        vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
        va_end(adx);
}

/*
 * This is a stripped-down version of strtoull, suitable only for converting
 * lowercase hexadecimal numbers that don't overflow.
 */
uint64_t
zfs_strtonum(const char *str, char **nptr)
{
        uint64_t val = 0;
        char c;
        int digit;

        while ((c = *str) != '\0') {
                if (c >= '0' && c <= '9')
                        digit = c - '0';
                else if (c >= 'a' && c <= 'f')
                        digit = 10 + c - 'a';
                else
                        break;

                val *= 16;
                val += digit;

                str++;
        }

        if (nptr)
                *nptr = (char *)str;

        return (val);
}
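/*
 * Illustrative sketch (example only, not part of the original file):
 * zfs_strtonum() above parses lowercase hex with no "0x" prefix and no
 * overflow checking, stopping at the first non-hex character.  The
 * function name below is invented for illustration.
 */
#if 0	/* example only */
static void
example_strtonum(void)
{
        char *end;
        uint64_t v = zfs_strtonum("dead:beef", &end);

        ASSERT3U(v, ==, 0xdeadULL);	/* parsed "dead" */
        ASSERT3S(*end, ==, ':');	/* stopped at the ':' */
}
#endif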
/*
 * ==========================================================================
 * Accessor functions
 * ==========================================================================
 */

boolean_t
spa_shutting_down(spa_t *spa)
{
        return (spa->spa_async_suspended);
}

dsl_pool_t *
spa_get_dsl(spa_t *spa)
{
        return (spa->spa_dsl_pool);
}

boolean_t
spa_is_initializing(spa_t *spa)
{
        return (spa->spa_is_initializing);
}

blkptr_t *
spa_get_rootblkptr(spa_t *spa)
{
        return (&spa->spa_ubsync.ub_rootbp);
}

void
spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
{
        spa->spa_uberblock.ub_rootbp = *bp;
}

void
spa_altroot(spa_t *spa, char *buf, size_t buflen)
{
        if (spa->spa_root == NULL)
                buf[0] = '\0';
        else
                (void) strncpy(buf, spa->spa_root, buflen);
}

int
spa_sync_pass(spa_t *spa)
{
        return (spa->spa_sync_pass);
}

char *
spa_name(spa_t *spa)
{
        return (spa->spa_name);
}

uint64_t
spa_guid(spa_t *spa)
{
        dsl_pool_t *dp = spa_get_dsl(spa);
        uint64_t guid;

        /*
         * If we fail to parse the config during spa_load(), we can go through
         * the error path (which posts an ereport) and end up here with no root
         * vdev.  We stash the original pool guid in 'spa_config_guid' to handle
         * this case.
         */
        if (spa->spa_root_vdev == NULL)
                return (spa->spa_config_guid);

        guid = spa->spa_last_synced_guid != 0 ?
            spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;

        /*
         * Return the most recently synced out guid unless we're
         * in syncing context.
         */
        if (dp && dsl_pool_sync_context(dp))
                return (spa->spa_root_vdev->vdev_guid);
        else
                return (guid);
}

uint64_t
spa_load_guid(spa_t *spa)
{
        /*
         * This is a GUID that exists solely as a reference for the
         * purposes of the arc.  It is generated at load time, and
         * is never written to persistent storage.
         */
        return (spa->spa_load_guid);
}

uint64_t
spa_last_synced_txg(spa_t *spa)
{
        return (spa->spa_ubsync.ub_txg);
}

uint64_t
spa_first_txg(spa_t *spa)
{
        return (spa->spa_first_txg);
}

uint64_t
spa_syncing_txg(spa_t *spa)
{
        return (spa->spa_syncing_txg);
}

pool_state_t
spa_state(spa_t *spa)
{
        return (spa->spa_state);
}

spa_load_state_t
spa_load_state(spa_t *spa)
{
        return (spa->spa_load_state);
}

uint64_t
spa_freeze_txg(spa_t *spa)
{
        return (spa->spa_freeze_txg);
}

/* ARGSUSED */
uint64_t
spa_get_asize(spa_t *spa, uint64_t lsize)
{
        /*
         * The worst case is single-sector max-parity RAID-Z blocks, in which
         * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
         * times the size; so just assume that.  Add to this the fact that
         * we can have up to 3 DVAs per bp, and one more factor of 2 because
         * the block may be dittoed with up to 3 DVAs by ddt_sync().
         */
        return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
}
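/*
 * Worked example (not part of the original file): with
 * VDEV_RAIDZ_MAXPARITY == 3 and SPA_DVAS_PER_BP == 3, the worst-case
 * inflation computed above is (3 + 1) * 3 * 2 = 24, i.e. spa_get_asize()
 * reports up to 24x the logical size.
 */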
uint64_t
spa_get_dspace(spa_t *spa)
{
        return (spa->spa_dspace);
}

void
spa_update_dspace(spa_t *spa)
{
        spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
            ddt_get_dedup_dspace(spa);
}

/*
 * Return the failure mode that has been set for this pool.  The default
 * behavior will be to block all I/Os when a complete failure occurs.
 */
uint8_t
spa_get_failmode(spa_t *spa)
{
        return (spa->spa_failmode);
}

boolean_t
spa_suspended(spa_t *spa)
{
        return (spa->spa_suspended);
}

uint64_t
spa_version(spa_t *spa)
{
        return (spa->spa_ubsync.ub_version);
}

boolean_t
spa_deflate(spa_t *spa)
{
        return (spa->spa_deflate);
}

metaslab_class_t *
spa_normal_class(spa_t *spa)
{
        return (spa->spa_normal_class);
}

metaslab_class_t *
spa_log_class(spa_t *spa)
{
        return (spa->spa_log_class);
}

int
spa_max_replication(spa_t *spa)
{
        /*
         * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
         * handle BPs with more than one DVA allocated.  Set our max
         * replication level accordingly.
         */
        if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
                return (1);
        return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
}

int
spa_prev_software_version(spa_t *spa)
{
        return (spa->spa_prev_software_version);
}

uint64_t
spa_deadman_synctime(spa_t *spa)
{
        return (spa->spa_deadman_synctime);
}

uint64_t
dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
{
        uint64_t asize = DVA_GET_ASIZE(dva);
        uint64_t dsize = asize;

        ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

        if (asize != 0 && spa->spa_deflate) {
                vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
                dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
        }

        return (dsize);
}

uint64_t
bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
{
        uint64_t dsize = 0;

        for (int d = 0; d < SPA_DVAS_PER_BP; d++)
                dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);

        return (dsize);
}

uint64_t
bp_get_dsize(spa_t *spa, const blkptr_t *bp)
{
        uint64_t dsize = 0;

        spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

        for (int d = 0; d < SPA_DVAS_PER_BP; d++)
                dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);

        spa_config_exit(spa, SCL_VDEV, FTAG);

        return (dsize);
}

/*
 * ==========================================================================
 * Initialization and Termination
 * ==========================================================================
 */

static int
spa_name_compare(const void *a1, const void *a2)
{
        const spa_t *s1 = a1;
        const spa_t *s2 = a2;
        int s;

        s = strcmp(s1->spa_name, s2->spa_name);
        if (s > 0)
                return (1);
        if (s < 0)
                return (-1);
        return (0);
}

int
spa_busy(void)
{
        return (spa_active_count);
}

void
spa_boot_init()
{
        spa_config_load();
}

#ifdef _KERNEL
EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
#endif

void
spa_init(int mode)
{
        mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);

        avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
            offsetof(spa_t, spa_avl));

        avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
            offsetof(spa_aux_t, aux_avl));

        avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
            offsetof(spa_aux_t, aux_avl));

        spa_mode_global = mode;

#ifdef illumos
#ifdef _KERNEL
        spa_arch_init();
#else
        if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
                arc_procfd = open("/proc/self/ctl", O_WRONLY);
                if (arc_procfd == -1) {
                        perror("could not enable watchpoints: "
                            "opening /proc/self/ctl failed: ");
                } else {
                        arc_watch = B_TRUE;
                }
        }
#endif
#endif /* illumos */
        refcount_sysinit();
        unique_init();
        space_map_init();
        zio_init();
        lz4_init();
        dmu_init();
        zil_init();
        vdev_cache_stat_init();
        zfs_prop_init();
        zpool_prop_init();
        zpool_feature_init();
        spa_config_load();
        l2arc_start();
#ifndef illumos
#ifdef _KERNEL
        zfs_deadman_init();
#endif
#endif	/* !illumos */
}

void
spa_fini(void)
{
        l2arc_stop();

        spa_evict_all();

        vdev_cache_stat_fini();
        zil_fini();
        dmu_fini();
        lz4_fini();
        zio_fini();
        space_map_fini();
        unique_fini();
        refcount_fini();

        avl_destroy(&spa_namespace_avl);
        avl_destroy(&spa_spare_avl);
        avl_destroy(&spa_l2cache_avl);

        cv_destroy(&spa_namespace_cv);
        mutex_destroy(&spa_namespace_lock);
        mutex_destroy(&spa_spare_lock);
        mutex_destroy(&spa_l2cache_lock);
}

/*
 * Return whether this pool has slogs.  No locking needed.
 * It's not a problem if the wrong answer is returned as it's only for
 * performance and not correctness.
 */
boolean_t
spa_has_slogs(spa_t *spa)
{
        return (spa->spa_log_class->mc_rotor != NULL);
}

spa_log_state_t
spa_get_log_state(spa_t *spa)
{
        return (spa->spa_log_state);
}

void
spa_set_log_state(spa_t *spa, spa_log_state_t state)
{
        spa->spa_log_state = state;
}

boolean_t
spa_is_root(spa_t *spa)
{
        return (spa->spa_is_root);
}

boolean_t
spa_writeable(spa_t *spa)
{
        return (!!(spa->spa_mode & FWRITE));
}

int
spa_mode(spa_t *spa)
{
        return (spa->spa_mode);
}

uint64_t
spa_bootfs(spa_t *spa)
{
        return (spa->spa_bootfs);
}

uint64_t
spa_delegation(spa_t *spa)
{
        return (spa->spa_delegation);
}

objset_t *
spa_meta_objset(spa_t *spa)
{
        return (spa->spa_meta_objset);
}

enum zio_checksum
spa_dedup_checksum(spa_t *spa)
{
        return (spa->spa_dedup_checksum);
}
/*
 * Reset pool scan stats per scan pass (or reboot).
 */
void
spa_scan_stat_init(spa_t *spa)
{
        /* data not stored on disk */
        spa->spa_scan_pass_start = gethrestime_sec();
        spa->spa_scan_pass_exam = 0;
        vdev_scan_stat_init(spa->spa_root_vdev);
}

/*
 * Get scan stats for zpool status reports.
 */
int
spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
{
        dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;

        if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
                return (SET_ERROR(ENOENT));
        bzero(ps, sizeof (pool_scan_stat_t));

        /* data stored on disk */
        ps->pss_func = scn->scn_phys.scn_func;
        ps->pss_start_time = scn->scn_phys.scn_start_time;
        ps->pss_end_time = scn->scn_phys.scn_end_time;
        ps->pss_to_examine = scn->scn_phys.scn_to_examine;
        ps->pss_examined = scn->scn_phys.scn_examined;
        ps->pss_to_process = scn->scn_phys.scn_to_process;
        ps->pss_processed = scn->scn_phys.scn_processed;
        ps->pss_errors = scn->scn_phys.scn_errors;
        ps->pss_state = scn->scn_phys.scn_state;

        /* data not stored on disk */
        ps->pss_pass_start = spa->spa_scan_pass_start;
        ps->pss_pass_exam = spa->spa_scan_pass_exam;

        return (0);
}

boolean_t
spa_debug_enabled(spa_t *spa)
{
        return (spa->spa_debug);
}
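/*
 * Illustrative sketch (example only, not part of the original file):
 * consuming spa_scan_get_stats(), e.g. to compute scrub progress the way
 * "zpool status" style reporting would.  The function name is invented.
 */
#if 0	/* example only */
static void
example_print_scan_progress(spa_t *spa)
{
        pool_scan_stat_t ps;

        if (spa_scan_get_stats(spa, &ps) != 0)
                return;		/* ENOENT: no scan has ever run */

        if (ps.pss_to_examine != 0)
                zfs_dbgmsg("scan %llu%% done",
                    (u_longlong_t)(100 * ps.pss_examined / ps.pss_to_examine));
}
#endif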