spa_misc.c revision 318786
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/spa_boot.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_scan.h>
#include <sys/fs/zfs.h>
#include <sys/metaslab_impl.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include "zfs_prop.h"
#include <sys/zfeature.h>

/*
 * SPA locking
 *
 * There are four basic locks for managing spa_t structures:
 *
 * spa_namespace_lock (global mutex)
 *
 *	This lock must be acquired to do any of the following:
 *
 *	- Lookup a spa_t by name
 *	- Add or remove a spa_t from the namespace
 *	- Increase spa_refcount from zero
 *	- Check if spa_refcount is zero
 *	- Rename a spa_t
 *	- Add/remove/attach/detach devices
 *	- Held for the duration of create/destroy/import/export
 *
 *	It does not need to handle recursion.  A create or destroy may
 *	reference objects (files or zvols) in other pools, but by
 *	definition they must have an existing reference, and will never need
 *	to look up a spa_t by name.
 *
 * spa_refcount (per-spa refcount_t protected by mutex)
 *
 *	This reference count keeps track of any active users of the spa_t.  The
 *	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
 *	the refcount is never really 'zero' - opening a pool implicitly keeps
 *	some references in the DMU.  Internally we check against spa_minref,
 *	but present the image of a zero/non-zero value to consumers.
 *
 * spa_config_lock[] (per-spa array of rwlocks)
 *
 *	This protects the spa_t from config changes, and must be held in
 *	the following circumstances:
 *
 *	- RW_READER to perform I/O to the spa
 *	- RW_WRITER to change the vdev config
 *
 * The locking order is fairly straightforward:
 *
 *		spa_namespace_lock	->	spa_refcount
 *
 *	The namespace lock must be acquired to increase the refcount from 0
 *	or to check if it is zero.
 *
 *		spa_refcount		->	spa_config_lock[]
 *
 *	There must be at least one valid reference on the spa_t to acquire
 *	the config lock.
 *
 *		spa_namespace_lock	->	spa_config_lock[]
 *
 *	The namespace lock must always be taken before the config lock.
 *
 *
 * The spa_namespace_lock can be acquired directly and is globally visible.
 *
 * The namespace is manipulated using the following functions, all of which
 * require the spa_namespace_lock to be held.
 *
 *	spa_lookup()		Lookup a spa_t by name.
 *
 *	spa_add()		Create a new spa_t in the namespace.
 *
 *	spa_remove()		Remove a spa_t from the namespace.  This also
 *				frees up any memory associated with the spa_t.
 *
 *	spa_next()		Returns the next spa_t in the system, or the
 *				first if NULL is passed.
 *
 *	spa_evict_all()		Shutdown and remove all spa_t structures in
 *				the system.
 *
 *	spa_guid_exists()	Determine whether a pool/device guid exists.
 *
 * The spa_refcount is manipulated using the following functions:
 *
 *	spa_open_ref()		Adds a reference to the given spa_t.  Must be
 *				called with spa_namespace_lock held if the
 *				refcount is currently zero.
 *
 *	spa_close()		Remove a reference from the spa_t.  This will
 *				not free the spa_t or remove it from the
 *				namespace.  No locking is required.
 *
 *	spa_refcount_zero()	Returns true if the refcount is currently
 *				zero.  Must be called with spa_namespace_lock
 *				held.
 *
 * The spa_config_lock[] is an array of rwlocks, ordered as follows:
 * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
 * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
 *
 * To read the configuration, it suffices to hold one of these locks as reader.
 * To modify the configuration, you must hold all locks as writer.  To modify
 * vdev state without altering the vdev tree's topology (e.g. online/offline),
 * you must hold SCL_STATE and SCL_ZIO as writer.
 *
 * We use these distinct config locks to avoid recursive lock entry.
 * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
 * block allocations (SCL_ALLOC), which may require reading space maps
 * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
 *
 * The spa config locks cannot be normal rwlocks because we need the
 * ability to hand off ownership.  For example, SCL_ZIO is acquired
 * by the issuing thread and later released by an interrupt thread.
 * They do, however, obey the usual write-wanted semantics to prevent
 * writer (i.e. system administrator) starvation.
 *
 * The lock acquisition rules are as follows:
 *
 * SCL_CONFIG
 *	Protects changes to the vdev tree topology, such as vdev
 *	add/remove/attach/detach.  Protects the dirty config list
 *	(spa_config_dirty_list) and the set of spares and l2arc devices.
 *
 * SCL_STATE
 *	Protects changes to pool state and vdev state, such as vdev
 *	online/offline/fault/degrade/clear.  Protects the dirty state list
 *	(spa_state_dirty_list) and global pool state (spa_state).
 *
 * SCL_ALLOC
 *	Protects changes to metaslab groups and classes.
 *	Held as reader by metaslab_alloc() and metaslab_claim().
 *
 * SCL_ZIO
 *	Held by bp-level zios (those which have no io_vd upon entry)
 *	to prevent changes to the vdev tree.  The bp-level zio implicitly
 *	protects all of its vdev child zios, which do not hold SCL_ZIO.
 *
 * SCL_FREE
 *	Protects changes to metaslab groups and classes.
 *	Held as reader by metaslab_free().  SCL_FREE is distinct from
 *	SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
 *	blocks in zio_done() while another i/o that holds either
 *	SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
 *
 * SCL_VDEV
 *	Held as reader to prevent changes to the vdev tree during trivial
 *	inquiries such as bp_get_dsize().  SCL_VDEV is distinct from the
 *	other locks, and lower than all of them, to ensure that it's safe
 *	to acquire regardless of caller context.
 *
 * In addition, the following rules apply:
 *
 * (a)	spa_props_lock protects pool properties, spa_config and
 *	spa_config_list.  The lock ordering is SCL_CONFIG > spa_props_lock.
 *
 * (b)	I/O operations on leaf vdevs.  For any zio operation that takes
 *	an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
 *	or zio_write_phys() -- the caller must ensure that the config cannot
 *	change in the interim, and that the vdev cannot be reopened.
 *	SCL_STATE as reader suffices for both.
 *
 * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
 *
 *	spa_vdev_enter()	Acquire the namespace lock and the config lock
 *				for writing.
 *
 *	spa_vdev_exit()		Release the config lock, wait for all I/O
 *				to complete, sync the updated configs to the
 *				cache, and release the namespace lock.
 *
 * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
 * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
 * locking is, always, based on spa_namespace_lock and spa_config_lock[].
 *
 * spa_rename() is also implemented within this file since it requires
 * manipulation of the namespace.
 */
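
/*
 * Editor's illustrative sketch (not part of the original source): a trivial
 * read-side inquiry takes one of the config locks above as reader around
 * the work -- compare bp_get_dsize() later in this file:
 *
 *	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 *	... inspect the vdev tree ...
 *	spa_config_exit(spa, SCL_VDEV, FTAG);
 *
 * Code that must already hold a lock can assert it with spa_config_held(),
 * e.g. ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0), as
 * dva_get_dsize_sync() does below.
 */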

static avl_tree_t spa_namespace_avl;
kmutex_t spa_namespace_lock;
static kcondvar_t spa_namespace_cv;
static int spa_active_count;
int spa_max_replication_override = SPA_DVAS_PER_BP;

static kmutex_t spa_spare_lock;
static avl_tree_t spa_spare_avl;
static kmutex_t spa_l2cache_lock;
static avl_tree_t spa_l2cache_avl;

kmem_cache_t *spa_buffer_pool;
int spa_mode_global;

#ifdef ZFS_DEBUG
/* Everything except dprintf and spa is on by default in debug builds */
int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA);
#else
int zfs_flags = 0;
#endif
SYSCTL_DECL(_debug);
TUNABLE_INT("debug.zfs_flags", &zfs_flags);
SYSCTL_INT(_debug, OID_AUTO, zfs_flags, CTLFLAG_RWTUN, &zfs_flags, 0,
    "ZFS debug flags.");

/*
 * zfs_recover can be set to nonzero to attempt to recover from
 * otherwise-fatal errors, typically caused by on-disk corruption.  When
 * set, calls to zfs_panic_recover() will turn into warning messages.
 * This should only be used as a last resort, as it typically results
 * in leaked space, or worse.
 */
boolean_t zfs_recover = B_FALSE;
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.recover", &zfs_recover);
SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0,
    "Try to recover from otherwise-fatal errors.");

static int
sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS)
{
	int err, val;

	val = zfs_flags;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	/*
	 * ZFS_DEBUG_MODIFY must be enabled prior to boot so all
	 * arc buffers in the system have the necessary additional
	 * checksum data.  However, it is safe to disable at any
	 * time.
	 */
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		val &= ~ZFS_DEBUG_MODIFY;
	zfs_flags = val;

	return (0);
}
TUNABLE_INT("vfs.zfs.debugflags", &zfs_flags);
SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
    sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, debug_flags,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
    sysctl_vfs_zfs_debug_flags, "IU",
    "Debug flags for ZFS testing (deprecated, see vfs.zfs.debugflags).");

/*
 * If destroy encounters an EIO while reading metadata (e.g. indirect
 * blocks), space referenced by the missing metadata can not be freed.
 * Normally this causes the background destroy to become "stalled", as
 * it is unable to make forward progress.  While in this stalled state,
 * all remaining space to free from the error-encountering filesystem is
 * "temporarily leaked".  Set this flag to cause it to ignore the EIO,
 * permanently leak the space from indirect blocks that can not be read,
 * and continue to free everything else that it can.
 *
 * The default, "stalling" behavior is useful if the storage partially
 * fails (i.e. some but not all i/os fail), and then later recovers.  In
 * this case, we will be able to continue pool operations while it is
 * partially failed, and when it recovers, we can continue to free the
 * space, with no leaks.  However, note that this case is actually
 * fairly rare.
 *
 * Typically pools either (a) fail completely (but perhaps temporarily,
 * e.g. a top-level vdev going offline), or (b) have localized,
 * permanent errors (e.g. disk returns the wrong data due to bit flip or
 * firmware bug).  In case (a), this setting does not matter because the
 * pool will be suspended and the sync thread will not be able to make
 * forward progress regardless.  In case (b), because the error is
 * permanent, the best we can do is leak the minimum amount of space,
 * which is what setting this flag will do.  Therefore, it is reasonable
 * for this flag to normally be set, but we chose the more conservative
 * approach of not setting it, so that there is no possibility of
 * leaking space in the "partial temporary" failure case.
 */
boolean_t zfs_free_leak_on_eio = B_FALSE;

/*
 * Expiration time in milliseconds.  This value has two meanings.  First,
 * it is used to determine when the spa_deadman() logic should fire.  By
 * default spa_deadman() fires if spa_sync() has not completed in 1000
 * seconds.  Secondly, the value determines if an I/O is considered "hung".
 * Any I/O that has not completed in zfs_deadman_synctime_ms is considered
 * "hung" resulting in a system panic.
 */
uint64_t zfs_deadman_synctime_ms = 1000000ULL;
TUNABLE_QUAD("vfs.zfs.deadman_synctime_ms", &zfs_deadman_synctime_ms);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN,
    &zfs_deadman_synctime_ms, 0,
    "Stalled ZFS I/O expiration time in milliseconds");

/*
 * Check time in milliseconds.  This defines the frequency at which we check
 * for hung I/O.
 */
uint64_t zfs_deadman_checktime_ms = 5000ULL;
TUNABLE_QUAD("vfs.zfs.deadman_checktime_ms", &zfs_deadman_checktime_ms);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN,
    &zfs_deadman_checktime_ms, 0,
    "Period of checks for stalled ZFS I/O in milliseconds");

/*
 * Default value of -1 for zfs_deadman_enabled is resolved in
 * zfs_deadman_init().
 */
int zfs_deadman_enabled = -1;
TUNABLE_INT("vfs.zfs.deadman_enabled", &zfs_deadman_enabled);
SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN,
    &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");

/*
 * The worst case is single-sector max-parity RAID-Z blocks, in which
 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
 * times the size; so just assume that.  Add to this the fact that
 * we can have up to 3 DVAs per bp, and one more factor of 2 because
 * the block may be dittoed with up to 3 DVAs by ddt_sync().  All together,
 * the worst case is:
 *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
 */
int spa_asize_inflation = 24;
TUNABLE_INT("vfs.zfs.spa_asize_inflation", &spa_asize_inflation);
SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN,
    &spa_asize_inflation, 0,
    "Worst case inflation factor for single sector writes");
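
/*
 * Editor's worked example: with raidz3 parity, VDEV_RAIDZ_MAXPARITY + 1 == 4
 * sectors are written for every single-sector block; a bp may carry
 * SPA_DVAS_PER_BP == 3 copies, giving 4 * 3 == 12; and the ddt_sync()
 * dittoing doubles that to 4 * 3 * 2 == 24, the default value of
 * spa_asize_inflation above.
 */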

#ifndef illumos
#ifdef _KERNEL
static void
zfs_deadman_init()
{
	/*
	 * If we are not on i386 or amd64, or are running in a virtual
	 * machine, disable the ZFS deadman thread by default.
	 */
	if (zfs_deadman_enabled == -1) {
#if defined(__amd64__) || defined(__i386__)
		zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0;
#else
		zfs_deadman_enabled = 0;
#endif
	}
}
#endif	/* _KERNEL */
#endif	/* !illumos */

/*
 * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
 * the pool to be consumed.  This ensures that we don't run the pool
 * completely out of space, due to unaccounted changes (e.g. to the MOS).
 * It also limits the worst-case time to allocate space.  If we have
 * less than this amount of free space, most ZPL operations (e.g. write,
 * create) will return ENOSPC.
 *
 * Certain operations (e.g. file removal, most administrative actions) can
 * use half the slop space.  They will only return ENOSPC if less than half
 * the slop space is free.  Typically, once the pool has less than the slop
 * space free, the user will use these operations to free up space in the
 * pool.  These are the operations that call dsl_pool_adjustedsize() with
 * the netfree argument set to TRUE.
 *
 * A very restricted set of operations are always permitted, regardless of
 * the amount of free space.  These are the operations that call
 * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy".  If these
 * operations result in a net increase in the amount of space used,
 * it is possible to run the pool completely out of space, causing it to
 * be permanently read-only.
 *
 * Note that on very small pools, the slop space will be larger than
 * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
 * but we never allow it to be more than half the pool size.
 *
 * See also the comments in zfs_space_check_t.
 */
int spa_slop_shift = 5;
SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN,
    &spa_slop_shift, 0,
    "Shift value of reserved space (1/(2^spa_slop_shift)).");
uint64_t spa_min_slop = 128 * 1024 * 1024;
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN,
    &spa_min_slop, 0,
    "Minimal value of reserved space");
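
/*
 * Editor's worked example (not from the original source): on a 10TB pool,
 * 10TB >> 5 leaves 320GB of slop; on a 1GB pool the 1/32 fraction (32MB)
 * falls below spa_min_slop, so the slop grows to MIN(512MB, 128MB) ==
 * 128MB.  See spa_get_slop_space() below for the actual computation.
 */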

/*
 * ==========================================================================
 * SPA config locking
 * ==========================================================================
 */
static void
spa_config_lock_init(spa_t *spa)
{
	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
		refcount_create_untracked(&scl->scl_count);
		scl->scl_writer = NULL;
		scl->scl_write_wanted = 0;
	}
}

static void
spa_config_lock_destroy(spa_t *spa)
{
	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		mutex_destroy(&scl->scl_lock);
		cv_destroy(&scl->scl_cv);
		refcount_destroy(&scl->scl_count);
		ASSERT(scl->scl_writer == NULL);
		ASSERT(scl->scl_write_wanted == 0);
	}
}

int
spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
{
	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (!(locks & (1 << i)))
			continue;
		mutex_enter(&scl->scl_lock);
		if (rw == RW_READER) {
			if (scl->scl_writer || scl->scl_write_wanted) {
				mutex_exit(&scl->scl_lock);
				spa_config_exit(spa, locks & ((1 << i) - 1),
				    tag);
				return (0);
			}
		} else {
			ASSERT(scl->scl_writer != curthread);
			if (!refcount_is_zero(&scl->scl_count)) {
				mutex_exit(&scl->scl_lock);
				spa_config_exit(spa, locks & ((1 << i) - 1),
				    tag);
				return (0);
			}
			scl->scl_writer = curthread;
		}
		(void) refcount_add(&scl->scl_count, tag);
		mutex_exit(&scl->scl_lock);
	}
	return (1);
}
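
/*
 * Editor's illustrative sketch: spa_config_tryenter() either takes all of
 * the requested locks or none of them -- on failure it backs out whatever
 * it already took (the locks & ((1 << i) - 1) mask above) -- so callers
 * can use it where blocking is not allowed:
 *
 *	if (spa_config_tryenter(spa, SCL_ZIO, FTAG, RW_READER)) {
 *		... nonblocking work ...
 *		spa_config_exit(spa, SCL_ZIO, FTAG);
 *	}
 */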

void
spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
{
	int wlocks_held = 0;

	ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);

	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (scl->scl_writer == curthread)
			wlocks_held |= (1 << i);
		if (!(locks & (1 << i)))
			continue;
		mutex_enter(&scl->scl_lock);
		if (rw == RW_READER) {
			while (scl->scl_writer || scl->scl_write_wanted) {
				cv_wait(&scl->scl_cv, &scl->scl_lock);
			}
		} else {
			ASSERT(scl->scl_writer != curthread);
			while (!refcount_is_zero(&scl->scl_count)) {
				scl->scl_write_wanted++;
				cv_wait(&scl->scl_cv, &scl->scl_lock);
				scl->scl_write_wanted--;
			}
			scl->scl_writer = curthread;
		}
		(void) refcount_add(&scl->scl_count, tag);
		mutex_exit(&scl->scl_lock);
	}
	ASSERT(wlocks_held <= locks);
}

void
spa_config_exit(spa_t *spa, int locks, void *tag)
{
	for (int i = SCL_LOCKS - 1; i >= 0; i--) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (!(locks & (1 << i)))
			continue;
		mutex_enter(&scl->scl_lock);
		ASSERT(!refcount_is_zero(&scl->scl_count));
		if (refcount_remove(&scl->scl_count, tag) == 0) {
			ASSERT(scl->scl_writer == NULL ||
			    scl->scl_writer == curthread);
			scl->scl_writer = NULL;	/* OK in either case */
			cv_broadcast(&scl->scl_cv);
		}
		mutex_exit(&scl->scl_lock);
	}
}

int
spa_config_held(spa_t *spa, int locks, krw_t rw)
{
	int locks_held = 0;

	for (int i = 0; i < SCL_LOCKS; i++) {
		spa_config_lock_t *scl = &spa->spa_config_lock[i];
		if (!(locks & (1 << i)))
			continue;
		if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) ||
		    (rw == RW_WRITER && scl->scl_writer == curthread))
			locks_held |= 1 << i;
	}

	return (locks_held);
}

/*
 * ==========================================================================
 * SPA namespace functions
 * ==========================================================================
 */

/*
 * Lookup the named spa_t in the AVL tree.  The spa_namespace_lock must be
 * held.  Returns NULL if no matching spa_t is found.
 */
spa_t *
spa_lookup(const char *name)
{
	static spa_t search;	/* spa_t is large; don't allocate on stack */
	spa_t *spa;
	avl_index_t where;
	char *cp;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));

	/*
	 * If it's a full dataset name, figure out the pool name and
	 * just use that.
	 */
	cp = strpbrk(search.spa_name, "/@#");
	if (cp != NULL)
		*cp = '\0';

	spa = avl_find(&spa_namespace_avl, &search, &where);

	return (spa);
}
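
/*
 * For example (editor's note), spa_lookup("tank/home@yesterday") truncates
 * the search key at the first '/', '@' or '#' and looks up the pool "tank";
 * the same spa_t is returned for any dataset, snapshot or bookmark name
 * within that pool.
 */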

/*
 * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
 * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
 * looking for potentially hung I/Os.
 */
static void
spa_deadman(void *arg, int pending)
{
	spa_t *spa = arg;

	/*
	 * Disable the deadman timer if the pool is suspended.
	 */
	if (spa_suspended(spa)) {
#ifdef illumos
		VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
#else
		/* Nothing; just don't schedule any future callouts. */
#endif
		return;
	}

	zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
	    (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
	    ++spa->spa_deadman_calls);
	if (zfs_deadman_enabled)
		vdev_deadman(spa->spa_root_vdev);
#ifdef __FreeBSD__
#ifdef _KERNEL
	callout_schedule(&spa->spa_deadman_cycid,
	    hz * zfs_deadman_checktime_ms / MILLISEC);
#endif
#endif
}

#if defined(__FreeBSD__) && defined(_KERNEL)
static void
spa_deadman_timeout(void *arg)
{
	spa_t *spa = arg;

	taskqueue_enqueue(taskqueue_thread, &spa->spa_deadman_task);
}
#endif

/*
 * Create an uninitialized spa_t with the given name.  Requires
 * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
 * exist by calling spa_lookup() first.
 */
spa_t *
spa_add(const char *name, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	spa_config_dirent_t *dp;
#ifdef illumos
	cyc_handler_t hdlr;
	cyc_time_t when;
#endif

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);

	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);

	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);

	for (int t = 0; t < TXG_SIZE; t++)
		bplist_create(&spa->spa_free_bplist[t]);

	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
	spa->spa_state = POOL_STATE_UNINITIALIZED;
	spa->spa_freeze_txg = UINT64_MAX;
	spa->spa_final_txg = UINT64_MAX;
	spa->spa_load_max_txg = UINT64_MAX;
	spa->spa_proc = &p0;
	spa->spa_proc_state = SPA_PROC_NONE;

#ifdef illumos
	hdlr.cyh_func = spa_deadman;
	hdlr.cyh_arg = spa;
	hdlr.cyh_level = CY_LOW_LEVEL;
#endif

	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);

#ifdef illumos
	/*
	 * This determines how often we need to check for hung I/Os after
	 * the cyclic has already fired.  Since checking for hung I/Os is
	 * an expensive operation we don't want to check too frequently.
	 * Instead wait for 5 seconds before checking again.
	 */
	when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
	when.cyt_when = CY_INFINITY;
	mutex_enter(&cpu_lock);
	spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
	mutex_exit(&cpu_lock);
#else	/* !illumos */
#ifdef _KERNEL
	/*
	 * callout(9) does not provide a way to initialize a callout with
	 * a function and an argument, so we use callout_reset() to schedule
	 * the callout in the very distant future.  Even if that event ever
	 * fires, it should be okay as we won't have any active zios.
	 * But normally spa_sync() will reschedule the callout with a proper
	 * timeout.
	 * callout(9) does not allow the callback function to sleep but
	 * vdev_deadman() needs to acquire vq_lock and illumos mutexes are
	 * emulated using sx(9).  For this reason spa_deadman_timeout()
	 * will schedule spa_deadman() as task on a taskqueue that allows
	 * sleeping.
	 */
	TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa);
	callout_init(&spa->spa_deadman_cycid, 1);
	callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0,
	    spa_deadman_timeout, spa, 0);
#endif
#endif
	refcount_create(&spa->spa_refcount);
	spa_config_lock_init(spa);

	avl_add(&spa_namespace_avl, spa);

	/*
	 * Set the alternate root, if there is one.
	 */
	if (altroot) {
		spa->spa_root = spa_strdup(altroot);
		spa_active_count++;
	}

	avl_create(&spa->spa_alloc_tree, zio_timestamp_compare,
	    sizeof (zio_t), offsetof(zio_t, io_alloc_node));

	/*
	 * Every pool starts with the default cachefile
	 */
	list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
	    offsetof(spa_config_dirent_t, scd_link));

	dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
	dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
	list_insert_head(&spa->spa_config_list, dp);

	VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
	    KM_SLEEP) == 0);

	if (config != NULL) {
		nvlist_t *features;

		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
		    &features) == 0) {
			VERIFY(nvlist_dup(features, &spa->spa_label_features,
			    0) == 0);
		}

		VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
	}

	if (spa->spa_label_features == NULL) {
		VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
	}

	spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);

	spa->spa_min_ashift = INT_MAX;
	spa->spa_max_ashift = 0;

	/*
	 * As a pool is being created, treat all features as disabled by
	 * setting SPA_FEATURE_DISABLED for all entries in the feature
	 * refcount cache.
	 */
	for (int i = 0; i < SPA_FEATURES; i++) {
		spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
	}

	return (spa);
}

/*
 * Removes a spa_t from the namespace, freeing up any memory used.  Requires
 * spa_namespace_lock.  This is called only after the spa_t has been closed
 * and deactivated.
 */
void
spa_remove(spa_t *spa)
{
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
	ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0);

	nvlist_free(spa->spa_config_splitting);

	avl_remove(&spa_namespace_avl, spa);
	cv_broadcast(&spa_namespace_cv);

	if (spa->spa_root) {
		spa_strfree(spa->spa_root);
		spa_active_count--;
	}

	while ((dp = list_head(&spa->spa_config_list)) != NULL) {
		list_remove(&spa->spa_config_list, dp);
		if (dp->scd_path != NULL)
			spa_strfree(dp->scd_path);
		kmem_free(dp, sizeof (spa_config_dirent_t));
	}

	avl_destroy(&spa->spa_alloc_tree);
	list_destroy(&spa->spa_config_list);

	nvlist_free(spa->spa_label_features);
	nvlist_free(spa->spa_load_info);
	spa_config_set(spa, NULL);

#ifdef illumos
	mutex_enter(&cpu_lock);
	if (spa->spa_deadman_cycid != CYCLIC_NONE)
		cyclic_remove(spa->spa_deadman_cycid);
	mutex_exit(&cpu_lock);
	spa->spa_deadman_cycid = CYCLIC_NONE;
#else	/* !illumos */
#ifdef _KERNEL
	callout_drain(&spa->spa_deadman_cycid);
	taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task);
#endif
#endif

	refcount_destroy(&spa->spa_refcount);

	spa_config_lock_destroy(spa);

	for (int t = 0; t < TXG_SIZE; t++)
		bplist_destroy(&spa->spa_free_bplist[t]);

	zio_checksum_templates_free(spa);

	cv_destroy(&spa->spa_async_cv);
	cv_destroy(&spa->spa_evicting_os_cv);
	cv_destroy(&spa->spa_proc_cv);
	cv_destroy(&spa->spa_scrub_io_cv);
	cv_destroy(&spa->spa_suspend_cv);

	mutex_destroy(&spa->spa_alloc_lock);
	mutex_destroy(&spa->spa_async_lock);
	mutex_destroy(&spa->spa_errlist_lock);
	mutex_destroy(&spa->spa_errlog_lock);
	mutex_destroy(&spa->spa_evicting_os_lock);
	mutex_destroy(&spa->spa_history_lock);
	mutex_destroy(&spa->spa_proc_lock);
	mutex_destroy(&spa->spa_props_lock);
	mutex_destroy(&spa->spa_cksum_tmpls_lock);
	mutex_destroy(&spa->spa_scrub_lock);
	mutex_destroy(&spa->spa_suspend_lock);
	mutex_destroy(&spa->spa_vdev_top_lock);

	kmem_free(spa, sizeof (spa_t));
}

/*
 * Given a pool, return the next pool in the namespace, or NULL if there is
 * none.  If 'prev' is NULL, return the first pool.
 */
spa_t *
spa_next(spa_t *prev)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	if (prev)
		return (AVL_NEXT(&spa_namespace_avl, prev));
	else
		return (avl_first(&spa_namespace_avl));
}
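
/*
 * Editor's illustrative sketch: walking every pool in the namespace under
 * the lock that spa_next() asserts:
 *
 *	spa_t *spa = NULL;
 *
 *	mutex_enter(&spa_namespace_lock);
 *	while ((spa = spa_next(spa)) != NULL)
 *		... examine spa ...
 *	mutex_exit(&spa_namespace_lock);
 */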

/*
 * ==========================================================================
 * SPA refcount functions
 * ==========================================================================
 */

/*
 * Add a reference to the given spa_t.  Must have at least one reference, or
 * have the namespace lock held.
 */
void
spa_open_ref(spa_t *spa, void *tag)
{
	ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
	    MUTEX_HELD(&spa_namespace_lock));
	(void) refcount_add(&spa->spa_refcount, tag);
}

/*
 * Remove a reference to the given spa_t.  Must have at least one reference,
 * or have the namespace lock held.
 */
void
spa_close(spa_t *spa, void *tag)
{
	ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref ||
	    MUTEX_HELD(&spa_namespace_lock));
	(void) refcount_remove(&spa->spa_refcount, tag);
}

/*
 * Remove a reference to the given spa_t held by a dsl dir that is
 * being asynchronously released.  Async releases occur from a taskq
 * performing eviction of dsl datasets and dirs.  The namespace lock
 * isn't held and the hold by the object being evicted may contribute to
 * spa_minref (e.g. dataset or directory released during pool export),
 * so the asserts in spa_close() do not apply.
 */
void
spa_async_close(spa_t *spa, void *tag)
{
	(void) refcount_remove(&spa->spa_refcount, tag);
}

/*
 * Check to see if the spa refcount is zero.  Must be called with
 * spa_namespace_lock held.  We really compare against spa_minref, which is
 * the number of references acquired when opening a pool.
 */
boolean_t
spa_refcount_zero(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	return (refcount_count(&spa->spa_refcount) == spa->spa_minref);
}
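
/*
 * Editor's illustrative sketch (the pool name is hypothetical): a consumer
 * that has looked up a pool under the namespace lock takes a hold before
 * dropping the lock, and releases it when done:
 *
 *	mutex_enter(&spa_namespace_lock);
 *	if ((spa = spa_lookup("tank")) != NULL)
 *		spa_open_ref(spa, FTAG);
 *	mutex_exit(&spa_namespace_lock);
 *	...
 *	spa_close(spa, FTAG);
 */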

/*
 * ==========================================================================
 * SPA spare and l2cache tracking
 * ==========================================================================
 */

/*
 * Hot spares and cache devices are tracked using the same code below,
 * for 'auxiliary' devices.
 */

typedef struct spa_aux {
	uint64_t	aux_guid;
	uint64_t	aux_pool;
	avl_node_t	aux_avl;
	int		aux_count;
} spa_aux_t;

static int
spa_aux_compare(const void *a, const void *b)
{
	const spa_aux_t *sa = a;
	const spa_aux_t *sb = b;

	if (sa->aux_guid < sb->aux_guid)
		return (-1);
	else if (sa->aux_guid > sb->aux_guid)
		return (1);
	else
		return (0);
}

void
spa_aux_add(vdev_t *vd, avl_tree_t *avl)
{
	avl_index_t where;
	spa_aux_t search;
	spa_aux_t *aux;

	search.aux_guid = vd->vdev_guid;
	if ((aux = avl_find(avl, &search, &where)) != NULL) {
		aux->aux_count++;
	} else {
		aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
		aux->aux_guid = vd->vdev_guid;
		aux->aux_count = 1;
		avl_insert(avl, aux, where);
	}
}

void
spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
{
	spa_aux_t search;
	spa_aux_t *aux;
	avl_index_t where;

	search.aux_guid = vd->vdev_guid;
	aux = avl_find(avl, &search, &where);

	ASSERT(aux != NULL);

	if (--aux->aux_count == 0) {
		avl_remove(avl, aux);
		kmem_free(aux, sizeof (spa_aux_t));
	} else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
		aux->aux_pool = 0ULL;
	}
}

boolean_t
spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
{
	spa_aux_t search, *found;

	search.aux_guid = guid;
	found = avl_find(avl, &search, NULL);

	if (pool) {
		if (found)
			*pool = found->aux_pool;
		else
			*pool = 0ULL;
	}

	if (refcnt) {
		if (found)
			*refcnt = found->aux_count;
		else
			*refcnt = 0;
	}

	return (found != NULL);
}

void
spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
{
	spa_aux_t search, *found;
	avl_index_t where;

	search.aux_guid = vd->vdev_guid;
	found = avl_find(avl, &search, &where);
	ASSERT(found != NULL);
	ASSERT(found->aux_pool == 0ULL);

	found->aux_pool = spa_guid(vd->vdev_spa);
}

/*
 * Spares are tracked globally due to the following constraints:
 *
 *	- A spare may be part of multiple pools.
 *	- A spare may be added to a pool even if it's actively in use within
 *	  another pool.
 *	- A spare in use in any pool can only be the source of a replacement
 *	  if the target is a spare in the same pool.
 *
 * We keep track of all spares on the system through the use of a reference
 * counted AVL tree.  When a vdev is added as a spare, or used as a
 * replacement spare, then we bump the reference count in the AVL tree.  In
 * addition, we set the 'vdev_isspare' member to indicate that the device is
 * a spare (active or inactive).  When a spare is made active (used to
 * replace a device in the pool), we also keep track of which pool it's been
 * made a part of.
 *
 * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
 * called under the spa_namespace lock as part of vdev reconfiguration.  The
 * separate spare lock exists for the status query path, which does not need
 * to be completely consistent with respect to other vdev configuration
 * changes.
 */

static int
spa_spare_compare(const void *a, const void *b)
{
	return (spa_aux_compare(a, b));
}

void
spa_spare_add(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(!vd->vdev_isspare);
	spa_aux_add(vd, &spa_spare_avl);
	vd->vdev_isspare = B_TRUE;
	mutex_exit(&spa_spare_lock);
}

void
spa_spare_remove(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(vd->vdev_isspare);
	spa_aux_remove(vd, &spa_spare_avl);
	vd->vdev_isspare = B_FALSE;
	mutex_exit(&spa_spare_lock);
}

boolean_t
spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
{
	boolean_t found;

	mutex_enter(&spa_spare_lock);
	found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
	mutex_exit(&spa_spare_lock);

	return (found);
}

void
spa_spare_activate(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(vd->vdev_isspare);
	spa_aux_activate(vd, &spa_spare_avl);
	mutex_exit(&spa_spare_lock);
}
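
/*
 * Editor's illustrative sketch: a status path can ask whether a device guid
 * is a known spare and, if so, which pool (if any) it is actively
 * protecting:
 *
 *	uint64_t pool;
 *	int refcnt;
 *
 *	if (spa_spare_exists(guid, &pool, &refcnt) && pool != 0ULL)
 *		... guid is a spare active in the pool with that guid ...
 */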

/*
 * Level 2 ARC devices are tracked globally for the same reasons as spares.
 * Cache devices currently only support one pool per cache device, and so
 * for these devices the aux reference count is currently unused beyond 1.
 */

static int
spa_l2cache_compare(const void *a, const void *b)
{
	return (spa_aux_compare(a, b));
}

void
spa_l2cache_add(vdev_t *vd)
{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(!vd->vdev_isl2cache);
	spa_aux_add(vd, &spa_l2cache_avl);
	vd->vdev_isl2cache = B_TRUE;
	mutex_exit(&spa_l2cache_lock);
}

void
spa_l2cache_remove(vdev_t *vd)
{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(vd->vdev_isl2cache);
	spa_aux_remove(vd, &spa_l2cache_avl);
	vd->vdev_isl2cache = B_FALSE;
	mutex_exit(&spa_l2cache_lock);
}

boolean_t
spa_l2cache_exists(uint64_t guid, uint64_t *pool)
{
	boolean_t found;

	mutex_enter(&spa_l2cache_lock);
	found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
	mutex_exit(&spa_l2cache_lock);

	return (found);
}

void
spa_l2cache_activate(vdev_t *vd)
{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(vd->vdev_isl2cache);
	spa_aux_activate(vd, &spa_l2cache_avl);
	mutex_exit(&spa_l2cache_lock);
}

/*
 * ==========================================================================
 * SPA vdev locking
 * ==========================================================================
 */

/*
 * Lock the given spa_t for the purpose of adding or removing a vdev.
 * Grabs the global spa_namespace_lock plus the spa config lock for writing.
 * It returns the next transaction group for the spa_t.
 */
uint64_t
spa_vdev_enter(spa_t *spa)
{
	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	return (spa_vdev_config_enter(spa));
}

/*
 * Internal implementation for spa_vdev_enter().  Used when a vdev
 * operation requires multiple syncs (i.e. removing a device) while
 * keeping the spa_namespace_lock held.
 */
uint64_t
spa_vdev_config_enter(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);

	return (spa_last_synced_txg(spa) + 1);
}

/*
 * Used in combination with spa_vdev_config_enter() to allow the syncing
 * of multiple transactions without releasing the spa_namespace_lock.
 */
void
spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	int config_changed = B_FALSE;

	ASSERT(txg > spa_last_synced_txg(spa));

	spa->spa_pending_vdev = NULL;

	/*
	 * Reassess the DTLs.
	 */
	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);

	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
		config_changed = B_TRUE;
		spa->spa_config_generation++;
	}

	/*
	 * Verify the metaslab classes.
	 */
	ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
	ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);

	spa_config_exit(spa, SCL_ALL, spa);

	/*
	 * Panic the system if the specified tag requires it.  This
	 * is useful for ensuring that configurations are updated
	 * transactionally.
	 */
	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, tag, 0);

	/*
	 * Note: this txg_wait_synced() is important because it ensures
	 * that there won't be more than one config change per txg.
	 * This allows us to use the txg as the generation number.
	 */
	if (error == 0)
		txg_wait_synced(spa->spa_dsl_pool, txg);

	if (vd != NULL) {
		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
		vdev_free(vd);
		spa_config_exit(spa, SCL_ALL, spa);
	}

	/*
	 * If the config changed, update the config cache.
	 */
	if (config_changed)
		spa_config_sync(spa, B_FALSE, B_TRUE);
}

/*
 * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
 * locking of spa_vdev_enter(), we also want to make sure the transactions
 * have synced to disk, and then update the global configuration cache with
 * the new information.
 */
int
spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
{
	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}
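
/*
 * Editor's illustrative sketch: vdev add/remove operations bracket their
 * work with this pair, passing the txg returned by spa_vdev_enter() back
 * to spa_vdev_exit():
 *
 *	uint64_t txg = spa_vdev_enter(spa);
 *
 *	... modify the vdev tree ...
 *
 *	return (spa_vdev_exit(spa, vd, txg, error));
 */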

/*
 * Lock the given spa_t for the purpose of changing vdev state.
 */
void
spa_vdev_state_enter(spa_t *spa, int oplocks)
{
	int locks = SCL_STATE_ALL | oplocks;

	/*
	 * Root pools may need to read from the underlying devfs filesystem
	 * when opening up a vdev.  Unfortunately if we're holding the
	 * SCL_ZIO lock it will result in a deadlock when we try to issue
	 * the read from the root filesystem.  Instead we "prefetch"
	 * the associated vnodes that we need prior to opening the
	 * underlying devices and cache them so that we can prevent
	 * any I/O when we are doing the actual open.
	 */
	if (spa_is_root(spa)) {
		int low = locks & ~(SCL_ZIO - 1);
		int high = locks & ~low;

		spa_config_enter(spa, high, spa, RW_WRITER);
		vdev_hold(spa->spa_root_vdev);
		spa_config_enter(spa, low, spa, RW_WRITER);
	} else {
		spa_config_enter(spa, locks, spa, RW_WRITER);
	}
	spa->spa_vdev_locks = locks;
}

int
spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
{
	boolean_t config_changed = B_FALSE;

	if (vd != NULL || error == 0)
		vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
		    0, 0, B_FALSE);

	if (vd != NULL) {
		vdev_state_dirty(vd->vdev_top);
		config_changed = B_TRUE;
		spa->spa_config_generation++;
	}

	if (spa_is_root(spa))
		vdev_rele(spa->spa_root_vdev);

	ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
	spa_config_exit(spa, spa->spa_vdev_locks, spa);

	/*
	 * If anything changed, wait for it to sync.  This ensures that,
	 * from the system administrator's perspective, zpool(1M) commands
	 * are synchronous.  This is important for things like zpool offline:
	 * when the command completes, you expect no further I/O from ZFS.
	 */
	if (vd != NULL)
		txg_wait_synced(spa->spa_dsl_pool, 0);

	/*
	 * If the config changed, update the config cache.
	 */
	if (config_changed) {
		mutex_enter(&spa_namespace_lock);
		spa_config_sync(spa, B_FALSE, B_TRUE);
		mutex_exit(&spa_namespace_lock);
	}

	return (error);
}

/*
 * ==========================================================================
 * Miscellaneous functions
 * ==========================================================================
 */

void
spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
{
	if (!nvlist_exists(spa->spa_label_features, feature)) {
		fnvlist_add_boolean(spa->spa_label_features, feature);
		/*
		 * When we are creating the pool (tx_txg==TXG_INITIAL), we
		 * can't dirty the vdev config because lock SCL_CONFIG is not
		 * held.  Thankfully, in this case we don't need to dirty the
		 * config because it will be written out anyway when we finish
		 * creating the pool.
		 */
		if (tx->tx_txg != TXG_INITIAL)
			vdev_config_dirty(spa->spa_root_vdev);
	}
}

void
spa_deactivate_mos_feature(spa_t *spa, const char *feature)
{
	if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
		vdev_config_dirty(spa->spa_root_vdev);
}

/*
 * Rename a spa_t.
 */
int
spa_rename(const char *name, const char *newname)
{
	spa_t *spa;
	int err;

	/*
	 * Lookup the spa_t and grab the config lock for writing.  We need to
	 * actually open the pool so that we can sync out the necessary labels.
	 * It's OK to call spa_open() with the namespace lock held because we
	 * allow recursive calls for other reasons.
	 */
	mutex_enter(&spa_namespace_lock);
	if ((err = spa_open(name, &spa, FTAG)) != 0) {
		mutex_exit(&spa_namespace_lock);
		return (err);
	}

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	avl_remove(&spa_namespace_avl, spa);
	(void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
	avl_add(&spa_namespace_avl, spa);

	/*
	 * Sync all labels to disk with the new names by marking the root vdev
	 * dirty and waiting for it to sync.  It will pick up the new pool name
	 * during the sync.
	 */
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa->spa_dsl_pool, 0);

	/*
	 * Sync the updated config cache.
	 */
	spa_config_sync(spa, B_FALSE, B_TRUE);

	spa_close(spa, FTAG);

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Return the spa_t associated with given pool_guid, if it exists.  If
 * device_guid is non-zero, determine whether the pool exists *and* contains
 * a device with the specified device_guid.
 */
spa_t *
spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
{
	spa_t *spa;
	avl_tree_t *t = &spa_namespace_avl;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
		if (spa->spa_state == POOL_STATE_UNINITIALIZED)
			continue;
		if (spa->spa_root_vdev == NULL)
			continue;
		if (spa_guid(spa) == pool_guid) {
			if (device_guid == 0)
				break;

			if (vdev_lookup_by_guid(spa->spa_root_vdev,
			    device_guid) != NULL)
				break;

			/*
			 * Check any devices we may be in the process of
			 * adding.
			 */
			if (spa->spa_pending_vdev) {
				if (vdev_lookup_by_guid(spa->spa_pending_vdev,
				    device_guid) != NULL)
					break;
			}
		}
	}

	return (spa);
}

/*
 * Determine whether a pool with the given pool_guid exists.
 */
boolean_t
spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
{
	return (spa_by_guid(pool_guid, device_guid) != NULL);
}

char *
spa_strdup(const char *s)
{
	size_t len;
	char *new;

	len = strlen(s);
	new = kmem_alloc(len + 1, KM_SLEEP);
	bcopy(s, new, len);
	new[len] = '\0';

	return (new);
}

void
spa_strfree(char *s)
{
	kmem_free(s, strlen(s) + 1);
}

uint64_t
spa_get_random(uint64_t range)
{
	uint64_t r;

	ASSERT(range != 0);

	(void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));

	return (r % range);
}

uint64_t
spa_generate_guid(spa_t *spa)
{
	uint64_t guid = spa_get_random(-1ULL);

	if (spa != NULL) {
		while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
			guid = spa_get_random(-1ULL);
	} else {
		while (guid == 0 || spa_guid_exists(guid, 0))
			guid = spa_get_random(-1ULL);
	}

	return (guid);
}

void
snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
{
	char type[256];
	char *checksum = NULL;
	char *compress = NULL;

	if (bp != NULL) {
		if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
			dmu_object_byteswap_t bswap =
			    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
			(void) snprintf(type, sizeof (type), "bswap %s %s",
			    DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
			    "metadata" : "data",
			    dmu_ot_byteswap[bswap].ob_name);
		} else {
			(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
			    sizeof (type));
		}
		if (!BP_IS_EMBEDDED(bp)) {
			checksum =
			    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
		}
		compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
	}

	SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
	    compress);
}

void
spa_freeze(spa_t *spa)
{
	uint64_t freeze_txg = 0;

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	if (spa->spa_freeze_txg == UINT64_MAX) {
		freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
		spa->spa_freeze_txg = freeze_txg;
	}
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (freeze_txg != 0)
		txg_wait_synced(spa_get_dsl(spa), freeze_txg);
}

void
zfs_panic_recover(const char *fmt, ...)
{
	va_list adx;

	va_start(adx, fmt);
	vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
	va_end(adx);
}

/*
 * This is a stripped-down version of strtoull, suitable only for converting
 * lowercase hexadecimal numbers that don't overflow.
 */
uint64_t
zfs_strtonum(const char *str, char **nptr)
{
	uint64_t val = 0;
	char c;
	int digit;

	while ((c = *str) != '\0') {
		if (c >= '0' && c <= '9')
			digit = c - '0';
		else if (c >= 'a' && c <= 'f')
			digit = 10 + c - 'a';
		else
			break;

		val *= 16;
		val += digit;

		str++;
	}

	if (nptr)
		*nptr = (char *)str;

	return (val);
}
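
/*
 * For example (editor's note), zfs_strtonum("1a2f/foo", &end) returns
 * 0x1a2f and leaves 'end' pointing at the '/'; there is no "0x" prefix
 * handling, no uppercase support, and no overflow detection.
 */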

/*
 * ==========================================================================
 * Accessor functions
 * ==========================================================================
 */

boolean_t
spa_shutting_down(spa_t *spa)
{
	return (spa->spa_async_suspended);
}

dsl_pool_t *
spa_get_dsl(spa_t *spa)
{
	return (spa->spa_dsl_pool);
}

boolean_t
spa_is_initializing(spa_t *spa)
{
	return (spa->spa_is_initializing);
}

blkptr_t *
spa_get_rootblkptr(spa_t *spa)
{
	return (&spa->spa_ubsync.ub_rootbp);
}

void
spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
{
	spa->spa_uberblock.ub_rootbp = *bp;
}

void
spa_altroot(spa_t *spa, char *buf, size_t buflen)
{
	if (spa->spa_root == NULL)
		buf[0] = '\0';
	else
		(void) strncpy(buf, spa->spa_root, buflen);
}

int
spa_sync_pass(spa_t *spa)
{
	return (spa->spa_sync_pass);
}

char *
spa_name(spa_t *spa)
{
	return (spa->spa_name);
}

uint64_t
spa_guid(spa_t *spa)
{
	dsl_pool_t *dp = spa_get_dsl(spa);
	uint64_t guid;

	/*
	 * If we fail to parse the config during spa_load(), we can go through
	 * the error path (which posts an ereport) and end up here with no
	 * root vdev.  We stash the original pool guid in 'spa_config_guid'
	 * to handle this case.
	 */
	if (spa->spa_root_vdev == NULL)
		return (spa->spa_config_guid);

	guid = spa->spa_last_synced_guid != 0 ?
	    spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;

	/*
	 * Return the most recently synced out guid unless we're
	 * in syncing context.
	 */
	if (dp && dsl_pool_sync_context(dp))
		return (spa->spa_root_vdev->vdev_guid);
	else
		return (guid);
}

uint64_t
spa_load_guid(spa_t *spa)
{
	/*
	 * This is a GUID that exists solely as a reference for the
	 * purposes of the arc.  It is generated at load time, and
	 * is never written to persistent storage.
	 */
	return (spa->spa_load_guid);
}

uint64_t
spa_last_synced_txg(spa_t *spa)
{
	return (spa->spa_ubsync.ub_txg);
}

uint64_t
spa_first_txg(spa_t *spa)
{
	return (spa->spa_first_txg);
}

uint64_t
spa_syncing_txg(spa_t *spa)
{
	return (spa->spa_syncing_txg);
}

pool_state_t
spa_state(spa_t *spa)
{
	return (spa->spa_state);
}

spa_load_state_t
spa_load_state(spa_t *spa)
{
	return (spa->spa_load_state);
}

uint64_t
spa_freeze_txg(spa_t *spa)
{
	return (spa->spa_freeze_txg);
}

/* ARGSUSED */
uint64_t
spa_get_asize(spa_t *spa, uint64_t lsize)
{
	return (lsize * spa_asize_inflation);
}

/*
 * Return the amount of slop space in bytes.  It is 1/32 of the pool (3.2%),
 * or at least 128MB, unless that would cause it to be more than half the
 * pool size.
 *
 * See the comment above spa_slop_shift for details.
 */
uint64_t
spa_get_slop_space(spa_t *spa)
{
	uint64_t space = spa_get_dspace(spa);
	return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
}

uint64_t
spa_get_dspace(spa_t *spa)
{
	return (spa->spa_dspace);
}

void
spa_update_dspace(spa_t *spa)
{
	spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
	    ddt_get_dedup_dspace(spa);
}

/*
 * Return the failure mode that has been set for this pool.  The default
 * behavior will be to block all I/Os when a complete failure occurs.
 */
uint8_t
spa_get_failmode(spa_t *spa)
{
	return (spa->spa_failmode);
}

boolean_t
spa_suspended(spa_t *spa)
{
	return (spa->spa_suspended);
}

uint64_t
spa_version(spa_t *spa)
{
	return (spa->spa_ubsync.ub_version);
}

boolean_t
spa_deflate(spa_t *spa)
{
	return (spa->spa_deflate);
}

metaslab_class_t *
spa_normal_class(spa_t *spa)
{
	return (spa->spa_normal_class);
}

metaslab_class_t *
spa_log_class(spa_t *spa)
{
	return (spa->spa_log_class);
}

void
spa_evicting_os_register(spa_t *spa, objset_t *os)
{
	mutex_enter(&spa->spa_evicting_os_lock);
	list_insert_head(&spa->spa_evicting_os_list, os);
	mutex_exit(&spa->spa_evicting_os_lock);
}

void
spa_evicting_os_deregister(spa_t *spa, objset_t *os)
{
	mutex_enter(&spa->spa_evicting_os_lock);
	list_remove(&spa->spa_evicting_os_list, os);
	cv_broadcast(&spa->spa_evicting_os_cv);
	mutex_exit(&spa->spa_evicting_os_lock);
}

void
spa_evicting_os_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_evicting_os_lock);
	while (!list_is_empty(&spa->spa_evicting_os_list))
		cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
	mutex_exit(&spa->spa_evicting_os_lock);

	dmu_buf_user_evict_wait();
}

int
spa_max_replication(spa_t *spa)
{
	/*
	 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
	 * handle BPs with more than one DVA allocated.  Set our max
	 * replication level accordingly.
	 */
int
spa_max_replication(spa_t *spa)
{
	/*
	 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
	 * handle BPs with more than one DVA allocated.  Set our max
	 * replication level accordingly.
	 */
	if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
		return (1);
	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
}

int
spa_prev_software_version(spa_t *spa)
{
	return (spa->spa_prev_software_version);
}

uint64_t
spa_deadman_synctime(spa_t *spa)
{
	return (spa->spa_deadman_synctime);
}

uint64_t
dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
{
	uint64_t asize = DVA_GET_ASIZE(dva);
	uint64_t dsize = asize;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	if (asize != 0 && spa->spa_deflate) {
		uint64_t vdev = DVA_GET_VDEV(dva);
		vdev_t *vd = vdev_lookup_top(spa, vdev);
		if (vd == NULL) {
			panic(
			    "dva_get_dsize_sync(): bad DVA %llu:%llu",
			    (u_longlong_t)vdev, (u_longlong_t)asize);
		}
		dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
	}

	return (dsize);
}

uint64_t
bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
{
	uint64_t dsize = 0;

	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);

	return (dsize);
}

uint64_t
bp_get_dsize(spa_t *spa, const blkptr_t *bp)
{
	uint64_t dsize = 0;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);

	spa_config_exit(spa, SCL_VDEV, FTAG);

	return (dsize);
}
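/*
 * Illustrative arithmetic (not part of the original source): on a
 * plain single-disk top-level vdev, vdev_deflate_ratio works out to
 * SPA_MINBLOCKSIZE, so
 *
 *	dsize = (asize >> SPA_MINBLOCKSHIFT) * SPA_MINBLOCKSIZE == asize
 *
 * and deflation is a no-op.  On a raidz vdev the ratio is smaller,
 * so dsize excludes the parity overhead embedded in asize, which is
 * what makes deflated space accounting comparable across
 * differently-shaped vdevs.
 */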
/*
 * ==========================================================================
 * Initialization and Termination
 * ==========================================================================
 */

static int
spa_name_compare(const void *a1, const void *a2)
{
	const spa_t *s1 = a1;
	const spa_t *s2 = a2;
	int s;

	s = strcmp(s1->spa_name, s2->spa_name);
	if (s > 0)
		return (1);
	if (s < 0)
		return (-1);
	return (0);
}

int
spa_busy(void)
{
	return (spa_active_count);
}

void
spa_boot_init()
{
	spa_config_load();
}

#ifdef _KERNEL
EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
#endif

void
spa_init(int mode)
{
	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);

	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
	    offsetof(spa_t, spa_avl));

	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
	    offsetof(spa_aux_t, aux_avl));

	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
	    offsetof(spa_aux_t, aux_avl));

	spa_mode_global = mode;

#ifdef illumos
#ifdef _KERNEL
	spa_arch_init();
#else
	if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
		arc_procfd = open("/proc/self/ctl", O_WRONLY);
		if (arc_procfd == -1) {
			perror("could not enable watchpoints: "
			    "opening /proc/self/ctl failed: ");
		} else {
			arc_watch = B_TRUE;
		}
	}
#endif
#endif /* illumos */
	refcount_sysinit();
	unique_init();
	range_tree_init();
	zio_init();
	lz4_init();
	dmu_init();
	zil_init();
	vdev_cache_stat_init();
	zfs_prop_init();
	zpool_prop_init();
	zpool_feature_init();
	spa_config_load();
	l2arc_start();
#ifndef illumos
#ifdef _KERNEL
	zfs_deadman_init();
#endif
#endif	/* !illumos */
}

void
spa_fini(void)
{
	l2arc_stop();

	spa_evict_all();

	vdev_cache_stat_fini();
	zil_fini();
	dmu_fini();
	lz4_fini();
	zio_fini();
	range_tree_fini();
	unique_fini();
	refcount_fini();

	avl_destroy(&spa_namespace_avl);
	avl_destroy(&spa_spare_avl);
	avl_destroy(&spa_l2cache_avl);

	cv_destroy(&spa_namespace_cv);
	mutex_destroy(&spa_namespace_lock);
	mutex_destroy(&spa_spare_lock);
	mutex_destroy(&spa_l2cache_lock);
}

/*
 * Return whether this pool has slogs.  No locking needed.
 * It is not a problem if the wrong answer is returned, as it is only used
 * for performance, not correctness.
 */
boolean_t
spa_has_slogs(spa_t *spa)
{
	return (spa->spa_log_class->mc_rotor != NULL);
}

spa_log_state_t
spa_get_log_state(spa_t *spa)
{
	return (spa->spa_log_state);
}

void
spa_set_log_state(spa_t *spa, spa_log_state_t state)
{
	spa->spa_log_state = state;
}

boolean_t
spa_is_root(spa_t *spa)
{
	return (spa->spa_is_root);
}

boolean_t
spa_writeable(spa_t *spa)
{
	return (!!(spa->spa_mode & FWRITE));
}

/*
 * Returns true if there is a pending sync task in the current syncing txg,
 * the current quiescing txg, or the current open txg.
 */
boolean_t
spa_has_pending_synctask(spa_t *spa)
{
	return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks));
}

int
spa_mode(spa_t *spa)
{
	return (spa->spa_mode);
}

uint64_t
spa_bootfs(spa_t *spa)
{
	return (spa->spa_bootfs);
}

uint64_t
spa_delegation(spa_t *spa)
{
	return (spa->spa_delegation);
}

objset_t *
spa_meta_objset(spa_t *spa)
{
	return (spa->spa_meta_objset);
}

enum zio_checksum
spa_dedup_checksum(spa_t *spa)
{
	return (spa->spa_dedup_checksum);
}

/*
 * Reset pool scan stats per scan pass (or reboot).
 */
void
spa_scan_stat_init(spa_t *spa)
{
	/* data not stored on disk */
	spa->spa_scan_pass_start = gethrestime_sec();
	spa->spa_scan_pass_exam = 0;
	vdev_scan_stat_init(spa->spa_root_vdev);
}
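/*
 * Illustrative consumer sketch (not part of the original source):
 * userland status reporting is expected to fetch these counters via
 * spa_scan_get_stats() below, roughly:
 *
 *	pool_scan_stat_t ps;
 *	if (spa_scan_get_stats(spa, &ps) == 0) {
 *		... pss_examined vs. pss_to_examine gives overall
 *		    scrub/resilver progress, while pss_pass_start and
 *		    pss_pass_exam cover only the pass since the last
 *		    spa_scan_stat_init() ...
 *	}
 *
 * ENOENT from spa_scan_get_stats() means no scan has ever been run
 * on this pool.
 */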
/*
 * Get scan stats for zpool status reports.
 */
int
spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
{
	dsl_scan_t *scn = spa->spa_dsl_pool ?
	    spa->spa_dsl_pool->dp_scan : NULL;

	if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
		return (SET_ERROR(ENOENT));
	bzero(ps, sizeof (pool_scan_stat_t));

	/* data stored on disk */
	ps->pss_func = scn->scn_phys.scn_func;
	ps->pss_start_time = scn->scn_phys.scn_start_time;
	ps->pss_end_time = scn->scn_phys.scn_end_time;
	ps->pss_to_examine = scn->scn_phys.scn_to_examine;
	ps->pss_examined = scn->scn_phys.scn_examined;
	ps->pss_to_process = scn->scn_phys.scn_to_process;
	ps->pss_processed = scn->scn_phys.scn_processed;
	ps->pss_errors = scn->scn_phys.scn_errors;
	ps->pss_state = scn->scn_phys.scn_state;

	/* data not stored on disk */
	ps->pss_pass_start = spa->spa_scan_pass_start;
	ps->pss_pass_exam = spa->spa_scan_pass_exam;

	return (0);
}

boolean_t
spa_debug_enabled(spa_t *spa)
{
	return (spa->spa_debug);
}

int
spa_maxblocksize(spa_t *spa)
{
	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
		return (SPA_MAXBLOCKSIZE);
	else
		return (SPA_OLD_MAXBLOCKSIZE);
}
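/*
 * Illustrative clamp (not part of the original source; "requested" is
 * a hypothetical value): callers sizing new blocks are expected to
 * bound their requests by spa_maxblocksize(), e.g.
 *
 *	uint64_t blksz = MIN(requested, spa_maxblocksize(spa));
 *
 * so that a pool without SPA_FEATURE_LARGE_BLOCKS enabled never sees
 * a block larger than SPA_OLD_MAXBLOCKSIZE (128K).
 */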