spa_misc.c revision 288549
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
24 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
25 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27 */
28
29#include <sys/zfs_context.h>
30#include <sys/spa_impl.h>
31#include <sys/spa_boot.h>
32#include <sys/zio.h>
33#include <sys/zio_checksum.h>
34#include <sys/zio_compress.h>
35#include <sys/dmu.h>
36#include <sys/dmu_tx.h>
37#include <sys/zap.h>
38#include <sys/zil.h>
39#include <sys/vdev_impl.h>
40#include <sys/metaslab.h>
41#include <sys/uberblock_impl.h>
42#include <sys/txg.h>
43#include <sys/avl.h>
44#include <sys/unique.h>
45#include <sys/dsl_pool.h>
46#include <sys/dsl_dir.h>
47#include <sys/dsl_prop.h>
48#include <sys/dsl_scan.h>
49#include <sys/fs/zfs.h>
50#include <sys/metaslab_impl.h>
51#include <sys/arc.h>
52#include <sys/ddt.h>
53#include "zfs_prop.h"
54#include "zfeature_common.h"
55
56/*
57 * SPA locking
58 *
59 * There are four basic locks for managing spa_t structures:
60 *
61 * spa_namespace_lock (global mutex)
62 *
63 *	This lock must be acquired to do any of the following:
64 *
65 *		- Lookup a spa_t by name
66 *		- Add or remove a spa_t from the namespace
67 *		- Increase spa_refcount from zero
68 *		- Check if spa_refcount is zero
69 *		- Rename a spa_t
70 *		- Add/remove/attach/detach devices
71 *		- Held for the duration of create/destroy/import/export
72 *
73 *	It does not need to handle recursion.  A create or destroy may
74 *	reference objects (files or zvols) in other pools, but by
75 *	definition they must have an existing reference, and will never need
76 *	to look up a spa_t by name.
77 *
78 * spa_refcount (per-spa refcount_t protected by mutex)
79 *
80 *	This reference count keeps track of any active users of the spa_t.  The
81 *	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
82 *	the refcount is never really 'zero' - opening a pool implicitly keeps
83 *	some references in the DMU.  Internally we check against spa_minref, but
84 *	present the image of a zero/non-zero value to consumers.
85 *
86 * spa_config_lock[] (per-spa array of rwlocks)
87 *
88 *	This protects the spa_t from config changes, and must be held in
89 *	the following circumstances:
90 *
91 *		- RW_READER to perform I/O to the spa
92 *		- RW_WRITER to change the vdev config
93 *
94 * The locking order is fairly straightforward:
95 *
96 *		spa_namespace_lock	->	spa_refcount
97 *
98 *	The namespace lock must be acquired to increase the refcount from 0
99 *	or to check if it is zero.
100 *
101 *		spa_refcount		->	spa_config_lock[]
102 *
103 *	There must be at least one valid reference on the spa_t to acquire
104 *	the config lock.
105 *
106 *		spa_namespace_lock	->	spa_config_lock[]
107 *
108 *	The namespace lock must always be taken before the config lock.
109 *
110 *
111 * The spa_namespace_lock can be acquired directly and is globally visible.
112 *
113 * The namespace is manipulated using the following functions, all of which
114 * require the spa_namespace_lock to be held.
115 *
116 *	spa_lookup()		Lookup a spa_t by name.
117 *
118 *	spa_add()		Create a new spa_t in the namespace.
119 *
120 *	spa_remove()		Remove a spa_t from the namespace.  This also
121 *				frees up any memory associated with the spa_t.
122 *
123 *	spa_next()		Returns the next spa_t in the system, or the
124 *				first if NULL is passed.
125 *
126 *	spa_evict_all()		Shutdown and remove all spa_t structures in
127 *				the system.
128 *
129 *	spa_guid_exists()	Determine whether a pool/device guid exists.
130 *
131 * The spa_refcount is manipulated using the following functions:
132 *
133 *	spa_open_ref()		Adds a reference to the given spa_t.  Must be
134 *				called with spa_namespace_lock held if the
135 *				refcount is currently zero.
136 *
137 *	spa_close()		Remove a reference from the spa_t.  This will
138 *				not free the spa_t or remove it from the
139 *				namespace.  No locking is required.
140 *
141 *	spa_refcount_zero()	Returns true if the refcount is currently
142 *				zero.  Must be called with spa_namespace_lock
143 *				held.
144 *
145 * The spa_config_lock[] is an array of rwlocks, ordered as follows:
146 * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
147 * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
148 *
149 * To read the configuration, it suffices to hold one of these locks as reader.
150 * To modify the configuration, you must hold all locks as writer.  To modify
151 * vdev state without altering the vdev tree's topology (e.g. online/offline),
152 * you must hold SCL_STATE and SCL_ZIO as writer.
153 *
154 * We use these distinct config locks to avoid recursive lock entry.
155 * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
156 * block allocations (SCL_ALLOC), which may require reading space maps
157 * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
158 *
159 * The spa config locks cannot be normal rwlocks because we need the
160 * ability to hand off ownership.  For example, SCL_ZIO is acquired
161 * by the issuing thread and later released by an interrupt thread.
162 * They do, however, obey the usual write-wanted semantics to prevent
163 * writer (i.e. system administrator) starvation.
164 *
165 * The lock acquisition rules are as follows:
166 *
167 * SCL_CONFIG
168 *	Protects changes to the vdev tree topology, such as vdev
169 *	add/remove/attach/detach.  Protects the dirty config list
170 *	(spa_config_dirty_list) and the set of spares and l2arc devices.
171 *
172 * SCL_STATE
173 *	Protects changes to pool state and vdev state, such as vdev
174 *	online/offline/fault/degrade/clear.  Protects the dirty state list
175 *	(spa_state_dirty_list) and global pool state (spa_state).
176 *
177 * SCL_ALLOC
178 *	Protects changes to metaslab groups and classes.
179 *	Held as reader by metaslab_alloc() and metaslab_claim().
180 *
181 * SCL_ZIO
182 *	Held by bp-level zios (those which have no io_vd upon entry)
183 *	to prevent changes to the vdev tree.  The bp-level zio implicitly
184 *	protects all of its vdev child zios, which do not hold SCL_ZIO.
185 *
186 * SCL_FREE
187 *	Protects changes to metaslab groups and classes.
188 *	Held as reader by metaslab_free().  SCL_FREE is distinct from
189 *	SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
190 *	blocks in zio_done() while another i/o that holds either
191 *	SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
192 *
193 * SCL_VDEV
194 *	Held as reader to prevent changes to the vdev tree during trivial
195 *	inquiries such as bp_get_dsize().  SCL_VDEV is distinct from the
196 *	other locks, and lower than all of them, to ensure that it's safe
197 *	to acquire regardless of caller context.
198 *
199 * In addition, the following rules apply:
200 *
201 * (a)	spa_props_lock protects pool properties, spa_config and spa_config_list.
202 *	The lock ordering is SCL_CONFIG > spa_props_lock.
203 *
204 * (b)	I/O operations on leaf vdevs.  For any zio operation that takes
205 *	an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
206 *	or zio_write_phys() -- the caller must ensure that the config cannot
207 *	change in the interim, and that the vdev cannot be reopened.
208 *	SCL_STATE as reader suffices for both.
209 *
210 * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
211 *
212 *	spa_vdev_enter()	Acquire the namespace lock and the config lock
213 *				for writing.
214 *
215 *	spa_vdev_exit()		Release the config lock, wait for all I/O
216 *				to complete, sync the updated configs to the
217 *				cache, and release the namespace lock.
218 *
219 * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
220 * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
221 *	locking is always based on spa_namespace_lock and spa_config_lock[].
222 *
223 * spa_rename() is also implemented within this file since it requires
224 * manipulation of the namespace.
225 */
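/*
 * For illustration, a sketch of how the interfaces described above are
 * typically used, assuming a spa_t *spa in hand.  A trivial read-side
 * inquiry takes a single lock as reader, while a vdev configuration change
 * goes through the spa_vdev_enter()/spa_vdev_exit() wrappers:
 *
 *	// Read side: SCL_VDEV as reader suffices for trivial inquiries.
 *	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 *	// ... walk the vdev tree, e.g. to compute a deflated block size ...
 *	spa_config_exit(spa, SCL_VDEV, FTAG);
 *
 *	// Write side: add/remove/attach/detach a vdev.
 *	uint64_t txg = spa_vdev_enter(spa);	// namespace lock + SCL_ALL writer
 *	// ... modify the vdev tree; 0 below stands for the resulting error ...
 *	(void) spa_vdev_exit(spa, NULL, txg, 0);	// wait for txg, update cache
 */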
226
227static avl_tree_t spa_namespace_avl;
228kmutex_t spa_namespace_lock;
229static kcondvar_t spa_namespace_cv;
230static int spa_active_count;
231int spa_max_replication_override = SPA_DVAS_PER_BP;
232
233static kmutex_t spa_spare_lock;
234static avl_tree_t spa_spare_avl;
235static kmutex_t spa_l2cache_lock;
236static avl_tree_t spa_l2cache_avl;
237
238kmem_cache_t *spa_buffer_pool;
239int spa_mode_global;
240
241#ifdef ZFS_DEBUG
242/* Everything except dprintf and spa is on by default in debug builds */
243int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA);
244#else
245int zfs_flags = 0;
246#endif
247SYSCTL_DECL(_debug);
248TUNABLE_INT("debug.zfs_flags", &zfs_flags);
249SYSCTL_INT(_debug, OID_AUTO, zfs_flags, CTLFLAG_RWTUN, &zfs_flags, 0,
250    "ZFS debug flags.");
251
252/*
253 * zfs_recover can be set to nonzero to attempt to recover from
254 * otherwise-fatal errors, typically caused by on-disk corruption.  When
255 * set, calls to zfs_panic_recover() will turn into warning messages.
256 * This should only be used as a last resort, as it typically results
257 * in leaked space, or worse.
258 */
259boolean_t zfs_recover = B_FALSE;
260SYSCTL_DECL(_vfs_zfs);
261TUNABLE_INT("vfs.zfs.recover", &zfs_recover);
262SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0,
263    "Try to recover from otherwise-fatal errors.");
264
265/*
266 * If destroy encounters an EIO while reading metadata (e.g. indirect
267 * blocks), space referenced by the missing metadata can not be freed.
268 * Normally this causes the background destroy to become "stalled", as
269 * it is unable to make forward progress.  While in this stalled state,
270 * all remaining space to free from the error-encountering filesystem is
271 * "temporarily leaked".  Set this flag to cause it to ignore the EIO,
272 * permanently leak the space from indirect blocks that can not be read,
273 * and continue to free everything else that it can.
274 *
275 * The default, "stalling" behavior is useful if the storage partially
276 * fails (i.e. some but not all i/os fail), and then later recovers.  In
277 * this case, we will be able to continue pool operations while it is
278 * partially failed, and when it recovers, we can continue to free the
279 * space, with no leaks.  However, note that this case is actually
280 * fairly rare.
281 *
282 * Typically pools either (a) fail completely (but perhaps temporarily,
283 * e.g. a top-level vdev going offline), or (b) have localized,
284 * permanent errors (e.g. disk returns the wrong data due to bit flip or
285 * firmware bug).  In case (a), this setting does not matter because the
286 * pool will be suspended and the sync thread will not be able to make
287 * forward progress regardless.  In case (b), because the error is
288 * permanent, the best we can do is leak the minimum amount of space,
289 * which is what setting this flag will do.  Therefore, it is reasonable
290 * for this flag to normally be set, but we chose the more conservative
291 * approach of not setting it, so that there is no possibility of
292 * leaking space in the "partial temporary" failure case.
293 */
294boolean_t zfs_free_leak_on_eio = B_FALSE;
295
296/*
297 * Expiration time in milliseconds. This value has two meanings. First, it is
298 * used to determine when the spa_deadman() logic should fire. By default the
299 * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
300 * Second, the value determines whether an I/O is considered "hung". Any I/O
301 * that has not completed within zfs_deadman_synctime_ms is considered "hung",
302 * resulting in a system panic.
303 */
304uint64_t zfs_deadman_synctime_ms = 1000000ULL;
305TUNABLE_QUAD("vfs.zfs.deadman_synctime_ms", &zfs_deadman_synctime_ms);
306SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN,
307    &zfs_deadman_synctime_ms, 0,
308    "Stalled ZFS I/O expiration time in milliseconds");
309
310/*
311 * Check time in milliseconds. This defines the frequency at which we check
312 * for hung I/O.
313 */
314uint64_t zfs_deadman_checktime_ms = 5000ULL;
315TUNABLE_QUAD("vfs.zfs.deadman_checktime_ms", &zfs_deadman_checktime_ms);
316SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN,
317    &zfs_deadman_checktime_ms, 0,
318    "Period of checks for stalled ZFS I/O in milliseconds");
319
320/*
321 * The default value of -1 for zfs_deadman_enabled is resolved in
322 * zfs_deadman_init().
323 */
324int zfs_deadman_enabled = -1;
325TUNABLE_INT("vfs.zfs.deadman_enabled", &zfs_deadman_enabled);
326SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN,
327    &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");
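/*
 * With the defaults above, the deadman declares spa_sync() "hung" after
 * zfs_deadman_synctime_ms (1,000,000 ms == 1000 seconds) and, once it has
 * fired, re-inspects the vdev queues every zfs_deadman_checktime_ms
 * (5000 ms == 5 seconds).
 */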
328
329/*
330 * The worst case is single-sector max-parity RAID-Z blocks, in which
331 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
332 * times the size; so just assume that.  Add to this the fact that
333 * we can have up to 3 DVAs per bp, and one more factor of 2 because
334 * the block may be dittoed with up to 3 DVAs by ddt_sync().  All together,
335 * the worst case is:
336 *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
337 */
338int spa_asize_inflation = 24;
339TUNABLE_INT("vfs.zfs.spa_asize_inflation", &spa_asize_inflation);
340SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN,
341    &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes");
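/*
 * As a worked example of the factor above, assuming VDEV_RAIDZ_MAXPARITY == 3
 * and SPA_DVAS_PER_BP == 3 (their usual values):
 *
 *	(VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == (3 + 1) * 3 * 2 == 24
 *
 * spa_get_asize() (defined later in this file) simply multiplies a logical
 * size by this factor, so the worst-case allocation charged for a 4KB write
 * is 4KB * 24 == 96KB.
 */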
342
343#ifndef illumos
344#ifdef _KERNEL
345static void
346zfs_deadman_init()
347{
348	/*
349	 * If we are not running on i386 or amd64, or if we are running in a
350	 * virtual machine, disable the ZFS deadman thread by default.
351	 */
352	if (zfs_deadman_enabled == -1) {
353#if defined(__amd64__) || defined(__i386__)
354		zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0;
355#else
356		zfs_deadman_enabled = 0;
357#endif
358	}
359}
360#endif	/* _KERNEL */
361#endif	/* !illumos */
362
363/*
364 * Normally, we don't allow the last 3.1% (1/(2^spa_slop_shift)) of space in
365 * the pool to be consumed.  This ensures that we don't run the pool
366 * completely out of space, due to unaccounted changes (e.g. to the MOS).
367 * It also limits the worst-case time to allocate space.  If we have
368 * less than this amount of free space, most ZPL operations (e.g. write,
369 * create) will return ENOSPC.
370 *
371 * Certain operations (e.g. file removal, most administrative actions) can
372 * use half the slop space.  They will only return ENOSPC if less than half
373 * the slop space is free.  Typically, once the pool has less than the slop
374 * space free, the user will use these operations to free up space in the pool.
375 * These are the operations that call dsl_pool_adjustedsize() with the netfree
376 * argument set to TRUE.
377 *
378 * A very restricted set of operations are always permitted, regardless of
379 * the amount of free space.  These are the operations that call
380 * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy".  If these
381 * operations result in a net increase in the amount of space used,
382 * it is possible to run the pool completely out of space, causing it to
383 * be permanently read-only.
384 *
385 * See also the comments in zfs_space_check_t.
386 */
387int spa_slop_shift = 5;
388SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN,
389    &spa_slop_shift, 0,
390    "Shift value of reserved space (1/(2^spa_slop_shift)).");
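/*
 * As a worked example: with the default spa_slop_shift of 5,
 * spa_get_slop_space() (defined later in this file) reserves dspace / 32,
 * i.e. 32GB on a 1TB pool, but never less than the 32MB floor
 * (SPA_MINDEVSIZE / 2), which is what any pool smaller than 1GB gets.
 */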
391
392/*
393 * ==========================================================================
394 * SPA config locking
395 * ==========================================================================
396 */
397static void
398spa_config_lock_init(spa_t *spa)
399{
400	for (int i = 0; i < SCL_LOCKS; i++) {
401		spa_config_lock_t *scl = &spa->spa_config_lock[i];
402		mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
403		cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
404		refcount_create_untracked(&scl->scl_count);
405		scl->scl_writer = NULL;
406		scl->scl_write_wanted = 0;
407	}
408}
409
410static void
411spa_config_lock_destroy(spa_t *spa)
412{
413	for (int i = 0; i < SCL_LOCKS; i++) {
414		spa_config_lock_t *scl = &spa->spa_config_lock[i];
415		mutex_destroy(&scl->scl_lock);
416		cv_destroy(&scl->scl_cv);
417		refcount_destroy(&scl->scl_count);
418		ASSERT(scl->scl_writer == NULL);
419		ASSERT(scl->scl_write_wanted == 0);
420	}
421}
422
423int
424spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
425{
426	for (int i = 0; i < SCL_LOCKS; i++) {
427		spa_config_lock_t *scl = &spa->spa_config_lock[i];
428		if (!(locks & (1 << i)))
429			continue;
430		mutex_enter(&scl->scl_lock);
431		if (rw == RW_READER) {
432			if (scl->scl_writer || scl->scl_write_wanted) {
433				mutex_exit(&scl->scl_lock);
434				spa_config_exit(spa, locks ^ (1 << i), tag);
435				return (0);
436			}
437		} else {
438			ASSERT(scl->scl_writer != curthread);
439			if (!refcount_is_zero(&scl->scl_count)) {
440				mutex_exit(&scl->scl_lock);
441				spa_config_exit(spa, locks ^ (1 << i), tag);
442				return (0);
443			}
444			scl->scl_writer = curthread;
445		}
446		(void) refcount_add(&scl->scl_count, tag);
447		mutex_exit(&scl->scl_lock);
448	}
449	return (1);
450}
451
452void
453spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
454{
455	int wlocks_held = 0;
456
457	ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
458
459	for (int i = 0; i < SCL_LOCKS; i++) {
460		spa_config_lock_t *scl = &spa->spa_config_lock[i];
461		if (scl->scl_writer == curthread)
462			wlocks_held |= (1 << i);
463		if (!(locks & (1 << i)))
464			continue;
465		mutex_enter(&scl->scl_lock);
466		if (rw == RW_READER) {
467			while (scl->scl_writer || scl->scl_write_wanted) {
468				cv_wait(&scl->scl_cv, &scl->scl_lock);
469			}
470		} else {
471			ASSERT(scl->scl_writer != curthread);
472			while (!refcount_is_zero(&scl->scl_count)) {
473				scl->scl_write_wanted++;
474				cv_wait(&scl->scl_cv, &scl->scl_lock);
475				scl->scl_write_wanted--;
476			}
477			scl->scl_writer = curthread;
478		}
479		(void) refcount_add(&scl->scl_count, tag);
480		mutex_exit(&scl->scl_lock);
481	}
482	ASSERT(wlocks_held <= locks);
483}
484
485void
486spa_config_exit(spa_t *spa, int locks, void *tag)
487{
488	for (int i = SCL_LOCKS - 1; i >= 0; i--) {
489		spa_config_lock_t *scl = &spa->spa_config_lock[i];
490		if (!(locks & (1 << i)))
491			continue;
492		mutex_enter(&scl->scl_lock);
493		ASSERT(!refcount_is_zero(&scl->scl_count));
494		if (refcount_remove(&scl->scl_count, tag) == 0) {
495			ASSERT(scl->scl_writer == NULL ||
496			    scl->scl_writer == curthread);
497			scl->scl_writer = NULL;	/* OK in either case */
498			cv_broadcast(&scl->scl_cv);
499		}
500		mutex_exit(&scl->scl_lock);
501	}
502}
503
504int
505spa_config_held(spa_t *spa, int locks, krw_t rw)
506{
507	int locks_held = 0;
508
509	for (int i = 0; i < SCL_LOCKS; i++) {
510		spa_config_lock_t *scl = &spa->spa_config_lock[i];
511		if (!(locks & (1 << i)))
512			continue;
513		if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) ||
514		    (rw == RW_WRITER && scl->scl_writer == curthread))
515			locks_held |= 1 << i;
516	}
517
518	return (locks_held);
519}
520
521/*
522 * ==========================================================================
523 * SPA namespace functions
524 * ==========================================================================
525 */
526
527/*
528 * Lookup the named spa_t in the AVL tree.  The spa_namespace_lock must be held.
529 * Returns NULL if no matching spa_t is found.
530 */
531spa_t *
532spa_lookup(const char *name)
533{
534	static spa_t search;	/* spa_t is large; don't allocate on stack */
535	spa_t *spa;
536	avl_index_t where;
537	char *cp;
538
539	ASSERT(MUTEX_HELD(&spa_namespace_lock));
540
541	(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
542
543	/*
544	 * If it's a full dataset name, figure out the pool name and
545	 * just use that.
546	 */
547	cp = strpbrk(search.spa_name, "/@#");
548	if (cp != NULL)
549		*cp = '\0';
550
551	spa = avl_find(&spa_namespace_avl, &search, &where);
552
553	return (spa);
554}
555
556/*
557 * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
558 * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
559 * looking for potentially hung I/Os.
560 */
561void
562spa_deadman(void *arg)
563{
564	spa_t *spa = arg;
565
566	/*
567	 * Disable the deadman timer if the pool is suspended.
568	 */
569	if (spa_suspended(spa)) {
570#ifdef illumos
571		VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
572#else
573		/* Nothing; just don't schedule any future callouts. */
574#endif
575		return;
576	}
577
578	zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
579	    (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
580	    ++spa->spa_deadman_calls);
581	if (zfs_deadman_enabled)
582		vdev_deadman(spa->spa_root_vdev);
583}
584
585/*
586 * Create an uninitialized spa_t with the given name.  Requires
587 * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
588 * exist by calling spa_lookup() first.
589 */
590spa_t *
591spa_add(const char *name, nvlist_t *config, const char *altroot)
592{
593	spa_t *spa;
594	spa_config_dirent_t *dp;
595#ifdef illumos
596	cyc_handler_t hdlr;
597	cyc_time_t when;
598#endif
599
600	ASSERT(MUTEX_HELD(&spa_namespace_lock));
601
602	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
603
604	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
605	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
606	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
607	mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
608	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
609	mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
610	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
611	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
612	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
613	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
614
615	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
616	cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
617	cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
618	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
619	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
620
621	for (int t = 0; t < TXG_SIZE; t++)
622		bplist_create(&spa->spa_free_bplist[t]);
623
624	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
625	spa->spa_state = POOL_STATE_UNINITIALIZED;
626	spa->spa_freeze_txg = UINT64_MAX;
627	spa->spa_final_txg = UINT64_MAX;
628	spa->spa_load_max_txg = UINT64_MAX;
629	spa->spa_proc = &p0;
630	spa->spa_proc_state = SPA_PROC_NONE;
631
632#ifdef illumos
633	hdlr.cyh_func = spa_deadman;
634	hdlr.cyh_arg = spa;
635	hdlr.cyh_level = CY_LOW_LEVEL;
636#endif
637
638	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
639
640#ifdef illumos
641	/*
642	 * This determines how often we need to check for hung I/Os after
643	 * the cyclic has already fired. Since checking for hung I/Os is
644	 * an expensive operation, we don't want to check too frequently.
645	 * Instead, wait for 5 seconds before checking again.
646	 */
647	when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
648	when.cyt_when = CY_INFINITY;
649	mutex_enter(&cpu_lock);
650	spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
651	mutex_exit(&cpu_lock);
652#else	/* !illumos */
653#ifdef _KERNEL
654	callout_init(&spa->spa_deadman_cycid, CALLOUT_MPSAFE);
655#endif
656#endif
657	refcount_create(&spa->spa_refcount);
658	spa_config_lock_init(spa);
659
660	avl_add(&spa_namespace_avl, spa);
661
662	/*
663	 * Set the alternate root, if there is one.
664	 */
665	if (altroot) {
666		spa->spa_root = spa_strdup(altroot);
667		spa_active_count++;
668	}
669
670	/*
671	 * Every pool starts with the default cachefile
672	 */
673	list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
674	    offsetof(spa_config_dirent_t, scd_link));
675
676	dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
677	dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
678	list_insert_head(&spa->spa_config_list, dp);
679
680	VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
681	    KM_SLEEP) == 0);
682
683	if (config != NULL) {
684		nvlist_t *features;
685
686		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
687		    &features) == 0) {
688			VERIFY(nvlist_dup(features, &spa->spa_label_features,
689			    0) == 0);
690		}
691
692		VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
693	}
694
695	if (spa->spa_label_features == NULL) {
696		VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
697		    KM_SLEEP) == 0);
698	}
699
700	spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);
701
702	spa->spa_min_ashift = INT_MAX;
703	spa->spa_max_ashift = 0;
704
705	/*
706	 * As a pool is being created, treat all features as disabled by
707	 * setting SPA_FEATURE_DISABLED for all entries in the feature
708	 * refcount cache.
709	 */
710	for (int i = 0; i < SPA_FEATURES; i++) {
711		spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
712	}
713
714	return (spa);
715}
716
717/*
718 * Removes a spa_t from the namespace, freeing up any memory used.  Requires
719 * spa_namespace_lock.  This is called only after the spa_t has been closed and
720 * deactivated.
721 */
722void
723spa_remove(spa_t *spa)
724{
725	spa_config_dirent_t *dp;
726
727	ASSERT(MUTEX_HELD(&spa_namespace_lock));
728	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
729	ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0);
730
731	nvlist_free(spa->spa_config_splitting);
732
733	avl_remove(&spa_namespace_avl, spa);
734	cv_broadcast(&spa_namespace_cv);
735
736	if (spa->spa_root) {
737		spa_strfree(spa->spa_root);
738		spa_active_count--;
739	}
740
741	while ((dp = list_head(&spa->spa_config_list)) != NULL) {
742		list_remove(&spa->spa_config_list, dp);
743		if (dp->scd_path != NULL)
744			spa_strfree(dp->scd_path);
745		kmem_free(dp, sizeof (spa_config_dirent_t));
746	}
747
748	list_destroy(&spa->spa_config_list);
749
750	nvlist_free(spa->spa_label_features);
751	nvlist_free(spa->spa_load_info);
752	spa_config_set(spa, NULL);
753
754#ifdef illumos
755	mutex_enter(&cpu_lock);
756	if (spa->spa_deadman_cycid != CYCLIC_NONE)
757		cyclic_remove(spa->spa_deadman_cycid);
758	mutex_exit(&cpu_lock);
759	spa->spa_deadman_cycid = CYCLIC_NONE;
760#else	/* !illumos */
761#ifdef _KERNEL
762	callout_drain(&spa->spa_deadman_cycid);
763#endif
764#endif
765
766	refcount_destroy(&spa->spa_refcount);
767
768	spa_config_lock_destroy(spa);
769
770	for (int t = 0; t < TXG_SIZE; t++)
771		bplist_destroy(&spa->spa_free_bplist[t]);
772
773	cv_destroy(&spa->spa_async_cv);
774	cv_destroy(&spa->spa_evicting_os_cv);
775	cv_destroy(&spa->spa_proc_cv);
776	cv_destroy(&spa->spa_scrub_io_cv);
777	cv_destroy(&spa->spa_suspend_cv);
778
779	mutex_destroy(&spa->spa_async_lock);
780	mutex_destroy(&spa->spa_errlist_lock);
781	mutex_destroy(&spa->spa_errlog_lock);
782	mutex_destroy(&spa->spa_evicting_os_lock);
783	mutex_destroy(&spa->spa_history_lock);
784	mutex_destroy(&spa->spa_proc_lock);
785	mutex_destroy(&spa->spa_props_lock);
786	mutex_destroy(&spa->spa_scrub_lock);
787	mutex_destroy(&spa->spa_suspend_lock);
788	mutex_destroy(&spa->spa_vdev_top_lock);
789
790	kmem_free(spa, sizeof (spa_t));
791}
792
793/*
794 * Given a pool, return the next pool in the namespace, or NULL if there is
795 * none.  If 'prev' is NULL, return the first pool.
796 */
797spa_t *
798spa_next(spa_t *prev)
799{
800	ASSERT(MUTEX_HELD(&spa_namespace_lock));
801
802	if (prev)
803		return (AVL_NEXT(&spa_namespace_avl, prev));
804	else
805		return (avl_first(&spa_namespace_avl));
806}
807
808/*
809 * ==========================================================================
810 * SPA refcount functions
811 * ==========================================================================
812 */
813
814/*
815 * Add a reference to the given spa_t.  Must have at least one reference, or
816 * have the namespace lock held.
817 */
818void
819spa_open_ref(spa_t *spa, void *tag)
820{
821	ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
822	    MUTEX_HELD(&spa_namespace_lock));
823	(void) refcount_add(&spa->spa_refcount, tag);
824}
825
826/*
827 * Remove a reference to the given spa_t.  Must have at least one reference, or
828 * have the namespace lock held.
829 */
830void
831spa_close(spa_t *spa, void *tag)
832{
833	ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref ||
834	    MUTEX_HELD(&spa_namespace_lock));
835	(void) refcount_remove(&spa->spa_refcount, tag);
836}
837
838/*
839 * Remove a reference to the given spa_t held by a dsl dir that is
840 * being asynchronously released.  Async releases occur from a taskq
841 * performing eviction of dsl datasets and dirs.  The namespace lock
842 * isn't held and the hold by the object being evicted may contribute to
843 * spa_minref (e.g. dataset or directory released during pool export),
844 * so the asserts in spa_close() do not apply.
845 */
846void
847spa_async_close(spa_t *spa, void *tag)
848{
849	(void) refcount_remove(&spa->spa_refcount, tag);
850}
851
852/*
853 * Check to see if the spa refcount is zero.  Must be called with
854 * spa_namespace_lock held.  We really compare against spa_minref, which is the
855 * number of references acquired when opening a pool.
856 */
857boolean_t
858spa_refcount_zero(spa_t *spa)
859{
860	ASSERT(MUTEX_HELD(&spa_namespace_lock));
861
862	return (refcount_count(&spa->spa_refcount) == spa->spa_minref);
863}
864
865/*
866 * ==========================================================================
867 * SPA spare and l2cache tracking
868 * ==========================================================================
869 */
870
871/*
872 * Hot spares and cache devices are tracked using the same 'auxiliary'
873 * device code below.
874 */
875
876typedef struct spa_aux {
877	uint64_t	aux_guid;
878	uint64_t	aux_pool;
879	avl_node_t	aux_avl;
880	int		aux_count;
881} spa_aux_t;
882
883static int
884spa_aux_compare(const void *a, const void *b)
885{
886	const spa_aux_t *sa = a;
887	const spa_aux_t *sb = b;
888
889	if (sa->aux_guid < sb->aux_guid)
890		return (-1);
891	else if (sa->aux_guid > sb->aux_guid)
892		return (1);
893	else
894		return (0);
895}
896
897void
898spa_aux_add(vdev_t *vd, avl_tree_t *avl)
899{
900	avl_index_t where;
901	spa_aux_t search;
902	spa_aux_t *aux;
903
904	search.aux_guid = vd->vdev_guid;
905	if ((aux = avl_find(avl, &search, &where)) != NULL) {
906		aux->aux_count++;
907	} else {
908		aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
909		aux->aux_guid = vd->vdev_guid;
910		aux->aux_count = 1;
911		avl_insert(avl, aux, where);
912	}
913}
914
915void
916spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
917{
918	spa_aux_t search;
919	spa_aux_t *aux;
920	avl_index_t where;
921
922	search.aux_guid = vd->vdev_guid;
923	aux = avl_find(avl, &search, &where);
924
925	ASSERT(aux != NULL);
926
927	if (--aux->aux_count == 0) {
928		avl_remove(avl, aux);
929		kmem_free(aux, sizeof (spa_aux_t));
930	} else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
931		aux->aux_pool = 0ULL;
932	}
933}
934
935boolean_t
936spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
937{
938	spa_aux_t search, *found;
939
940	search.aux_guid = guid;
941	found = avl_find(avl, &search, NULL);
942
943	if (pool) {
944		if (found)
945			*pool = found->aux_pool;
946		else
947			*pool = 0ULL;
948	}
949
950	if (refcnt) {
951		if (found)
952			*refcnt = found->aux_count;
953		else
954			*refcnt = 0;
955	}
956
957	return (found != NULL);
958}
959
960void
961spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
962{
963	spa_aux_t search, *found;
964	avl_index_t where;
965
966	search.aux_guid = vd->vdev_guid;
967	found = avl_find(avl, &search, &where);
968	ASSERT(found != NULL);
969	ASSERT(found->aux_pool == 0ULL);
970
971	found->aux_pool = spa_guid(vd->vdev_spa);
972}
973
974/*
975 * Spares are tracked globally due to the following constraints:
976 *
977 * 	- A spare may be part of multiple pools.
978 * 	- A spare may be added to a pool even if it's actively in use within
979 *	  another pool.
980 * 	- A spare in use in any pool can only be the source of a replacement if
981 *	  the target is a spare in the same pool.
982 *
983 * We keep track of all spares on the system through the use of a reference
984 * counted AVL tree.  When a vdev is added as a spare, or used as a replacement
985 * spare, we bump the reference count in the AVL tree.  In addition, we set
986 * the 'vdev_isspare' member to indicate that the device is a spare (active or
987 * inactive).  When a spare is made active (used to replace a device in the
988 * pool), we also keep track of which pool it has been made a part of.
989 *
990 * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
991 * called under the spa_namespace lock as part of vdev reconfiguration.  The
992 * separate spare lock exists for the status query path, which does not need to
993 * be completely consistent with respect to other vdev configuration changes.
994 */
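/*
 * For illustration, a sketch (assuming a vdev_t *vd in hand) of how the
 * status query path consults this tree without holding the namespace lock:
 *
 *	uint64_t pool;
 *	int refcnt;
 *
 *	if (spa_spare_exists(vd->vdev_guid, &pool, &refcnt)) {
 *		// The device is a known spare: 'refcnt' is the number of
 *		// pools that list it, and 'pool' is the guid of the pool
 *		// actively using it (0 if it is not active anywhere).
 *	}
 */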
995
996static int
997spa_spare_compare(const void *a, const void *b)
998{
999	return (spa_aux_compare(a, b));
1000}
1001
1002void
1003spa_spare_add(vdev_t *vd)
1004{
1005	mutex_enter(&spa_spare_lock);
1006	ASSERT(!vd->vdev_isspare);
1007	spa_aux_add(vd, &spa_spare_avl);
1008	vd->vdev_isspare = B_TRUE;
1009	mutex_exit(&spa_spare_lock);
1010}
1011
1012void
1013spa_spare_remove(vdev_t *vd)
1014{
1015	mutex_enter(&spa_spare_lock);
1016	ASSERT(vd->vdev_isspare);
1017	spa_aux_remove(vd, &spa_spare_avl);
1018	vd->vdev_isspare = B_FALSE;
1019	mutex_exit(&spa_spare_lock);
1020}
1021
1022boolean_t
1023spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
1024{
1025	boolean_t found;
1026
1027	mutex_enter(&spa_spare_lock);
1028	found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
1029	mutex_exit(&spa_spare_lock);
1030
1031	return (found);
1032}
1033
1034void
1035spa_spare_activate(vdev_t *vd)
1036{
1037	mutex_enter(&spa_spare_lock);
1038	ASSERT(vd->vdev_isspare);
1039	spa_aux_activate(vd, &spa_spare_avl);
1040	mutex_exit(&spa_spare_lock);
1041}
1042
1043/*
1044 * Level 2 ARC devices are tracked globally for the same reasons as spares.
1045 * Cache devices currently only support one pool per cache device, and so
1046 * for these devices the aux reference count is currently unused beyond 1.
1047 */
1048
1049static int
1050spa_l2cache_compare(const void *a, const void *b)
1051{
1052	return (spa_aux_compare(a, b));
1053}
1054
1055void
1056spa_l2cache_add(vdev_t *vd)
1057{
1058	mutex_enter(&spa_l2cache_lock);
1059	ASSERT(!vd->vdev_isl2cache);
1060	spa_aux_add(vd, &spa_l2cache_avl);
1061	vd->vdev_isl2cache = B_TRUE;
1062	mutex_exit(&spa_l2cache_lock);
1063}
1064
1065void
1066spa_l2cache_remove(vdev_t *vd)
1067{
1068	mutex_enter(&spa_l2cache_lock);
1069	ASSERT(vd->vdev_isl2cache);
1070	spa_aux_remove(vd, &spa_l2cache_avl);
1071	vd->vdev_isl2cache = B_FALSE;
1072	mutex_exit(&spa_l2cache_lock);
1073}
1074
1075boolean_t
1076spa_l2cache_exists(uint64_t guid, uint64_t *pool)
1077{
1078	boolean_t found;
1079
1080	mutex_enter(&spa_l2cache_lock);
1081	found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
1082	mutex_exit(&spa_l2cache_lock);
1083
1084	return (found);
1085}
1086
1087void
1088spa_l2cache_activate(vdev_t *vd)
1089{
1090	mutex_enter(&spa_l2cache_lock);
1091	ASSERT(vd->vdev_isl2cache);
1092	spa_aux_activate(vd, &spa_l2cache_avl);
1093	mutex_exit(&spa_l2cache_lock);
1094}
1095
1096/*
1097 * ==========================================================================
1098 * SPA vdev locking
1099 * ==========================================================================
1100 */
1101
1102/*
1103 * Lock the given spa_t for the purpose of adding or removing a vdev.
1104 * Grabs the global spa_namespace_lock plus the spa config lock for writing.
1105 * It returns the next transaction group for the spa_t.
1106 */
1107uint64_t
1108spa_vdev_enter(spa_t *spa)
1109{
1110	mutex_enter(&spa->spa_vdev_top_lock);
1111	mutex_enter(&spa_namespace_lock);
1112	return (spa_vdev_config_enter(spa));
1113}
1114
1115/*
1116 * Internal implementation for spa_vdev_enter().  Used when a vdev
1117 * operation requires multiple syncs (e.g. removing a device) while
1118 * keeping the spa_namespace_lock held.
1119 */
1120uint64_t
1121spa_vdev_config_enter(spa_t *spa)
1122{
1123	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1124
1125	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1126
1127	return (spa_last_synced_txg(spa) + 1);
1128}
1129
1130/*
1131 * Used in combination with spa_vdev_config_enter() to allow the syncing
1132 * of multiple transactions without releasing the spa_namespace_lock.
1133 */
1134void
1135spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
1136{
1137	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1138
1139	int config_changed = B_FALSE;
1140
1141	ASSERT(txg > spa_last_synced_txg(spa));
1142
1143	spa->spa_pending_vdev = NULL;
1144
1145	/*
1146	 * Reassess the DTLs.
1147	 */
1148	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
1149
1150	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
1151		config_changed = B_TRUE;
1152		spa->spa_config_generation++;
1153	}
1154
1155	/*
1156	 * Verify the metaslab classes.
1157	 */
1158	ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
1159	ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
1160
1161	spa_config_exit(spa, SCL_ALL, spa);
1162
1163	/*
1164	 * Panic the system if the specified tag requires it.  This
1165	 * is useful for ensuring that configurations are updated
1166	 * transactionally.
1167	 */
1168	if (zio_injection_enabled)
1169		zio_handle_panic_injection(spa, tag, 0);
1170
1171	/*
1172	 * Note: this txg_wait_synced() is important because it ensures
1173	 * that there won't be more than one config change per txg.
1174	 * This allows us to use the txg as the generation number.
1175	 */
1176	if (error == 0)
1177		txg_wait_synced(spa->spa_dsl_pool, txg);
1178
1179	if (vd != NULL) {
1180		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
1181		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1182		vdev_free(vd);
1183		spa_config_exit(spa, SCL_ALL, spa);
1184	}
1185
1186	/*
1187	 * If the config changed, update the config cache.
1188	 */
1189	if (config_changed)
1190		spa_config_sync(spa, B_FALSE, B_TRUE);
1191}
1192
1193/*
1194 * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
1195 * locking of spa_vdev_enter(), we also want to make sure the transactions have
1196 * synced to disk, and then update the global configuration cache with the new
1197 * information.
1198 */
1199int
1200spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
1201{
1202	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
1203	mutex_exit(&spa_namespace_lock);
1204	mutex_exit(&spa->spa_vdev_top_lock);
1205
1206	return (error);
1207}
1208
1209/*
1210 * Lock the given spa_t for the purpose of changing vdev state.
1211 */
1212void
1213spa_vdev_state_enter(spa_t *spa, int oplocks)
1214{
1215	int locks = SCL_STATE_ALL | oplocks;
1216
1217	/*
1218	 * Root pools may need to read from the underlying devfs filesystem
1219	 * when opening up a vdev.  Unfortunately if we're holding the
1220	 * SCL_ZIO lock it will result in a deadlock when we try to issue
1221	 * the read from the root filesystem.  Instead we "prefetch"
1222	 * the associated vnodes that we need prior to opening the
1223	 * underlying devices and cache them so that we can prevent
1224	 * any I/O when we are doing the actual open.
1225	 */
1226	if (spa_is_root(spa)) {
1227		int low = locks & ~(SCL_ZIO - 1);
1228		int high = locks & ~low;
1229
1230		spa_config_enter(spa, high, spa, RW_WRITER);
1231		vdev_hold(spa->spa_root_vdev);
1232		spa_config_enter(spa, low, spa, RW_WRITER);
1233	} else {
1234		spa_config_enter(spa, locks, spa, RW_WRITER);
1235	}
1236	spa->spa_vdev_locks = locks;
1237}
1238
1239int
1240spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
1241{
1242	boolean_t config_changed = B_FALSE;
1243
1244	if (vd != NULL || error == 0)
1245		vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
1246		    0, 0, B_FALSE);
1247
1248	if (vd != NULL) {
1249		vdev_state_dirty(vd->vdev_top);
1250		config_changed = B_TRUE;
1251		spa->spa_config_generation++;
1252	}
1253
1254	if (spa_is_root(spa))
1255		vdev_rele(spa->spa_root_vdev);
1256
1257	ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
1258	spa_config_exit(spa, spa->spa_vdev_locks, spa);
1259
1260	/*
1261	 * If anything changed, wait for it to sync.  This ensures that,
1262	 * from the system administrator's perspective, zpool(1M) commands
1263	 * are synchronous.  This is important for things like zpool offline:
1264	 * when the command completes, you expect no further I/O from ZFS.
1265	 */
1266	if (vd != NULL)
1267		txg_wait_synced(spa->spa_dsl_pool, 0);
1268
1269	/*
1270	 * If the config changed, update the config cache.
1271	 */
1272	if (config_changed) {
1273		mutex_enter(&spa_namespace_lock);
1274		spa_config_sync(spa, B_FALSE, B_TRUE);
1275		mutex_exit(&spa_namespace_lock);
1276	}
1277
1278	return (error);
1279}
1280
1281/*
1282 * ==========================================================================
1283 * Miscellaneous functions
1284 * ==========================================================================
1285 */
1286
1287void
1288spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
1289{
1290	if (!nvlist_exists(spa->spa_label_features, feature)) {
1291		fnvlist_add_boolean(spa->spa_label_features, feature);
1292		/*
1293		 * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
1294		 * dirty the vdev config because the SCL_CONFIG lock is not held.
1295		 * Thankfully, in this case we don't need to dirty the config
1296		 * because it will be written out anyway when we finish
1297		 * creating the pool.
1298		 */
1299		if (tx->tx_txg != TXG_INITIAL)
1300			vdev_config_dirty(spa->spa_root_vdev);
1301	}
1302}
1303
1304void
1305spa_deactivate_mos_feature(spa_t *spa, const char *feature)
1306{
1307	if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
1308		vdev_config_dirty(spa->spa_root_vdev);
1309}
1310
1311/*
1312 * Rename a spa_t.
1313 */
1314int
1315spa_rename(const char *name, const char *newname)
1316{
1317	spa_t *spa;
1318	int err;
1319
1320	/*
1321	 * Lookup the spa_t and grab the config lock for writing.  We need to
1322	 * actually open the pool so that we can sync out the necessary labels.
1323	 * It's OK to call spa_open() with the namespace lock held because we
1324	 * allow recursive calls for other reasons.
1325	 */
1326	mutex_enter(&spa_namespace_lock);
1327	if ((err = spa_open(name, &spa, FTAG)) != 0) {
1328		mutex_exit(&spa_namespace_lock);
1329		return (err);
1330	}
1331
1332	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1333
1334	avl_remove(&spa_namespace_avl, spa);
1335	(void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
1336	avl_add(&spa_namespace_avl, spa);
1337
1338	/*
1339	 * Sync all labels to disk with the new names by marking the root vdev
1340	 * dirty and waiting for it to sync.  It will pick up the new pool name
1341	 * during the sync.
1342	 */
1343	vdev_config_dirty(spa->spa_root_vdev);
1344
1345	spa_config_exit(spa, SCL_ALL, FTAG);
1346
1347	txg_wait_synced(spa->spa_dsl_pool, 0);
1348
1349	/*
1350	 * Sync the updated config cache.
1351	 */
1352	spa_config_sync(spa, B_FALSE, B_TRUE);
1353
1354	spa_close(spa, FTAG);
1355
1356	mutex_exit(&spa_namespace_lock);
1357
1358	return (0);
1359}
1360
1361/*
1362 * Return the spa_t associated with the given pool_guid, if it exists.  If
1363 * device_guid is non-zero, determine whether the pool exists *and* contains
1364 * a device with the specified device_guid.
1365 */
1366spa_t *
1367spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
1368{
1369	spa_t *spa;
1370	avl_tree_t *t = &spa_namespace_avl;
1371
1372	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1373
1374	for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
1375		if (spa->spa_state == POOL_STATE_UNINITIALIZED)
1376			continue;
1377		if (spa->spa_root_vdev == NULL)
1378			continue;
1379		if (spa_guid(spa) == pool_guid) {
1380			if (device_guid == 0)
1381				break;
1382
1383			if (vdev_lookup_by_guid(spa->spa_root_vdev,
1384			    device_guid) != NULL)
1385				break;
1386
1387			/*
1388			 * Check any devices we may be in the process of adding.
1389			 */
1390			if (spa->spa_pending_vdev) {
1391				if (vdev_lookup_by_guid(spa->spa_pending_vdev,
1392				    device_guid) != NULL)
1393					break;
1394			}
1395		}
1396	}
1397
1398	return (spa);
1399}
1400
1401/*
1402 * Determine whether a pool with the given pool_guid exists.
1403 */
1404boolean_t
1405spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
1406{
1407	return (spa_by_guid(pool_guid, device_guid) != NULL);
1408}
1409
1410char *
1411spa_strdup(const char *s)
1412{
1413	size_t len;
1414	char *new;
1415
1416	len = strlen(s);
1417	new = kmem_alloc(len + 1, KM_SLEEP);
1418	bcopy(s, new, len);
1419	new[len] = '\0';
1420
1421	return (new);
1422}
1423
1424void
1425spa_strfree(char *s)
1426{
1427	kmem_free(s, strlen(s) + 1);
1428}
1429
1430uint64_t
1431spa_get_random(uint64_t range)
1432{
1433	uint64_t r;
1434
1435	ASSERT(range != 0);
1436
1437	(void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
1438
1439	return (r % range);
1440}
1441
1442uint64_t
1443spa_generate_guid(spa_t *spa)
1444{
1445	uint64_t guid = spa_get_random(-1ULL);
1446
1447	if (spa != NULL) {
1448		while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
1449			guid = spa_get_random(-1ULL);
1450	} else {
1451		while (guid == 0 || spa_guid_exists(guid, 0))
1452			guid = spa_get_random(-1ULL);
1453	}
1454
1455	return (guid);
1456}
1457
1458void
1459snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
1460{
1461	char type[256];
1462	char *checksum = NULL;
1463	char *compress = NULL;
1464
1465	if (bp != NULL) {
1466		if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
1467			dmu_object_byteswap_t bswap =
1468			    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
1469			(void) snprintf(type, sizeof (type), "bswap %s %s",
1470			    DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
1471			    "metadata" : "data",
1472			    dmu_ot_byteswap[bswap].ob_name);
1473		} else {
1474			(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
1475			    sizeof (type));
1476		}
1477		if (!BP_IS_EMBEDDED(bp)) {
1478			checksum =
1479			    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
1480		}
1481		compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
1482	}
1483
1484	SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
1485	    compress);
1486}
1487
1488void
1489spa_freeze(spa_t *spa)
1490{
1491	uint64_t freeze_txg = 0;
1492
1493	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1494	if (spa->spa_freeze_txg == UINT64_MAX) {
1495		freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
1496		spa->spa_freeze_txg = freeze_txg;
1497	}
1498	spa_config_exit(spa, SCL_ALL, FTAG);
1499	if (freeze_txg != 0)
1500		txg_wait_synced(spa_get_dsl(spa), freeze_txg);
1501}
1502
1503void
1504zfs_panic_recover(const char *fmt, ...)
1505{
1506	va_list adx;
1507
1508	va_start(adx, fmt);
1509	vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
1510	va_end(adx);
1511}
1512
1513/*
1514 * This is a stripped-down version of strtoull, suitable only for converting
1515 * lowercase hexadecimal numbers that don't overflow.
1516 */
1517uint64_t
1518zfs_strtonum(const char *str, char **nptr)
1519{
1520	uint64_t val = 0;
1521	char c;
1522	int digit;
1523
1524	while ((c = *str) != '\0') {
1525		if (c >= '0' && c <= '9')
1526			digit = c - '0';
1527		else if (c >= 'a' && c <= 'f')
1528			digit = 10 + c - 'a';
1529		else
1530			break;
1531
1532		val *= 16;
1533		val += digit;
1534
1535		str++;
1536	}
1537
1538	if (nptr)
1539		*nptr = (char *)str;
1540
1541	return (val);
1542}
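/*
 * For example, zfs_strtonum("ff", NULL) returns 255, and
 * zfs_strtonum("1a2b/ds", &end) returns 0x1a2b (6699) with 'end' left
 * pointing at the "/ds" suffix.
 */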
1543
1544/*
1545 * ==========================================================================
1546 * Accessor functions
1547 * ==========================================================================
1548 */
1549
1550boolean_t
1551spa_shutting_down(spa_t *spa)
1552{
1553	return (spa->spa_async_suspended);
1554}
1555
1556dsl_pool_t *
1557spa_get_dsl(spa_t *spa)
1558{
1559	return (spa->spa_dsl_pool);
1560}
1561
1562boolean_t
1563spa_is_initializing(spa_t *spa)
1564{
1565	return (spa->spa_is_initializing);
1566}
1567
1568blkptr_t *
1569spa_get_rootblkptr(spa_t *spa)
1570{
1571	return (&spa->spa_ubsync.ub_rootbp);
1572}
1573
1574void
1575spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
1576{
1577	spa->spa_uberblock.ub_rootbp = *bp;
1578}
1579
1580void
1581spa_altroot(spa_t *spa, char *buf, size_t buflen)
1582{
1583	if (spa->spa_root == NULL)
1584		buf[0] = '\0';
1585	else
1586		(void) strncpy(buf, spa->spa_root, buflen);
1587}
1588
1589int
1590spa_sync_pass(spa_t *spa)
1591{
1592	return (spa->spa_sync_pass);
1593}
1594
1595char *
1596spa_name(spa_t *spa)
1597{
1598	return (spa->spa_name);
1599}
1600
1601uint64_t
1602spa_guid(spa_t *spa)
1603{
1604	dsl_pool_t *dp = spa_get_dsl(spa);
1605	uint64_t guid;
1606
1607	/*
1608	 * If we fail to parse the config during spa_load(), we can go through
1609	 * the error path (which posts an ereport) and end up here with no root
1610	 * vdev.  We stash the original pool guid in 'spa_config_guid' to handle
1611	 * this case.
1612	 */
1613	if (spa->spa_root_vdev == NULL)
1614		return (spa->spa_config_guid);
1615
1616	guid = spa->spa_last_synced_guid != 0 ?
1617	    spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
1618
1619	/*
1620	 * Return the most recently synced out guid unless we're
1621	 * in syncing context.
1622	 */
1623	if (dp && dsl_pool_sync_context(dp))
1624		return (spa->spa_root_vdev->vdev_guid);
1625	else
1626		return (guid);
1627}
1628
1629uint64_t
1630spa_load_guid(spa_t *spa)
1631{
1632	/*
1633	 * This is a GUID that exists solely as a reference for the
1634	 * purposes of the arc.  It is generated at load time, and
1635	 * is never written to persistent storage.
1636	 */
1637	return (spa->spa_load_guid);
1638}
1639
1640uint64_t
1641spa_last_synced_txg(spa_t *spa)
1642{
1643	return (spa->spa_ubsync.ub_txg);
1644}
1645
1646uint64_t
1647spa_first_txg(spa_t *spa)
1648{
1649	return (spa->spa_first_txg);
1650}
1651
1652uint64_t
1653spa_syncing_txg(spa_t *spa)
1654{
1655	return (spa->spa_syncing_txg);
1656}
1657
1658pool_state_t
1659spa_state(spa_t *spa)
1660{
1661	return (spa->spa_state);
1662}
1663
1664spa_load_state_t
1665spa_load_state(spa_t *spa)
1666{
1667	return (spa->spa_load_state);
1668}
1669
1670uint64_t
1671spa_freeze_txg(spa_t *spa)
1672{
1673	return (spa->spa_freeze_txg);
1674}
1675
1676/* ARGSUSED */
1677uint64_t
1678spa_get_asize(spa_t *spa, uint64_t lsize)
1679{
1680	return (lsize * spa_asize_inflation);
1681}
1682
1683/*
1684 * Return the amount of slop space in bytes.  It is 1/32 of the pool (3.1%),
1685 * or at least 32MB.
1686 *
1687 * See the comment above spa_slop_shift for details.
1688 */
1689uint64_t
1690spa_get_slop_space(spa_t *spa) {
1691	uint64_t space = spa_get_dspace(spa);
1692	return (MAX(space >> spa_slop_shift, SPA_MINDEVSIZE >> 1));
1693}
1694
1695uint64_t
1696spa_get_dspace(spa_t *spa)
1697{
1698	return (spa->spa_dspace);
1699}
1700
1701void
1702spa_update_dspace(spa_t *spa)
1703{
1704	spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
1705	    ddt_get_dedup_dspace(spa);
1706}
1707
1708/*
1709 * Return the failure mode that has been set for this pool.  The default
1710 * behavior is to block all I/Os when a complete failure occurs.
1711 */
1712uint8_t
1713spa_get_failmode(spa_t *spa)
1714{
1715	return (spa->spa_failmode);
1716}
1717
1718boolean_t
1719spa_suspended(spa_t *spa)
1720{
1721	return (spa->spa_suspended);
1722}
1723
1724uint64_t
1725spa_version(spa_t *spa)
1726{
1727	return (spa->spa_ubsync.ub_version);
1728}
1729
1730boolean_t
1731spa_deflate(spa_t *spa)
1732{
1733	return (spa->spa_deflate);
1734}
1735
1736metaslab_class_t *
1737spa_normal_class(spa_t *spa)
1738{
1739	return (spa->spa_normal_class);
1740}
1741
1742metaslab_class_t *
1743spa_log_class(spa_t *spa)
1744{
1745	return (spa->spa_log_class);
1746}
1747
1748void
1749spa_evicting_os_register(spa_t *spa, objset_t *os)
1750{
1751	mutex_enter(&spa->spa_evicting_os_lock);
1752	list_insert_head(&spa->spa_evicting_os_list, os);
1753	mutex_exit(&spa->spa_evicting_os_lock);
1754}
1755
1756void
1757spa_evicting_os_deregister(spa_t *spa, objset_t *os)
1758{
1759	mutex_enter(&spa->spa_evicting_os_lock);
1760	list_remove(&spa->spa_evicting_os_list, os);
1761	cv_broadcast(&spa->spa_evicting_os_cv);
1762	mutex_exit(&spa->spa_evicting_os_lock);
1763}
1764
1765void
1766spa_evicting_os_wait(spa_t *spa)
1767{
1768	mutex_enter(&spa->spa_evicting_os_lock);
1769	while (!list_is_empty(&spa->spa_evicting_os_list))
1770		cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
1771	mutex_exit(&spa->spa_evicting_os_lock);
1772
1773	dmu_buf_user_evict_wait();
1774}
1775
1776int
1777spa_max_replication(spa_t *spa)
1778{
1779	/*
1780	 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
1781	 * handle BPs with more than one DVA allocated.  Set our max
1782	 * replication level accordingly.
1783	 */
1784	if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
1785		return (1);
1786	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
1787}
1788
1789int
1790spa_prev_software_version(spa_t *spa)
1791{
1792	return (spa->spa_prev_software_version);
1793}
1794
1795uint64_t
1796spa_deadman_synctime(spa_t *spa)
1797{
1798	return (spa->spa_deadman_synctime);
1799}
1800
1801uint64_t
1802dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
1803{
1804	uint64_t asize = DVA_GET_ASIZE(dva);
1805	uint64_t dsize = asize;
1806
1807	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1808
1809	if (asize != 0 && spa->spa_deflate) {
1810		vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
1811		dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
1812	}
1813
1814	return (dsize);
1815}
1816
1817uint64_t
1818bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
1819{
1820	uint64_t dsize = 0;
1821
1822	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
1823		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
1824
1825	return (dsize);
1826}
1827
1828uint64_t
1829bp_get_dsize(spa_t *spa, const blkptr_t *bp)
1830{
1831	uint64_t dsize = 0;
1832
1833	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
1834
1835	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
1836		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
1837
1838	spa_config_exit(spa, SCL_VDEV, FTAG);
1839
1840	return (dsize);
1841}
1842
1843/*
1844 * ==========================================================================
1845 * Initialization and Termination
1846 * ==========================================================================
1847 */
1848
1849static int
1850spa_name_compare(const void *a1, const void *a2)
1851{
1852	const spa_t *s1 = a1;
1853	const spa_t *s2 = a2;
1854	int s;
1855
1856	s = strcmp(s1->spa_name, s2->spa_name);
1857	if (s > 0)
1858		return (1);
1859	if (s < 0)
1860		return (-1);
1861	return (0);
1862}
1863
1864int
1865spa_busy(void)
1866{
1867	return (spa_active_count);
1868}
1869
1870void
1871spa_boot_init()
1872{
1873	spa_config_load();
1874}
1875
1876#ifdef _KERNEL
1877EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
1878#endif
1879
1880void
1881spa_init(int mode)
1882{
1883	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
1884	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
1885	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
1886	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
1887
1888	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
1889	    offsetof(spa_t, spa_avl));
1890
1891	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
1892	    offsetof(spa_aux_t, aux_avl));
1893
1894	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
1895	    offsetof(spa_aux_t, aux_avl));
1896
1897	spa_mode_global = mode;
1898
1899#ifdef illumos
1900#ifdef _KERNEL
1901	spa_arch_init();
1902#else
1903	if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
1904		arc_procfd = open("/proc/self/ctl", O_WRONLY);
1905		if (arc_procfd == -1) {
1906			perror("could not enable watchpoints: "
1907			    "opening /proc/self/ctl failed: ");
1908		} else {
1909			arc_watch = B_TRUE;
1910		}
1911	}
1912#endif
1913#endif /* illumos */
1914	refcount_sysinit();
1915	unique_init();
1916	range_tree_init();
1917	zio_init();
1918	lz4_init();
1919	dmu_init();
1920	zil_init();
1921	vdev_cache_stat_init();
1922	zfs_prop_init();
1923	zpool_prop_init();
1924	zpool_feature_init();
1925	spa_config_load();
1926	l2arc_start();
1927#ifndef illumos
1928#ifdef _KERNEL
1929	zfs_deadman_init();
1930#endif
1931#endif	/* !illumos */
1932}
1933
1934void
1935spa_fini(void)
1936{
1937	l2arc_stop();
1938
1939	spa_evict_all();
1940
1941	vdev_cache_stat_fini();
1942	zil_fini();
1943	dmu_fini();
1944	lz4_fini();
1945	zio_fini();
1946	range_tree_fini();
1947	unique_fini();
1948	refcount_fini();
1949
1950	avl_destroy(&spa_namespace_avl);
1951	avl_destroy(&spa_spare_avl);
1952	avl_destroy(&spa_l2cache_avl);
1953
1954	cv_destroy(&spa_namespace_cv);
1955	mutex_destroy(&spa_namespace_lock);
1956	mutex_destroy(&spa_spare_lock);
1957	mutex_destroy(&spa_l2cache_lock);
1958}
1959
1960/*
1961 * Return whether this pool has slogs. No locking needed.
1962 * It's not a problem if the wrong answer is returned, as it's only used for
1963 * performance and not correctness.
1964 */
1965boolean_t
1966spa_has_slogs(spa_t *spa)
1967{
1968	return (spa->spa_log_class->mc_rotor != NULL);
1969}
1970
1971spa_log_state_t
1972spa_get_log_state(spa_t *spa)
1973{
1974	return (spa->spa_log_state);
1975}
1976
1977void
1978spa_set_log_state(spa_t *spa, spa_log_state_t state)
1979{
1980	spa->spa_log_state = state;
1981}
1982
1983boolean_t
1984spa_is_root(spa_t *spa)
1985{
1986	return (spa->spa_is_root);
1987}
1988
1989boolean_t
1990spa_writeable(spa_t *spa)
1991{
1992	return (!!(spa->spa_mode & FWRITE));
1993}
1994
1995/*
1996 * Returns true if there is a pending sync task in any of the current
1997 * syncing txg, the current quiescing txg, or the current open txg.
1998 */
1999boolean_t
2000spa_has_pending_synctask(spa_t *spa)
2001{
2002	return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks));
2003}
2004
2005int
2006spa_mode(spa_t *spa)
2007{
2008	return (spa->spa_mode);
2009}
2010
2011uint64_t
2012spa_bootfs(spa_t *spa)
2013{
2014	return (spa->spa_bootfs);
2015}
2016
2017uint64_t
2018spa_delegation(spa_t *spa)
2019{
2020	return (spa->spa_delegation);
2021}
2022
2023objset_t *
2024spa_meta_objset(spa_t *spa)
2025{
2026	return (spa->spa_meta_objset);
2027}
2028
2029enum zio_checksum
2030spa_dedup_checksum(spa_t *spa)
2031{
2032	return (spa->spa_dedup_checksum);
2033}
2034
2035/*
2036 * Reset pool scan stat per scan pass (or reboot).
2037 */
2038void
2039spa_scan_stat_init(spa_t *spa)
2040{
2041	/* data not stored on disk */
2042	spa->spa_scan_pass_start = gethrestime_sec();
2043	spa->spa_scan_pass_exam = 0;
2044	vdev_scan_stat_init(spa->spa_root_vdev);
2045}
2046
2047/*
2048 * Get scan stats for zpool status reports
2049 */
2050int
2051spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
2052{
2053	dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
2054
2055	if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
2056		return (SET_ERROR(ENOENT));
2057	bzero(ps, sizeof (pool_scan_stat_t));
2058
2059	/* data stored on disk */
2060	ps->pss_func = scn->scn_phys.scn_func;
2061	ps->pss_start_time = scn->scn_phys.scn_start_time;
2062	ps->pss_end_time = scn->scn_phys.scn_end_time;
2063	ps->pss_to_examine = scn->scn_phys.scn_to_examine;
2064	ps->pss_examined = scn->scn_phys.scn_examined;
2065	ps->pss_to_process = scn->scn_phys.scn_to_process;
2066	ps->pss_processed = scn->scn_phys.scn_processed;
2067	ps->pss_errors = scn->scn_phys.scn_errors;
2068	ps->pss_state = scn->scn_phys.scn_state;
2069
2070	/* data not stored on disk */
2071	ps->pss_pass_start = spa->spa_scan_pass_start;
2072	ps->pss_pass_exam = spa->spa_scan_pass_exam;
2073
2074	return (0);
2075}
2076
2077boolean_t
2078spa_debug_enabled(spa_t *spa)
2079{
2080	return (spa->spa_debug);
2081}
2082
2083int
2084spa_maxblocksize(spa_t *spa)
2085{
2086	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
2087		return (SPA_MAXBLOCKSIZE);
2088	else
2089		return (SPA_OLD_MAXBLOCKSIZE);
2090}
2091