1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
25 * Copyright 2017 Nexenta Systems, Inc.
26 * Copyright (c) 2014 Integros [integros.com]
27 * Copyright 2016 Toomas Soome <tsoome@me.com>
28 * Copyright 2017 Joyent, Inc.
29 * Copyright (c) 2017, Intel Corporation.
30 * Copyright (c) 2019, Datto Inc. All rights reserved.
31 * Copyright (c) 2021, Klara Inc.
32 * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
33 */
34
35#include <sys/zfs_context.h>
36#include <sys/fm/fs/zfs.h>
37#include <sys/spa.h>
38#include <sys/spa_impl.h>
39#include <sys/bpobj.h>
40#include <sys/dmu.h>
41#include <sys/dmu_tx.h>
42#include <sys/dsl_dir.h>
43#include <sys/vdev_impl.h>
44#include <sys/vdev_rebuild.h>
45#include <sys/vdev_draid.h>
46#include <sys/uberblock_impl.h>
47#include <sys/metaslab.h>
48#include <sys/metaslab_impl.h>
49#include <sys/space_map.h>
50#include <sys/space_reftree.h>
51#include <sys/zio.h>
52#include <sys/zap.h>
53#include <sys/fs/zfs.h>
54#include <sys/arc.h>
55#include <sys/zil.h>
56#include <sys/dsl_scan.h>
57#include <sys/vdev_raidz.h>
58#include <sys/abd.h>
59#include <sys/vdev_initialize.h>
60#include <sys/vdev_trim.h>
61#include <sys/vdev_raidz.h>
62#include <sys/zvol.h>
63#include <sys/zfs_ratelimit.h>
64#include "zfs_prop.h"
65
66/*
67 * One metaslab from each (normal-class) vdev is used by the ZIL.  These are
68 * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
69 * part of the spa_embedded_log_class.  The metaslab with the most free space
70 * in each vdev is selected for this purpose when the pool is opened (or a
71 * vdev is added).  See vdev_metaslab_init().
72 *
73 * Log blocks can be allocated from the following locations.  Each one is tried
74 * in order until the allocation succeeds:
75 * 1. dedicated log vdevs, aka "slog" (spa_log_class)
76 * 2. embedded slog metaslabs (spa_embedded_log_class)
77 * 3. other metaslabs in normal vdevs (spa_normal_class)
78 *
79 * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
80 * than this number of metaslabs in the vdev.  This ensures that we don't set
81 * aside an unreasonable amount of space for the ZIL.  If set to less than
82 * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
83 * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
84 */
85static uint_t zfs_embedded_slog_min_ms = 64;
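
/*
 * Illustrative numbers (a sketch, not additional logic): with the default
 * of 64 and the 512M metaslab floor below, a top-level vdev needs more
 * than 64 metaslabs -- roughly 32G at the minimum metaslab size -- before
 * vdev_metaslab_init() sets one aside for the embedded slog, and the vdev
 * must also be in the normal allocation class (dedicated log, special,
 * and dedup vdevs never get an embedded slog metaslab).
 */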
86
87/* default target for number of metaslabs per top-level vdev */
88static uint_t zfs_vdev_default_ms_count = 200;
89
90/* minimum number of metaslabs per top-level vdev */
91static uint_t zfs_vdev_min_ms_count = 16;
92
93/* practical upper limit of total metaslabs per top-level vdev */
94static uint_t zfs_vdev_ms_count_limit = 1ULL << 17;
95
96/* lower limit for metaslab size (512M) */
97static uint_t zfs_vdev_default_ms_shift = 29;
98
99/* upper limit for metaslab size (16G) */
100static uint_t zfs_vdev_max_ms_shift = 34;
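
/*
 * Worked example of the defaults above (the real sizing code rounds the
 * metaslab size to a power of two, so actual counts differ somewhat):
 * with a target of 200 metaslabs, a 1 TiB vdev would want ~5 GiB
 * metaslabs, which already lies inside the [2^29 = 512 MiB, 2^34 = 16 GiB]
 * clamp; a 64 GiB vdev would want ~320 MiB metaslabs and is instead
 * clamped up to 512 MiB, yielding 128 metaslabs, still comfortably above
 * the 16-metaslab minimum.
 */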
101
102int vdev_validate_skip = B_FALSE;
103
104/*
105 * Since the DTL space map of a vdev is not expected to have a lot of
106 * entries, we default its block size to 4K.
107 */
108int zfs_vdev_dtl_sm_blksz = (1 << 12);
109
110/*
111 * Rate limit slow IO (delay) events to this many per second.
112 */
113static unsigned int zfs_slow_io_events_per_second = 20;
114
115/*
116 * Rate limit checksum events after this many checksum errors per second.
117 */
118static unsigned int zfs_checksum_events_per_second = 20;
119
120/*
 * Ignore errors during scrub/resilver.  This allows working around a
 * resilver on import when there are pool errors.
123 */
124static int zfs_scan_ignore_errors = 0;
125
126/*
127 * vdev-wide space maps that have lots of entries written to them at
128 * the end of each transaction can benefit from a higher I/O bandwidth
129 * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
130 */
131int zfs_vdev_standard_sm_blksz = (1 << 17);
132
133/*
134 * Tunable parameter for debugging or performance analysis. Setting this
135 * will cause pool corruption on power loss if a volatile out-of-order
136 * write cache is enabled.
137 */
138int zfs_nocacheflush = 0;
139
140/*
 * Maximum and minimum ashift values that can be automatically set based on
 * the vdev's physical ashift (the disk's physical sector size).  While
 * ASHIFT_MAX is higher than this maximum, it is intentionally limited here
 * so as not to excessively impact pool space efficiency.  Higher ashift
 * values may still be forced by the vdev's logical ashift or by the user
 * via the ashift property, but they won't be set automatically as a
 * performance optimization.
147 */
148uint_t zfs_vdev_max_auto_ashift = 14;
149uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
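
/*
 * Example (illustrative): a drive advertising 512-byte logical and 4K
 * physical sectors has a logical ashift of 9 and a physical ashift of 12.
 * Since 12 <= zfs_vdev_max_auto_ashift (14), ashift=12 is chosen
 * automatically.  A device reporting 64K physical sectors (ashift 16)
 * exceeds the cap, so it is not raised to 16 automatically; that would
 * have to be requested explicitly via the ashift property.
 */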
150
151void
152vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
153{
154	va_list adx;
155	char buf[256];
156
157	va_start(adx, fmt);
158	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
159	va_end(adx);
160
161	if (vd->vdev_path != NULL) {
162		zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
163		    vd->vdev_path, buf);
164	} else {
165		zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
166		    vd->vdev_ops->vdev_op_type,
167		    (u_longlong_t)vd->vdev_id,
168		    (u_longlong_t)vd->vdev_guid, buf);
169	}
170}
171
172void
173vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
174{
175	char state[20];
176
177	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
178		zfs_dbgmsg("%*svdev %llu: %s", indent, "",
179		    (u_longlong_t)vd->vdev_id,
180		    vd->vdev_ops->vdev_op_type);
181		return;
182	}
183
184	switch (vd->vdev_state) {
185	case VDEV_STATE_UNKNOWN:
186		(void) snprintf(state, sizeof (state), "unknown");
187		break;
188	case VDEV_STATE_CLOSED:
189		(void) snprintf(state, sizeof (state), "closed");
190		break;
191	case VDEV_STATE_OFFLINE:
192		(void) snprintf(state, sizeof (state), "offline");
193		break;
194	case VDEV_STATE_REMOVED:
195		(void) snprintf(state, sizeof (state), "removed");
196		break;
197	case VDEV_STATE_CANT_OPEN:
198		(void) snprintf(state, sizeof (state), "can't open");
199		break;
200	case VDEV_STATE_FAULTED:
201		(void) snprintf(state, sizeof (state), "faulted");
202		break;
203	case VDEV_STATE_DEGRADED:
204		(void) snprintf(state, sizeof (state), "degraded");
205		break;
206	case VDEV_STATE_HEALTHY:
207		(void) snprintf(state, sizeof (state), "healthy");
208		break;
209	default:
210		(void) snprintf(state, sizeof (state), "<state %u>",
211		    (uint_t)vd->vdev_state);
212	}
213
214	zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
215	    "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
216	    vd->vdev_islog ? " (log)" : "",
217	    (u_longlong_t)vd->vdev_guid,
218	    vd->vdev_path ? vd->vdev_path : "N/A", state);
219
220	for (uint64_t i = 0; i < vd->vdev_children; i++)
221		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
222}
223
224/*
225 * Virtual device management.
226 */
227
228static vdev_ops_t *const vdev_ops_table[] = {
229	&vdev_root_ops,
230	&vdev_raidz_ops,
231	&vdev_draid_ops,
232	&vdev_draid_spare_ops,
233	&vdev_mirror_ops,
234	&vdev_replacing_ops,
235	&vdev_spare_ops,
236	&vdev_disk_ops,
237	&vdev_file_ops,
238	&vdev_missing_ops,
239	&vdev_hole_ops,
240	&vdev_indirect_ops,
241	NULL
242};
243
244/*
245 * Given a vdev type, return the appropriate ops vector.
246 */
247static vdev_ops_t *
248vdev_getops(const char *type)
249{
250	vdev_ops_t *ops, *const *opspp;
251
252	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
253		if (strcmp(ops->vdev_op_type, type) == 0)
254			break;
255
256	return (ops);
257}
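
/*
 * Usage sketch (illustrative): the table above makes type dispatch a
 * simple string lookup, e.g. as vdev_alloc() does below:
 *
 *	vdev_ops_t *ops = vdev_getops(VDEV_TYPE_MIRROR);
 *	ASSERT3P(ops, ==, &vdev_mirror_ops);
 *	ops = vdev_getops("no-such-type");
 *	ASSERT3P(ops, ==, NULL);
 */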
258
259/*
260 * Given a vdev and a metaslab class, find which metaslab group we're
261 * interested in. All vdevs may belong to two different metaslab classes.
262 * Dedicated slog devices use only the primary metaslab group, rather than a
263 * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL.
264 */
265metaslab_group_t *
266vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
267{
268	if (mc == spa_embedded_log_class(vd->vdev_spa) &&
269	    vd->vdev_log_mg != NULL)
270		return (vd->vdev_log_mg);
271	else
272		return (vd->vdev_mg);
273}
274
275void
276vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
277    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
278{
279	(void) vd, (void) remain_rs;
280
281	physical_rs->rs_start = logical_rs->rs_start;
282	physical_rs->rs_end = logical_rs->rs_end;
283}
284
285/*
286 * Derive the enumerated allocation bias from string input.
287 * String origin is either the per-vdev zap or zpool(8).
288 */
289static vdev_alloc_bias_t
290vdev_derive_alloc_bias(const char *bias)
291{
292	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
293
294	if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
295		alloc_bias = VDEV_BIAS_LOG;
296	else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
297		alloc_bias = VDEV_BIAS_SPECIAL;
298	else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
299		alloc_bias = VDEV_BIAS_DEDUP;
300
301	return (alloc_bias);
302}
303
304/*
305 * Default asize function: return the MAX of psize with the asize of
306 * all children.  This is what's used by anything other than RAID-Z.
307 */
308uint64_t
309vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
310{
311	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
312	uint64_t csize;
313
314	for (int c = 0; c < vd->vdev_children; c++) {
315		csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg);
316		asize = MAX(asize, csize);
317	}
318
319	return (asize);
320}
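
/*
 * Worked example: with a top-level ashift of 12 (4K sectors), a psize of
 * 5000 bytes rounds up to an asize of 8192.  For an interior vdev such as
 * a mirror, the result is the maximum of that rounding and each child's
 * answer, so the most conservative (largest) allocation size wins.
 */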
321
322uint64_t
323vdev_default_min_asize(vdev_t *vd)
324{
325	return (vd->vdev_min_asize);
326}
327
328/*
329 * Get the minimum allocatable size. We define the allocatable size as
330 * the vdev's asize rounded to the nearest metaslab. This allows us to
331 * replace or attach devices which don't have the same physical size but
332 * can still satisfy the same number of allocations.
333 */
334uint64_t
335vdev_get_min_asize(vdev_t *vd)
336{
337	vdev_t *pvd = vd->vdev_parent;
338
339	/*
340	 * If our parent is NULL (inactive spare or cache) or is the root,
341	 * just return our own asize.
342	 */
343	if (pvd == NULL)
344		return (vd->vdev_asize);
345
346	/*
347	 * The top-level vdev just returns the allocatable size rounded
348	 * to the nearest metaslab.
349	 */
350	if (vd == vd->vdev_top)
351		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
352
353	return (pvd->vdev_ops->vdev_op_min_asize(pvd));
354}
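
/*
 * Worked example: a top-level vdev with 512 MiB metaslabs (ms_shift 29)
 * and an asize of 100 GiB plus 300 MiB has its min_asize rounded down to
 * exactly 100 GiB (200 whole metaslabs); a replacement device only has to
 * cover that aligned size, not the full asize of the original.
 */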
355
356void
357vdev_set_min_asize(vdev_t *vd)
358{
359	vd->vdev_min_asize = vdev_get_min_asize(vd);
360
361	for (int c = 0; c < vd->vdev_children; c++)
362		vdev_set_min_asize(vd->vdev_child[c]);
363}
364
365/*
366 * Get the minimal allocation size for the top-level vdev.
367 */
368uint64_t
369vdev_get_min_alloc(vdev_t *vd)
370{
371	uint64_t min_alloc = 1ULL << vd->vdev_ashift;
372
373	if (vd->vdev_ops->vdev_op_min_alloc != NULL)
374		min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
375
376	return (min_alloc);
377}
378
379/*
380 * Get the parity level for a top-level vdev.
381 */
382uint64_t
383vdev_get_nparity(vdev_t *vd)
384{
385	uint64_t nparity = 0;
386
387	if (vd->vdev_ops->vdev_op_nparity != NULL)
388		nparity = vd->vdev_ops->vdev_op_nparity(vd);
389
390	return (nparity);
391}
392
393static int
394vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value)
395{
396	spa_t *spa = vd->vdev_spa;
397	objset_t *mos = spa->spa_meta_objset;
398	uint64_t objid;
399	int err;
400
401	if (vd->vdev_root_zap != 0) {
402		objid = vd->vdev_root_zap;
403	} else if (vd->vdev_top_zap != 0) {
404		objid = vd->vdev_top_zap;
405	} else if (vd->vdev_leaf_zap != 0) {
406		objid = vd->vdev_leaf_zap;
407	} else {
408		return (EINVAL);
409	}
410
411	err = zap_lookup(mos, objid, vdev_prop_to_name(prop),
412	    sizeof (uint64_t), 1, value);
413
414	if (err == ENOENT)
415		*value = vdev_prop_default_numeric(prop);
416
417	return (err);
418}
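
/*
 * Usage sketch (illustrative caller, error handling elided): the per-vdev
 * ZED tuning knobs initialized in vdev_alloc_common() below can be
 * refreshed from the MOS like this, falling back to the property default
 * when the ZAP has no explicit entry:
 *
 *	uint64_t checksum_n;
 *	int err = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N, &checksum_n);
 *	if (err == 0 || err == ENOENT)
 *		vd->vdev_checksum_n = checksum_n;
 */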
419
420/*
421 * Get the number of data disks for a top-level vdev.
422 */
423uint64_t
424vdev_get_ndisks(vdev_t *vd)
425{
426	uint64_t ndisks = 1;
427
428	if (vd->vdev_ops->vdev_op_ndisks != NULL)
429		ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
430
431	return (ndisks);
432}
433
434vdev_t *
435vdev_lookup_top(spa_t *spa, uint64_t vdev)
436{
437	vdev_t *rvd = spa->spa_root_vdev;
438
439	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
440
441	if (vdev < rvd->vdev_children) {
442		ASSERT(rvd->vdev_child[vdev] != NULL);
443		return (rvd->vdev_child[vdev]);
444	}
445
446	return (NULL);
447}
448
449vdev_t *
450vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
451{
452	vdev_t *mvd;
453
454	if (vd->vdev_guid == guid)
455		return (vd);
456
457	for (int c = 0; c < vd->vdev_children; c++)
458		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
459		    NULL)
460			return (mvd);
461
462	return (NULL);
463}
464
465static int
466vdev_count_leaves_impl(vdev_t *vd)
467{
468	int n = 0;
469
470	if (vd->vdev_ops->vdev_op_leaf)
471		return (1);
472
473	for (int c = 0; c < vd->vdev_children; c++)
474		n += vdev_count_leaves_impl(vd->vdev_child[c]);
475
476	return (n);
477}
478
479int
480vdev_count_leaves(spa_t *spa)
481{
482	int rc;
483
484	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
485	rc = vdev_count_leaves_impl(spa->spa_root_vdev);
486	spa_config_exit(spa, SCL_VDEV, FTAG);
487
488	return (rc);
489}
490
491void
492vdev_add_child(vdev_t *pvd, vdev_t *cvd)
493{
494	size_t oldsize, newsize;
495	uint64_t id = cvd->vdev_id;
496	vdev_t **newchild;
497
498	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
499	ASSERT(cvd->vdev_parent == NULL);
500
501	cvd->vdev_parent = pvd;
502
503	if (pvd == NULL)
504		return;
505
506	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
507
508	oldsize = pvd->vdev_children * sizeof (vdev_t *);
509	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
510	newsize = pvd->vdev_children * sizeof (vdev_t *);
511
512	newchild = kmem_alloc(newsize, KM_SLEEP);
513	if (pvd->vdev_child != NULL) {
514		memcpy(newchild, pvd->vdev_child, oldsize);
515		kmem_free(pvd->vdev_child, oldsize);
516	}
517
518	pvd->vdev_child = newchild;
519	pvd->vdev_child[id] = cvd;
520
521	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
522	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
523
524	/*
525	 * Walk up all ancestors to update guid sum.
526	 */
527	for (; pvd != NULL; pvd = pvd->vdev_parent)
528		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
529
530	if (cvd->vdev_ops->vdev_op_leaf) {
531		list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
532		cvd->vdev_spa->spa_leaf_list_gen++;
533	}
534}
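
/*
 * The guid-sum walk above maintains the invariant that every vdev's
 * vdev_guid_sum equals its own guid plus the guid sums of all of its
 * children.  For example, adding a leaf with guid G under a mirror adds G
 * to the mirror's sum, to the root's sum, and ultimately to the pool-wide
 * sum that is compared against the uberblock's ub_guid_sum, which is how
 * a missing or foreign device is detected at import time.
 */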
535
536void
537vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
538{
539	int c;
540	uint_t id = cvd->vdev_id;
541
542	ASSERT(cvd->vdev_parent == pvd);
543
544	if (pvd == NULL)
545		return;
546
547	ASSERT(id < pvd->vdev_children);
548	ASSERT(pvd->vdev_child[id] == cvd);
549
550	pvd->vdev_child[id] = NULL;
551	cvd->vdev_parent = NULL;
552
553	for (c = 0; c < pvd->vdev_children; c++)
554		if (pvd->vdev_child[c])
555			break;
556
557	if (c == pvd->vdev_children) {
558		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
559		pvd->vdev_child = NULL;
560		pvd->vdev_children = 0;
561	}
562
563	if (cvd->vdev_ops->vdev_op_leaf) {
564		spa_t *spa = cvd->vdev_spa;
565		list_remove(&spa->spa_leaf_list, cvd);
566		spa->spa_leaf_list_gen++;
567	}
568
569	/*
570	 * Walk up all ancestors to update guid sum.
571	 */
572	for (; pvd != NULL; pvd = pvd->vdev_parent)
573		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
574}
575
576/*
577 * Remove any holes in the child array.
578 */
579void
580vdev_compact_children(vdev_t *pvd)
581{
582	vdev_t **newchild, *cvd;
583	int oldc = pvd->vdev_children;
584	int newc;
585
586	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
587
588	if (oldc == 0)
589		return;
590
591	for (int c = newc = 0; c < oldc; c++)
592		if (pvd->vdev_child[c])
593			newc++;
594
595	if (newc > 0) {
596		newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP);
597
598		for (int c = newc = 0; c < oldc; c++) {
599			if ((cvd = pvd->vdev_child[c]) != NULL) {
600				newchild[newc] = cvd;
601				cvd->vdev_id = newc++;
602			}
603		}
604	} else {
605		newchild = NULL;
606	}
607
608	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
609	pvd->vdev_child = newchild;
610	pvd->vdev_children = newc;
611}
612
613/*
614 * Allocate and minimally initialize a vdev_t.
615 */
616vdev_t *
617vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
618{
619	vdev_t *vd;
620	vdev_indirect_config_t *vic;
621
622	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
623	vic = &vd->vdev_indirect_config;
624
625	if (spa->spa_root_vdev == NULL) {
626		ASSERT(ops == &vdev_root_ops);
627		spa->spa_root_vdev = vd;
628		spa->spa_load_guid = spa_generate_guid(NULL);
629	}
630
631	if (guid == 0 && ops != &vdev_hole_ops) {
632		if (spa->spa_root_vdev == vd) {
633			/*
634			 * The root vdev's guid will also be the pool guid,
635			 * which must be unique among all pools.
636			 */
637			guid = spa_generate_guid(NULL);
638		} else {
639			/*
640			 * Any other vdev's guid must be unique within the pool.
641			 */
642			guid = spa_generate_guid(spa);
643		}
644		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
645	}
646
647	vd->vdev_spa = spa;
648	vd->vdev_id = id;
649	vd->vdev_guid = guid;
650	vd->vdev_guid_sum = guid;
651	vd->vdev_ops = ops;
652	vd->vdev_state = VDEV_STATE_CLOSED;
653	vd->vdev_ishole = (ops == &vdev_hole_ops);
654	vic->vic_prev_indirect_vdev = UINT64_MAX;
655
656	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
657	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
658	vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL,
659	    0, 0);
660
661	/*
662	 * Initialize rate limit structs for events.  We rate limit ZIO delay
663	 * and checksum events so that we don't overwhelm ZED with thousands
664	 * of events when a disk is acting up.
665	 */
666	zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
667	    1);
668	zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second,
669	    1);
670	zfs_ratelimit_init(&vd->vdev_checksum_rl,
671	    &zfs_checksum_events_per_second, 1);
672
673	/*
	 * Default thresholds for tuning ZED
675	 */
676	vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N);
677	vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);
678	vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
679	vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);
680	vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
681	vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
682
683	list_link_init(&vd->vdev_config_dirty_node);
684	list_link_init(&vd->vdev_state_dirty_node);
685	list_link_init(&vd->vdev_initialize_node);
686	list_link_init(&vd->vdev_leaf_node);
687	list_link_init(&vd->vdev_trim_node);
688
689	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
690	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
691	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
692	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
693
694	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
695	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
696	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
697	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
698
699	mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
700	mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
701	mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
702	cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
703	cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
704	cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL);
705	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
706
707	mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
708	cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
709
710	for (int t = 0; t < DTL_TYPES; t++) {
711		vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
712		    0);
713	}
714
715	txg_list_create(&vd->vdev_ms_list, spa,
716	    offsetof(struct metaslab, ms_txg_node));
717	txg_list_create(&vd->vdev_dtl_list, spa,
718	    offsetof(struct vdev, vdev_dtl_node));
719	vd->vdev_stat.vs_timestamp = gethrtime();
720	vdev_queue_init(vd);
721
722	return (vd);
723}
724
725/*
726 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
727 * creating a new vdev or loading an existing one - the behavior is slightly
728 * different for each case.
729 */
730int
731vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
732    int alloctype)
733{
734	vdev_ops_t *ops;
735	const char *type;
736	uint64_t guid = 0, islog;
737	vdev_t *vd;
738	vdev_indirect_config_t *vic;
739	const char *tmp = NULL;
740	int rc;
741	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
742	boolean_t top_level = (parent && !parent->vdev_parent);
743
744	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
745
746	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
747		return (SET_ERROR(EINVAL));
748
749	if ((ops = vdev_getops(type)) == NULL)
750		return (SET_ERROR(EINVAL));
751
752	/*
753	 * If this is a load, get the vdev guid from the nvlist.
754	 * Otherwise, vdev_alloc_common() will generate one for us.
755	 */
756	if (alloctype == VDEV_ALLOC_LOAD) {
757		uint64_t label_id;
758
759		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
760		    label_id != id)
761			return (SET_ERROR(EINVAL));
762
763		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
764			return (SET_ERROR(EINVAL));
765	} else if (alloctype == VDEV_ALLOC_SPARE) {
766		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
767			return (SET_ERROR(EINVAL));
768	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
769		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
770			return (SET_ERROR(EINVAL));
771	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
772		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
773			return (SET_ERROR(EINVAL));
774	}
775
776	/*
777	 * The first allocated vdev must be of type 'root'.
778	 */
779	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
780		return (SET_ERROR(EINVAL));
781
782	/*
783	 * Determine whether we're a log vdev.
784	 */
785	islog = 0;
786	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
787	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
788		return (SET_ERROR(ENOTSUP));
789
790	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
791		return (SET_ERROR(ENOTSUP));
792
793	if (top_level && alloctype == VDEV_ALLOC_ADD) {
794		const char *bias;
795
796		/*
797		 * If creating a top-level vdev, check for allocation
798		 * classes input.
799		 */
800		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
801		    &bias) == 0) {
802			alloc_bias = vdev_derive_alloc_bias(bias);
803
804			/* spa_vdev_add() expects feature to be enabled */
805			if (spa->spa_load_state != SPA_LOAD_CREATE &&
806			    !spa_feature_is_enabled(spa,
807			    SPA_FEATURE_ALLOCATION_CLASSES)) {
808				return (SET_ERROR(ENOTSUP));
809			}
810		}
811
812		/* spa_vdev_add() expects feature to be enabled */
813		if (ops == &vdev_draid_ops &&
814		    spa->spa_load_state != SPA_LOAD_CREATE &&
815		    !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
816			return (SET_ERROR(ENOTSUP));
817		}
818	}
819
820	/*
821	 * Initialize the vdev specific data.  This is done before calling
822	 * vdev_alloc_common() since it may fail and this simplifies the
823	 * error reporting and cleanup code paths.
824	 */
825	void *tsd = NULL;
826	if (ops->vdev_op_init != NULL) {
827		rc = ops->vdev_op_init(spa, nv, &tsd);
828		if (rc != 0) {
829			return (rc);
830		}
831	}
832
833	vd = vdev_alloc_common(spa, id, guid, ops);
834	vd->vdev_tsd = tsd;
835	vd->vdev_islog = islog;
836
837	if (top_level && alloc_bias != VDEV_BIAS_NONE)
838		vd->vdev_alloc_bias = alloc_bias;
839
840	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0)
841		vd->vdev_path = spa_strdup(tmp);
842
843	/*
844	 * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a
845	 * fault on a vdev and want it to persist across imports (like with
846	 * zpool offline -f).
847	 */
848	rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp);
849	if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) {
850		vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
851		vd->vdev_faulted = 1;
852		vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
853	}
854
855	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0)
856		vd->vdev_devid = spa_strdup(tmp);
857	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0)
858		vd->vdev_physpath = spa_strdup(tmp);
859
860	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
861	    &tmp) == 0)
862		vd->vdev_enc_sysfs_path = spa_strdup(tmp);
863
864	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0)
865		vd->vdev_fru = spa_strdup(tmp);
866
867	/*
	 * Set the whole_disk property.  If it's not specified, set the value
	 * to -1.
870	 */
871	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
872	    &vd->vdev_wholedisk) != 0)
873		vd->vdev_wholedisk = -1ULL;
874
875	vic = &vd->vdev_indirect_config;
876
877	ASSERT0(vic->vic_mapping_object);
878	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
879	    &vic->vic_mapping_object);
880	ASSERT0(vic->vic_births_object);
881	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
882	    &vic->vic_births_object);
883	ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
884	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
885	    &vic->vic_prev_indirect_vdev);
886
887	/*
888	 * Look for the 'not present' flag.  This will only be set if the device
889	 * was not present at the time of import.
890	 */
891	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
892	    &vd->vdev_not_present);
893
894	/*
895	 * Get the alignment requirement. Ignore pool ashift for vdev
896	 * attach case.
897	 */
898	if (alloctype != VDEV_ALLOC_ATTACH) {
899		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
900		    &vd->vdev_ashift);
901	} else {
902		vd->vdev_attaching = B_TRUE;
903	}
904
905	/*
906	 * Retrieve the vdev creation time.
907	 */
908	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
909	    &vd->vdev_crtxg);
910
911	if (vd->vdev_ops == &vdev_root_ops &&
912	    (alloctype == VDEV_ALLOC_LOAD ||
913	    alloctype == VDEV_ALLOC_SPLIT ||
914	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
915		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP,
916		    &vd->vdev_root_zap);
917	}
918
919	/*
920	 * If we're a top-level vdev, try to load the allocation parameters.
921	 */
922	if (top_level &&
923	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
924		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
925		    &vd->vdev_ms_array);
926		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
927		    &vd->vdev_ms_shift);
928		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
929		    &vd->vdev_asize);
930		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
931		    &vd->vdev_noalloc);
932		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
933		    &vd->vdev_removing);
934		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
935		    &vd->vdev_top_zap);
936		vd->vdev_rz_expanding = nvlist_exists(nv,
937		    ZPOOL_CONFIG_RAIDZ_EXPANDING);
938	} else {
939		ASSERT0(vd->vdev_top_zap);
940	}
941
942	if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
943		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
944		    alloctype == VDEV_ALLOC_ADD ||
945		    alloctype == VDEV_ALLOC_SPLIT ||
946		    alloctype == VDEV_ALLOC_ROOTPOOL);
947		/* Note: metaslab_group_create() is now deferred */
948	}
949
950	if (vd->vdev_ops->vdev_op_leaf &&
951	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
952		(void) nvlist_lookup_uint64(nv,
953		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
954	} else {
955		ASSERT0(vd->vdev_leaf_zap);
956	}
957
958	/*
959	 * If we're a leaf vdev, try to load the DTL object and other state.
960	 */
961
962	if (vd->vdev_ops->vdev_op_leaf &&
963	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
964	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
965		if (alloctype == VDEV_ALLOC_LOAD) {
966			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
967			    &vd->vdev_dtl_object);
968			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
969			    &vd->vdev_unspare);
970		}
971
972		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
973			uint64_t spare = 0;
974
975			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
976			    &spare) == 0 && spare)
977				spa_spare_add(vd);
978		}
979
980		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
981		    &vd->vdev_offline);
982
983		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
984		    &vd->vdev_resilver_txg);
985
986		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
987		    &vd->vdev_rebuild_txg);
988
989		if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
990			vdev_defer_resilver(vd);
991
992		/*
993		 * In general, when importing a pool we want to ignore the
994		 * persistent fault state, as the diagnosis made on another
995		 * system may not be valid in the current context.  The only
996		 * exception is if we forced a vdev to a persistently faulted
997		 * state with 'zpool offline -f'.  The persistent fault will
998		 * remain across imports until cleared.
999		 *
1000		 * Local vdevs will remain in the faulted state.
1001		 */
1002		if (spa_load_state(spa) == SPA_LOAD_OPEN ||
1003		    spa_load_state(spa) == SPA_LOAD_IMPORT) {
1004			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
1005			    &vd->vdev_faulted);
1006			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
1007			    &vd->vdev_degraded);
1008			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
1009			    &vd->vdev_removed);
1010
1011			if (vd->vdev_faulted || vd->vdev_degraded) {
1012				const char *aux;
1013
1014				vd->vdev_label_aux =
1015				    VDEV_AUX_ERR_EXCEEDED;
1016				if (nvlist_lookup_string(nv,
1017				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
1018				    strcmp(aux, "external") == 0)
1019					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
1020				else
1021					vd->vdev_faulted = 0ULL;
1022			}
1023		}
1024	}
1025
1026	/*
1027	 * Add ourselves to the parent's list of children.
1028	 */
1029	vdev_add_child(parent, vd);
1030
1031	*vdp = vd;
1032
1033	return (0);
1034}
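
/*
 * Usage sketch (illustrative only; real configs are built by the zpool
 * command and parsed via spa_vdev_add()): a minimal nvlist for allocating
 * a new leaf disk under an existing top-level vdev 'tvd' could look like
 * this, where "/dev/sdX1" is a hypothetical device path:
 *
 *	nvlist_t *nv = fnvlist_alloc();
 *	fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
 *	fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, "/dev/sdX1");
 *
 *	vdev_t *vd;
 *	int error = vdev_alloc(spa, &vd, nv, tvd, tvd->vdev_children,
 *	    VDEV_ALLOC_ADD);
 *	fnvlist_free(nv);
 */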
1035
1036void
1037vdev_free(vdev_t *vd)
1038{
1039	spa_t *spa = vd->vdev_spa;
1040
1041	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
1042	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
1043	ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
1044	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
1045
1046	/*
1047	 * Scan queues are normally destroyed at the end of a scan. If the
1048	 * queue exists here, that implies the vdev is being removed while
1049	 * the scan is still running.
1050	 */
1051	if (vd->vdev_scan_io_queue != NULL) {
1052		mutex_enter(&vd->vdev_scan_io_queue_lock);
1053		dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
1054		vd->vdev_scan_io_queue = NULL;
1055		mutex_exit(&vd->vdev_scan_io_queue_lock);
1056	}
1057
1058	/*
1059	 * vdev_free() implies closing the vdev first.  This is simpler than
1060	 * trying to ensure complicated semantics for all callers.
1061	 */
1062	vdev_close(vd);
1063
1064	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
1065	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
1066
1067	/*
1068	 * Free all children.
1069	 */
1070	for (int c = 0; c < vd->vdev_children; c++)
1071		vdev_free(vd->vdev_child[c]);
1072
1073	ASSERT(vd->vdev_child == NULL);
1074	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
1075
1076	if (vd->vdev_ops->vdev_op_fini != NULL)
1077		vd->vdev_ops->vdev_op_fini(vd);
1078
1079	/*
1080	 * Discard allocation state.
1081	 */
1082	if (vd->vdev_mg != NULL) {
1083		vdev_metaslab_fini(vd);
1084		metaslab_group_destroy(vd->vdev_mg);
1085		vd->vdev_mg = NULL;
1086	}
1087	if (vd->vdev_log_mg != NULL) {
1088		ASSERT0(vd->vdev_ms_count);
1089		metaslab_group_destroy(vd->vdev_log_mg);
1090		vd->vdev_log_mg = NULL;
1091	}
1092
1093	ASSERT0(vd->vdev_stat.vs_space);
1094	ASSERT0(vd->vdev_stat.vs_dspace);
1095	ASSERT0(vd->vdev_stat.vs_alloc);
1096
1097	/*
1098	 * Remove this vdev from its parent's child list.
1099	 */
1100	vdev_remove_child(vd->vdev_parent, vd);
1101
1102	ASSERT(vd->vdev_parent == NULL);
1103	ASSERT(!list_link_active(&vd->vdev_leaf_node));
1104
1105	/*
1106	 * Clean up vdev structure.
1107	 */
1108	vdev_queue_fini(vd);
1109
1110	if (vd->vdev_path)
1111		spa_strfree(vd->vdev_path);
1112	if (vd->vdev_devid)
1113		spa_strfree(vd->vdev_devid);
1114	if (vd->vdev_physpath)
1115		spa_strfree(vd->vdev_physpath);
1116
1117	if (vd->vdev_enc_sysfs_path)
1118		spa_strfree(vd->vdev_enc_sysfs_path);
1119
1120	if (vd->vdev_fru)
1121		spa_strfree(vd->vdev_fru);
1122
1123	if (vd->vdev_isspare)
1124		spa_spare_remove(vd);
1125	if (vd->vdev_isl2cache)
1126		spa_l2cache_remove(vd);
1127
1128	txg_list_destroy(&vd->vdev_ms_list);
1129	txg_list_destroy(&vd->vdev_dtl_list);
1130
1131	mutex_enter(&vd->vdev_dtl_lock);
1132	space_map_close(vd->vdev_dtl_sm);
1133	for (int t = 0; t < DTL_TYPES; t++) {
1134		range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
1135		range_tree_destroy(vd->vdev_dtl[t]);
1136	}
1137	mutex_exit(&vd->vdev_dtl_lock);
1138
1139	EQUIV(vd->vdev_indirect_births != NULL,
1140	    vd->vdev_indirect_mapping != NULL);
1141	if (vd->vdev_indirect_births != NULL) {
1142		vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
1143		vdev_indirect_births_close(vd->vdev_indirect_births);
1144	}
1145
1146	if (vd->vdev_obsolete_sm != NULL) {
1147		ASSERT(vd->vdev_removing ||
1148		    vd->vdev_ops == &vdev_indirect_ops);
1149		space_map_close(vd->vdev_obsolete_sm);
1150		vd->vdev_obsolete_sm = NULL;
1151	}
1152	range_tree_destroy(vd->vdev_obsolete_segments);
1153	rw_destroy(&vd->vdev_indirect_rwlock);
1154	mutex_destroy(&vd->vdev_obsolete_lock);
1155
1156	mutex_destroy(&vd->vdev_dtl_lock);
1157	mutex_destroy(&vd->vdev_stat_lock);
1158	mutex_destroy(&vd->vdev_probe_lock);
1159	mutex_destroy(&vd->vdev_scan_io_queue_lock);
1160
1161	mutex_destroy(&vd->vdev_initialize_lock);
1162	mutex_destroy(&vd->vdev_initialize_io_lock);
1163	cv_destroy(&vd->vdev_initialize_io_cv);
1164	cv_destroy(&vd->vdev_initialize_cv);
1165
1166	mutex_destroy(&vd->vdev_trim_lock);
1167	mutex_destroy(&vd->vdev_autotrim_lock);
1168	mutex_destroy(&vd->vdev_trim_io_lock);
1169	cv_destroy(&vd->vdev_trim_cv);
1170	cv_destroy(&vd->vdev_autotrim_cv);
1171	cv_destroy(&vd->vdev_autotrim_kick_cv);
1172	cv_destroy(&vd->vdev_trim_io_cv);
1173
1174	mutex_destroy(&vd->vdev_rebuild_lock);
1175	cv_destroy(&vd->vdev_rebuild_cv);
1176
1177	zfs_ratelimit_fini(&vd->vdev_delay_rl);
1178	zfs_ratelimit_fini(&vd->vdev_deadman_rl);
1179	zfs_ratelimit_fini(&vd->vdev_checksum_rl);
1180
1181	if (vd == spa->spa_root_vdev)
1182		spa->spa_root_vdev = NULL;
1183
1184	kmem_free(vd, sizeof (vdev_t));
1185}
1186
1187/*
1188 * Transfer top-level vdev state from svd to tvd.
1189 */
1190static void
1191vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
1192{
1193	spa_t *spa = svd->vdev_spa;
1194	metaslab_t *msp;
1195	vdev_t *vd;
1196	int t;
1197
1198	ASSERT(tvd == tvd->vdev_top);
1199
1200	tvd->vdev_ms_array = svd->vdev_ms_array;
1201	tvd->vdev_ms_shift = svd->vdev_ms_shift;
1202	tvd->vdev_ms_count = svd->vdev_ms_count;
1203	tvd->vdev_top_zap = svd->vdev_top_zap;
1204
1205	svd->vdev_ms_array = 0;
1206	svd->vdev_ms_shift = 0;
1207	svd->vdev_ms_count = 0;
1208	svd->vdev_top_zap = 0;
1209
1210	if (tvd->vdev_mg)
1211		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
1212	if (tvd->vdev_log_mg)
1213		ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
1214	tvd->vdev_mg = svd->vdev_mg;
1215	tvd->vdev_log_mg = svd->vdev_log_mg;
1216	tvd->vdev_ms = svd->vdev_ms;
1217
1218	svd->vdev_mg = NULL;
1219	svd->vdev_log_mg = NULL;
1220	svd->vdev_ms = NULL;
1221
1222	if (tvd->vdev_mg != NULL)
1223		tvd->vdev_mg->mg_vd = tvd;
1224	if (tvd->vdev_log_mg != NULL)
1225		tvd->vdev_log_mg->mg_vd = tvd;
1226
1227	tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
1228	svd->vdev_checkpoint_sm = NULL;
1229
1230	tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
1231	svd->vdev_alloc_bias = VDEV_BIAS_NONE;
1232
1233	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
1234	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
1235	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
1236
1237	svd->vdev_stat.vs_alloc = 0;
1238	svd->vdev_stat.vs_space = 0;
1239	svd->vdev_stat.vs_dspace = 0;
1240
1241	/*
1242	 * State which may be set on a top-level vdev that's in the
1243	 * process of being removed.
1244	 */
1245	ASSERT0(tvd->vdev_indirect_config.vic_births_object);
1246	ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
1247	ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
1248	ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
1249	ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
1250	ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
1251	ASSERT0(tvd->vdev_noalloc);
1252	ASSERT0(tvd->vdev_removing);
1253	ASSERT0(tvd->vdev_rebuilding);
1254	tvd->vdev_noalloc = svd->vdev_noalloc;
1255	tvd->vdev_removing = svd->vdev_removing;
1256	tvd->vdev_rebuilding = svd->vdev_rebuilding;
1257	tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
1258	tvd->vdev_indirect_config = svd->vdev_indirect_config;
1259	tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
1260	tvd->vdev_indirect_births = svd->vdev_indirect_births;
1261	range_tree_swap(&svd->vdev_obsolete_segments,
1262	    &tvd->vdev_obsolete_segments);
1263	tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
1264	svd->vdev_indirect_config.vic_mapping_object = 0;
1265	svd->vdev_indirect_config.vic_births_object = 0;
1266	svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
1267	svd->vdev_indirect_mapping = NULL;
1268	svd->vdev_indirect_births = NULL;
1269	svd->vdev_obsolete_sm = NULL;
1270	svd->vdev_noalloc = 0;
1271	svd->vdev_removing = 0;
1272	svd->vdev_rebuilding = 0;
1273
1274	for (t = 0; t < TXG_SIZE; t++) {
1275		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
1276			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
1277		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
1278			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
1279		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
1280			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
1281	}
1282
1283	if (list_link_active(&svd->vdev_config_dirty_node)) {
1284		vdev_config_clean(svd);
1285		vdev_config_dirty(tvd);
1286	}
1287
1288	if (list_link_active(&svd->vdev_state_dirty_node)) {
1289		vdev_state_clean(svd);
1290		vdev_state_dirty(tvd);
1291	}
1292
1293	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
1294	svd->vdev_deflate_ratio = 0;
1295
1296	tvd->vdev_islog = svd->vdev_islog;
1297	svd->vdev_islog = 0;
1298
1299	dsl_scan_io_queue_vdev_xfer(svd, tvd);
1300}
1301
1302static void
1303vdev_top_update(vdev_t *tvd, vdev_t *vd)
1304{
1305	if (vd == NULL)
1306		return;
1307
1308	vd->vdev_top = tvd;
1309
1310	for (int c = 0; c < vd->vdev_children; c++)
1311		vdev_top_update(tvd, vd->vdev_child[c]);
1312}
1313
1314/*
1315 * Add a mirror/replacing vdev above an existing vdev.  There is no need to
1316 * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
1317 */
1318vdev_t *
1319vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
1320{
1321	spa_t *spa = cvd->vdev_spa;
1322	vdev_t *pvd = cvd->vdev_parent;
1323	vdev_t *mvd;
1324
1325	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1326
1327	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
1328
1329	mvd->vdev_asize = cvd->vdev_asize;
1330	mvd->vdev_min_asize = cvd->vdev_min_asize;
1331	mvd->vdev_max_asize = cvd->vdev_max_asize;
1332	mvd->vdev_psize = cvd->vdev_psize;
1333	mvd->vdev_ashift = cvd->vdev_ashift;
1334	mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
1335	mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
1336	mvd->vdev_state = cvd->vdev_state;
1337	mvd->vdev_crtxg = cvd->vdev_crtxg;
1338
1339	vdev_remove_child(pvd, cvd);
1340	vdev_add_child(pvd, mvd);
1341	cvd->vdev_id = mvd->vdev_children;
1342	vdev_add_child(mvd, cvd);
1343	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
1344
1345	if (mvd == mvd->vdev_top)
1346		vdev_top_transfer(cvd, mvd);
1347
1348	return (mvd);
1349}
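
/*
 * For example, when spa_vdev_attach() attaches a new disk D2 to an
 * existing single-disk top-level vdev D1, it uses this function to insert
 * a mirror/replacing vdev above D1 (sketch):
 *
 *	before:  root                after:  root
 *	           |                           |
 *	          D1                     mirror/replacing
 *	                                    /        \
 *	                                  D1          D2
 *
 * The new interior vdev takes D1's vdev_id and, when D1 was top-level,
 * becomes the new top-level vdev, so vdev_top_transfer() above moves the
 * metaslab state from D1 to it.
 */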
1350
1351/*
1352 * Remove a 1-way mirror/replacing vdev from the tree.
1353 */
1354void
1355vdev_remove_parent(vdev_t *cvd)
1356{
1357	vdev_t *mvd = cvd->vdev_parent;
1358	vdev_t *pvd = mvd->vdev_parent;
1359
1360	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1361
1362	ASSERT(mvd->vdev_children == 1);
1363	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
1364	    mvd->vdev_ops == &vdev_replacing_ops ||
1365	    mvd->vdev_ops == &vdev_spare_ops);
1366	cvd->vdev_ashift = mvd->vdev_ashift;
1367	cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
1368	cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
1369	vdev_remove_child(mvd, cvd);
1370	vdev_remove_child(pvd, mvd);
1371
1372	/*
1373	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
1374	 * Otherwise, we could have detached an offline device, and when we
1375	 * go to import the pool we'll think we have two top-level vdevs,
1376	 * instead of a different version of the same top-level vdev.
1377	 */
1378	if (mvd->vdev_top == mvd) {
1379		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
1380		cvd->vdev_orig_guid = cvd->vdev_guid;
1381		cvd->vdev_guid += guid_delta;
1382		cvd->vdev_guid_sum += guid_delta;
1383
1384		/*
1385		 * If pool not set for autoexpand, we need to also preserve
1386		 * mvd's asize to prevent automatic expansion of cvd.
1387		 * Otherwise if we are adjusting the mirror by attaching and
1388		 * detaching children of non-uniform sizes, the mirror could
1389		 * autoexpand, unexpectedly requiring larger devices to
1390		 * re-establish the mirror.
1391		 */
1392		if (!cvd->vdev_spa->spa_autoexpand)
1393			cvd->vdev_asize = mvd->vdev_asize;
1394	}
1395	cvd->vdev_id = mvd->vdev_id;
1396	vdev_add_child(pvd, cvd);
1397	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
1398
1399	if (cvd == cvd->vdev_top)
1400		vdev_top_transfer(mvd, cvd);
1401
1402	ASSERT(mvd->vdev_children == 0);
1403	vdev_free(mvd);
1404}
1405
1406/*
1407 * Choose GCD for spa_gcd_alloc.
1408 */
1409static uint64_t
1410vdev_gcd(uint64_t a, uint64_t b)
1411{
1412	while (b != 0) {
1413		uint64_t t = b;
1414		b = a % b;
1415		a = t;
1416	}
1417	return (a);
1418}
1419
1420/*
1421 * Set spa_min_alloc and spa_gcd_alloc.
1422 */
1423static void
1424vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
1425{
1426	if (min_alloc < spa->spa_min_alloc)
1427		spa->spa_min_alloc = min_alloc;
1428	if (spa->spa_gcd_alloc == INT_MAX) {
1429		spa->spa_gcd_alloc = min_alloc;
1430	} else {
1431		spa->spa_gcd_alloc = vdev_gcd(min_alloc,
1432		    spa->spa_gcd_alloc);
1433	}
1434}
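
/*
 * Worked example: a pool with two top-level vdevs whose minimum
 * allocations are 4K and 12K ends up with spa_min_alloc = 4K and
 * spa_gcd_alloc = vdev_gcd(12K, 4K) = 4K.  If the second vdev instead had
 * a 6K minimum, the GCD would drop to 2K.
 */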
1435
1436void
1437vdev_metaslab_group_create(vdev_t *vd)
1438{
1439	spa_t *spa = vd->vdev_spa;
1440
1441	/*
1442	 * metaslab_group_create was delayed until allocation bias was available
1443	 */
1444	if (vd->vdev_mg == NULL) {
1445		metaslab_class_t *mc;
1446
1447		if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
1448			vd->vdev_alloc_bias = VDEV_BIAS_LOG;
1449
1450		ASSERT3U(vd->vdev_islog, ==,
1451		    (vd->vdev_alloc_bias == VDEV_BIAS_LOG));
1452
1453		switch (vd->vdev_alloc_bias) {
1454		case VDEV_BIAS_LOG:
1455			mc = spa_log_class(spa);
1456			break;
1457		case VDEV_BIAS_SPECIAL:
1458			mc = spa_special_class(spa);
1459			break;
1460		case VDEV_BIAS_DEDUP:
1461			mc = spa_dedup_class(spa);
1462			break;
1463		default:
1464			mc = spa_normal_class(spa);
1465		}
1466
1467		vd->vdev_mg = metaslab_group_create(mc, vd,
1468		    spa->spa_alloc_count);
1469
1470		if (!vd->vdev_islog) {
1471			vd->vdev_log_mg = metaslab_group_create(
1472			    spa_embedded_log_class(spa), vd, 1);
1473		}
1474
1475		/*
1476		 * The spa ashift min/max only apply for the normal metaslab
		 * class.  The class destination is late-binding, so setting the
		 * ashift boundaries had to wait until now.
1479		 */
1480		if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
1481		    mc == spa_normal_class(spa) && vd->vdev_aux == NULL) {
1482			if (vd->vdev_ashift > spa->spa_max_ashift)
1483				spa->spa_max_ashift = vd->vdev_ashift;
1484			if (vd->vdev_ashift < spa->spa_min_ashift)
1485				spa->spa_min_ashift = vd->vdev_ashift;
1486
1487			uint64_t min_alloc = vdev_get_min_alloc(vd);
1488			vdev_spa_set_alloc(spa, min_alloc);
1489		}
1490	}
1491}
1492
1493int
1494vdev_metaslab_init(vdev_t *vd, uint64_t txg)
1495{
1496	spa_t *spa = vd->vdev_spa;
1497	uint64_t oldc = vd->vdev_ms_count;
1498	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
1499	metaslab_t **mspp;
1500	int error;
1501	boolean_t expanding = (oldc != 0);
1502
1503	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1504
1505	/*
1506	 * This vdev is not being allocated from yet or is a hole.
1507	 */
1508	if (vd->vdev_ms_shift == 0)
1509		return (0);
1510
1511	ASSERT(!vd->vdev_ishole);
1512
1513	ASSERT(oldc <= newc);
1514
1515	mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
1516
1517	if (expanding) {
1518		memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp));
1519		vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
1520	}
1521
1522	vd->vdev_ms = mspp;
1523	vd->vdev_ms_count = newc;
1524
1525	for (uint64_t m = oldc; m < newc; m++) {
1526		uint64_t object = 0;
1527		/*
1528		 * vdev_ms_array may be 0 if we are creating the "fake"
1529		 * metaslabs for an indirect vdev for zdb's leak detection.
1530		 * See zdb_leak_init().
1531		 */
1532		if (txg == 0 && vd->vdev_ms_array != 0) {
1533			error = dmu_read(spa->spa_meta_objset,
1534			    vd->vdev_ms_array,
1535			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
1536			    DMU_READ_PREFETCH);
1537			if (error != 0) {
1538				vdev_dbgmsg(vd, "unable to read the metaslab "
1539				    "array [error=%d]", error);
1540				return (error);
1541			}
1542		}
1543
1544		error = metaslab_init(vd->vdev_mg, m, object, txg,
1545		    &(vd->vdev_ms[m]));
1546		if (error != 0) {
1547			vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
1548			    error);
1549			return (error);
1550		}
1551	}
1552
1553	/*
1554	 * Find the emptiest metaslab on the vdev and mark it for use for
1555	 * embedded slog by moving it from the regular to the log metaslab
1556	 * group.
1557	 */
1558	if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
1559	    vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
1560	    avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
1561		uint64_t slog_msid = 0;
1562		uint64_t smallest = UINT64_MAX;
1563
1564		/*
1565		 * Note, we only search the new metaslabs, because the old
1566		 * (pre-existing) ones may be active (e.g. have non-empty
1567		 * range_tree's), and we don't move them to the new
1568		 * metaslab_t.
1569		 */
1570		for (uint64_t m = oldc; m < newc; m++) {
1571			uint64_t alloc =
1572			    space_map_allocated(vd->vdev_ms[m]->ms_sm);
1573			if (alloc < smallest) {
1574				slog_msid = m;
1575				smallest = alloc;
1576			}
1577		}
1578		metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
1579		/*
1580		 * The metaslab was marked as dirty at the end of
1581		 * metaslab_init(). Remove it from the dirty list so that we
1582		 * can uninitialize and reinitialize it to the new class.
1583		 */
1584		if (txg != 0) {
1585			(void) txg_list_remove_this(&vd->vdev_ms_list,
1586			    slog_ms, txg);
1587		}
1588		uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
1589		metaslab_fini(slog_ms);
1590		VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
1591		    &vd->vdev_ms[slog_msid]));
1592	}
1593
1594	if (txg == 0)
1595		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
1596
1597	/*
1598	 * If the vdev is marked as non-allocating then don't
1599	 * activate the metaslabs since we want to ensure that
1600	 * no allocations are performed on this device.
1601	 */
1602	if (vd->vdev_noalloc) {
1603		/* track non-allocating vdev space */
1604		spa->spa_nonallocating_dspace += spa_deflate(spa) ?
1605		    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
1606	} else if (!expanding) {
1607		metaslab_group_activate(vd->vdev_mg);
1608		if (vd->vdev_log_mg != NULL)
1609			metaslab_group_activate(vd->vdev_log_mg);
1610	}
1611
1612	if (txg == 0)
1613		spa_config_exit(spa, SCL_ALLOC, FTAG);
1614
1615	return (0);
1616}
1617
1618void
1619vdev_metaslab_fini(vdev_t *vd)
1620{
1621	if (vd->vdev_checkpoint_sm != NULL) {
1622		ASSERT(spa_feature_is_active(vd->vdev_spa,
1623		    SPA_FEATURE_POOL_CHECKPOINT));
1624		space_map_close(vd->vdev_checkpoint_sm);
1625		/*
1626		 * Even though we close the space map, we need to set its
1627		 * pointer to NULL. The reason is that vdev_metaslab_fini()
1628		 * may be called multiple times for certain operations
		 * (e.g. when destroying a pool) so we need to ensure that
1630		 * this clause never executes twice. This logic is similar
1631		 * to the one used for the vdev_ms clause below.
1632		 */
1633		vd->vdev_checkpoint_sm = NULL;
1634	}
1635
1636	if (vd->vdev_ms != NULL) {
1637		metaslab_group_t *mg = vd->vdev_mg;
1638
1639		metaslab_group_passivate(mg);
1640		if (vd->vdev_log_mg != NULL) {
1641			ASSERT(!vd->vdev_islog);
1642			metaslab_group_passivate(vd->vdev_log_mg);
1643		}
1644
1645		uint64_t count = vd->vdev_ms_count;
1646		for (uint64_t m = 0; m < count; m++) {
1647			metaslab_t *msp = vd->vdev_ms[m];
1648			if (msp != NULL)
1649				metaslab_fini(msp);
1650		}
1651		vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
1652		vd->vdev_ms = NULL;
1653		vd->vdev_ms_count = 0;
1654
1655		for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
1656			ASSERT0(mg->mg_histogram[i]);
1657			if (vd->vdev_log_mg != NULL)
1658				ASSERT0(vd->vdev_log_mg->mg_histogram[i]);
1659		}
1660	}
1661	ASSERT0(vd->vdev_ms_count);
1662}
1663
1664typedef struct vdev_probe_stats {
1665	boolean_t	vps_readable;
1666	boolean_t	vps_writeable;
1667	boolean_t	vps_zio_done_probe;
1668	int		vps_flags;
1669} vdev_probe_stats_t;
1670
1671static void
1672vdev_probe_done(zio_t *zio)
1673{
1674	spa_t *spa = zio->io_spa;
1675	vdev_t *vd = zio->io_vd;
1676	vdev_probe_stats_t *vps = zio->io_private;
1677
1678	ASSERT(vd->vdev_probe_zio != NULL);
1679
1680	if (zio->io_type == ZIO_TYPE_READ) {
1681		if (zio->io_error == 0)
1682			vps->vps_readable = 1;
1683		if (zio->io_error == 0 && spa_writeable(spa)) {
1684			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
1685			    zio->io_offset, zio->io_size, zio->io_abd,
1686			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
1687			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
1688		} else {
1689			abd_free(zio->io_abd);
1690		}
1691	} else if (zio->io_type == ZIO_TYPE_WRITE) {
1692		if (zio->io_error == 0)
1693			vps->vps_writeable = 1;
1694		abd_free(zio->io_abd);
1695	} else if (zio->io_type == ZIO_TYPE_NULL) {
1696		zio_t *pio;
1697		zio_link_t *zl;
1698
1699		vd->vdev_cant_read |= !vps->vps_readable;
1700		vd->vdev_cant_write |= !vps->vps_writeable;
1701		vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u",
1702		    vd->vdev_cant_read, vd->vdev_cant_write);
1703
1704		if (vdev_readable(vd) &&
1705		    (vdev_writeable(vd) || !spa_writeable(spa))) {
1706			zio->io_error = 0;
1707		} else {
1708			ASSERT(zio->io_error != 0);
1709			vdev_dbgmsg(vd, "failed probe");
1710			(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
1711			    spa, vd, NULL, NULL, 0);
1712			zio->io_error = SET_ERROR(ENXIO);
1713
1714			/*
			 * If this probe was initiated from the zio pipeline, then
1716			 * change the state in a spa_async_request. Probes that
1717			 * were initiated from a vdev_open can change the state
1718			 * as part of the open call.
1719			 */
1720			if (vps->vps_zio_done_probe) {
1721				vd->vdev_fault_wanted = B_TRUE;
1722				spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
1723			}
1724		}
1725
1726		mutex_enter(&vd->vdev_probe_lock);
1727		ASSERT(vd->vdev_probe_zio == zio);
1728		vd->vdev_probe_zio = NULL;
1729		mutex_exit(&vd->vdev_probe_lock);
1730
1731		zl = NULL;
1732		while ((pio = zio_walk_parents(zio, &zl)) != NULL)
1733			if (!vdev_accessible(vd, pio))
1734				pio->io_error = SET_ERROR(ENXIO);
1735
1736		kmem_free(vps, sizeof (*vps));
1737	}
1738}
1739
1740/*
1741 * Determine whether this device is accessible.
1742 *
1743 * Read and write to several known locations: the pad regions of each
1744 * vdev label but the first, which we leave alone in case it contains
1745 * a VTOC.
1746 */
1747zio_t *
1748vdev_probe(vdev_t *vd, zio_t *zio)
1749{
1750	spa_t *spa = vd->vdev_spa;
1751	vdev_probe_stats_t *vps = NULL;
1752	zio_t *pio;
1753
1754	ASSERT(vd->vdev_ops->vdev_op_leaf);
1755
1756	/*
1757	 * Don't probe the probe.
1758	 */
1759	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
1760		return (NULL);
1761
1762	/*
1763	 * To prevent 'probe storms' when a device fails, we create
1764	 * just one probe i/o at a time.  All zios that want to probe
1765	 * this vdev will become parents of the probe io.
1766	 */
1767	mutex_enter(&vd->vdev_probe_lock);
1768
1769	if ((pio = vd->vdev_probe_zio) == NULL) {
1770		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
1771
1772		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
1773		    ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
1774		vps->vps_zio_done_probe = (zio != NULL);
1775
1776		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
1777			/*
1778			 * vdev_cant_read and vdev_cant_write can only
1779			 * transition from TRUE to FALSE when we have the
1780			 * SCL_ZIO lock as writer; otherwise they can only
1781			 * transition from FALSE to TRUE.  This ensures that
1782			 * any zio looking at these values can assume that
1783			 * failures persist for the life of the I/O.  That's
1784			 * important because when a device has intermittent
1785			 * connectivity problems, we want to ensure that
1786			 * they're ascribed to the device (ENXIO) and not
1787			 * the zio (EIO).
1788			 *
1789			 * Since we hold SCL_ZIO as writer here, clear both
1790			 * values so the probe can reevaluate from first
1791			 * principles.
1792			 */
1793			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
1794			vd->vdev_cant_read = B_FALSE;
1795			vd->vdev_cant_write = B_FALSE;
1796		}
1797
1798		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
1799		    vdev_probe_done, vps,
1800		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
1801	}
1802
1803	if (zio != NULL)
1804		zio_add_child(zio, pio);
1805
1806	mutex_exit(&vd->vdev_probe_lock);
1807
1808	if (vps == NULL) {
1809		ASSERT(zio != NULL);
1810		return (NULL);
1811	}
1812
1813	for (int l = 1; l < VDEV_LABELS; l++) {
1814		zio_nowait(zio_read_phys(pio, vd,
1815		    vdev_label_offset(vd->vdev_psize, l,
1816		    offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE,
1817		    abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
1818		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
1819		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
1820	}
1821
1822	if (zio == NULL)
1823		return (pio);
1824
1825	zio_nowait(pio);
1826	return (NULL);
1827}
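
/*
 * Usage sketch (illustrative): callers either piggyback on an existing
 * zio, in which case the probe becomes a child of that zio and NULL is
 * returned, or pass a NULL zio and wait on the returned probe themselves,
 * e.g.:
 *
 *	if (zio_wait(vdev_probe(vd, NULL)) != 0)
 *		return (SET_ERROR(ENXIO));
 */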
1828
1829static void
1830vdev_load_child(void *arg)
1831{
1832	vdev_t *vd = arg;
1833
1834	vd->vdev_load_error = vdev_load(vd);
1835}
1836
1837static void
1838vdev_open_child(void *arg)
1839{
1840	vdev_t *vd = arg;
1841
1842	vd->vdev_open_thread = curthread;
1843	vd->vdev_open_error = vdev_open(vd);
1844	vd->vdev_open_thread = NULL;
1845}
1846
1847static boolean_t
1848vdev_uses_zvols(vdev_t *vd)
1849{
1850#ifdef _KERNEL
1851	if (zvol_is_zvol(vd->vdev_path))
1852		return (B_TRUE);
1853#endif
1854
1855	for (int c = 0; c < vd->vdev_children; c++)
1856		if (vdev_uses_zvols(vd->vdev_child[c]))
1857			return (B_TRUE);
1858
1859	return (B_FALSE);
1860}
1861
1862/*
1863 * Returns B_TRUE if the passed child should be opened.
1864 */
1865static boolean_t
1866vdev_default_open_children_func(vdev_t *vd)
1867{
1868	(void) vd;
1869	return (B_TRUE);
1870}
1871
1872/*
1873 * Open the requested child vdevs.  If any of the leaf vdevs are using
1874 * a ZFS volume then do the opens in a single thread.  This avoids a
1875 * deadlock when the current thread is holding the spa_namespace_lock.
1876 */
1877static void
1878vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
1879{
1880	int children = vd->vdev_children;
1881
1882	taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
1883	    children, children, TASKQ_PREPOPULATE);
1884	vd->vdev_nonrot = B_TRUE;
1885
1886	for (int c = 0; c < children; c++) {
1887		vdev_t *cvd = vd->vdev_child[c];
1888
1889		if (open_func(cvd) == B_FALSE)
1890			continue;
1891
1892		if (tq == NULL || vdev_uses_zvols(vd)) {
1893			cvd->vdev_open_error = vdev_open(cvd);
1894		} else {
1895			VERIFY(taskq_dispatch(tq, vdev_open_child,
1896			    cvd, TQ_SLEEP) != TASKQID_INVALID);
1897		}
1898
1899		vd->vdev_nonrot &= cvd->vdev_nonrot;
1900	}
1901
1902	if (tq != NULL) {
1903		taskq_wait(tq);
1904		taskq_destroy(tq);
1905	}
1906}
1907
1908/*
1909 * Open all child vdevs.
1910 */
1911void
1912vdev_open_children(vdev_t *vd)
1913{
1914	vdev_open_children_impl(vd, vdev_default_open_children_func);
1915}
1916
1917/*
1918 * Conditionally open a subset of child vdevs.
1919 */
1920void
1921vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
1922{
1923	vdev_open_children_impl(vd, open_func);
1924}
1925
1926/*
1927 * Compute the raidz-deflation ratio.  Note, we hard-code 128k (1 << 17)
1928 * because it is the "typical" blocksize.  Even though SPA_MAXBLOCKSIZE
1929 * has changed, this algorithm cannot change; otherwise it would
1930 * inconsistently account for existing bp's.  We also hard-code txg 0 for
1931 * the same reason, since expanded RAIDZ vdevs can use a different asize
1932 * for different birth txgs.
1933 */
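/*
 * As an illustration, on a plain disk or mirror where a 128k psize maps to
 * a 128k asize, the ratio works out to (1 << 17) / (131072 >> 9) == 512;
 * raidz vdevs return a larger asize for the same psize (parity and padding),
 * so their ratio comes out smaller.
 */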
1934static void
1935vdev_set_deflate_ratio(vdev_t *vd)
1936{
1937	if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
1938		vd->vdev_deflate_ratio = (1 << 17) /
1939		    (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >>
1940		    SPA_MINBLOCKSHIFT);
1941	}
1942}
1943
1944/*
1945 * Choose the better of two ashifts, preferring one that falls between the
1946 * logical ashift (the absolute minimum) and the administrator-defined
1947 * maximum; otherwise take the larger of the two.
1948 */
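/*
 * For example, assuming a logical ashift of 9 and a zfs_vdev_max_auto_ashift
 * of 14 (illustrative values only):
 *   vdev_best_ashift(9, 12, 13) == 13  (both are in range, take the larger)
 *   vdev_best_ashift(9, 12, 16) == 12  (16 exceeds the administrator maximum)
 *   vdev_best_ashift(9, 16, 18) == 18  (neither is in range, take the larger)
 */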
1949uint64_t
1950vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b)
1951{
1952	if (a > logical && a <= zfs_vdev_max_auto_ashift) {
1953		if (b <= logical || b > zfs_vdev_max_auto_ashift)
1954			return (a);
1955		else
1956			return (MAX(a, b));
1957	} else if (b <= logical || b > zfs_vdev_max_auto_ashift)
1958		return (MAX(a, b));
1959	return (b);
1960}
1961
1962/*
1963 * Maximize performance by inflating the configured ashift for top-level
1964 * vdevs to be as close to the physical ashift as possible while maintaining
1965 * administrator-defined limits and ensuring it doesn't go below the
1966 * logical ashift.
1967 */
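/*
 * For example, with zfs_vdev_min_auto_ashift == 9 and
 * zfs_vdev_max_auto_ashift == 14 (illustrative defaults), a top-level vdev
 * reporting ashift 9 with a physical ashift of 12 is bumped to 12, while
 * one whose physical ashift is 16 is left at its logical ashift because
 * the physical value exceeds the administrator-defined maximum.
 */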
1968static void
1969vdev_ashift_optimize(vdev_t *vd)
1970{
1971	ASSERT(vd == vd->vdev_top);
1972
1973	if (vd->vdev_ashift < vd->vdev_physical_ashift &&
1974	    vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) {
1975		vd->vdev_ashift = MIN(
1976		    MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
1977		    MAX(zfs_vdev_min_auto_ashift,
1978		    vd->vdev_physical_ashift));
1979	} else {
1980		/*
1981		 * If the logical and physical ashifts are the same, then
1982		 * we ensure that the top-level vdev's ashift is not smaller
1983		 * than our minimum ashift value. For the unusual case
1984		 * where logical ashift > physical ashift, we can't cap
1985		 * the calculated ashift based on max ashift as that
1986		 * would cause failures.
1987		 * We still check if we need to increase it to match
1988		 * the min ashift.
1989		 */
1990		vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
1991		    vd->vdev_ashift);
1992	}
1993}
1994
1995/*
1996 * Prepare a virtual device for access.
1997 */
1998int
1999vdev_open(vdev_t *vd)
2000{
2001	spa_t *spa = vd->vdev_spa;
2002	int error;
2003	uint64_t osize = 0;
2004	uint64_t max_osize = 0;
2005	uint64_t asize, max_asize, psize;
2006	uint64_t logical_ashift = 0;
2007	uint64_t physical_ashift = 0;
2008
2009	ASSERT(vd->vdev_open_thread == curthread ||
2010	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2011	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
2012	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
2013	    vd->vdev_state == VDEV_STATE_OFFLINE);
2014
2015	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
2016	vd->vdev_cant_read = B_FALSE;
2017	vd->vdev_cant_write = B_FALSE;
2018	vd->vdev_min_asize = vdev_get_min_asize(vd);
2019
2020	/*
2021	 * If this vdev is not removed, check its fault status.  If it's
2022	 * faulted, bail out of the open.
2023	 */
2024	if (!vd->vdev_removed && vd->vdev_faulted) {
2025		ASSERT(vd->vdev_children == 0);
2026		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
2027		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
2028		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
2029		    vd->vdev_label_aux);
2030		return (SET_ERROR(ENXIO));
2031	} else if (vd->vdev_offline) {
2032		ASSERT(vd->vdev_children == 0);
2033		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
2034		return (SET_ERROR(ENXIO));
2035	}
2036
2037	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
2038	    &logical_ashift, &physical_ashift);
2039
2040	/* Keep the device in removed state if unplugged */
2041	if (error == ENOENT && vd->vdev_removed) {
2042		vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED,
2043		    VDEV_AUX_NONE);
2044		return (error);
2045	}
2046
2047	/*
2048	 * The physical volume size should never be larger than its max size,
2049	 * unless the disk has shrunk while we were reading it or the device is
2050	 * buggy or damaged.  Either way it's not safe, so bail out of the open.
2051	 */
2052	if (osize > max_osize) {
2053		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2054		    VDEV_AUX_OPEN_FAILED);
2055		return (SET_ERROR(ENXIO));
2056	}
2057
2058	/*
2059	 * Reset the vdev_reopening flag so that we actually close
2060	 * the vdev on error.
2061	 */
2062	vd->vdev_reopening = B_FALSE;
2063	if (zio_injection_enabled && error == 0)
2064		error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO));
2065
2066	if (error) {
2067		if (vd->vdev_removed &&
2068		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
2069			vd->vdev_removed = B_FALSE;
2070
2071		if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
2072			vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
2073			    vd->vdev_stat.vs_aux);
2074		} else {
2075			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2076			    vd->vdev_stat.vs_aux);
2077		}
2078		return (error);
2079	}
2080
2081	vd->vdev_removed = B_FALSE;
2082
2083	/*
2084	 * Recheck the faulted flag now that we have confirmed that
2085	 * the vdev is accessible.  If we're faulted, bail.
2086	 */
2087	if (vd->vdev_faulted) {
2088		ASSERT(vd->vdev_children == 0);
2089		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
2090		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
2091		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
2092		    vd->vdev_label_aux);
2093		return (SET_ERROR(ENXIO));
2094	}
2095
2096	if (vd->vdev_degraded) {
2097		ASSERT(vd->vdev_children == 0);
2098		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
2099		    VDEV_AUX_ERR_EXCEEDED);
2100	} else {
2101		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
2102	}
2103
2104	/*
2105	 * For hole or missing vdevs we just return success.
2106	 */
2107	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
2108		return (0);
2109
2110	for (int c = 0; c < vd->vdev_children; c++) {
2111		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
2112			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
2113			    VDEV_AUX_NONE);
2114			break;
2115		}
2116	}
2117
2118	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
2119	max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
2120
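	/*
	 * For a leaf, the allocatable size is the device size minus the space
	 * reserved for labels: VDEV_LABEL_START_SIZE covers the two front
	 * labels plus the boot region (roughly 4 MiB with the current label
	 * format), and VDEV_LABEL_END_SIZE covers the two back labels
	 * (roughly 512 KiB).
	 */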
2121	if (vd->vdev_children == 0) {
2122		if (osize < SPA_MINDEVSIZE) {
2123			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2124			    VDEV_AUX_TOO_SMALL);
2125			return (SET_ERROR(EOVERFLOW));
2126		}
2127		psize = osize;
2128		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
2129		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
2130		    VDEV_LABEL_END_SIZE);
2131	} else {
2132		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
2133		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
2134			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2135			    VDEV_AUX_TOO_SMALL);
2136			return (SET_ERROR(EOVERFLOW));
2137		}
2138		psize = 0;
2139		asize = osize;
2140		max_asize = max_osize;
2141	}
2142
2143	/*
2144	 * If the vdev was expanded, record this so that we can re-create the
2145	 * uberblock rings in labels {2,3} during the next sync.
2146	 */
2147	if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0))
2148		vd->vdev_copy_uberblocks = B_TRUE;
2149
2150	vd->vdev_psize = psize;
2151
2152	/*
2153	 * Make sure the allocatable size hasn't shrunk too much.
2154	 */
2155	if (asize < vd->vdev_min_asize) {
2156		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2157		    VDEV_AUX_BAD_LABEL);
2158		return (SET_ERROR(EINVAL));
2159	}
2160
2161	/*
2162	 * We can always set the logical/physical ashift members since
2163	 * their values are only used to calculate the vdev_ashift when
2164	 * the device is first added to the config. These values should
2165	 * not be used for anything else since they may change whenever
2166	 * the device is reopened and we don't store them in the label.
2167	 */
2168	vd->vdev_physical_ashift =
2169	    MAX(physical_ashift, vd->vdev_physical_ashift);
2170	vd->vdev_logical_ashift = MAX(logical_ashift,
2171	    vd->vdev_logical_ashift);
2172
2173	if (vd->vdev_asize == 0) {
2174		/*
2175		 * This is the first-ever open, so use the computed values.
2176		 * For compatibility, a different ashift can be requested.
2177		 */
2178		vd->vdev_asize = asize;
2179		vd->vdev_max_asize = max_asize;
2180
2181		/*
2182		 * If the vdev_ashift was not overridden at creation time,
2183		 * then set it to the logical ashift and optimize it.
2184		 */
2185		if (vd->vdev_ashift == 0) {
2186			vd->vdev_ashift = vd->vdev_logical_ashift;
2187
2188			if (vd->vdev_logical_ashift > ASHIFT_MAX) {
2189				vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2190				    VDEV_AUX_ASHIFT_TOO_BIG);
2191				return (SET_ERROR(EDOM));
2192			}
2193
2194			if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE)
2195				vdev_ashift_optimize(vd);
2196			vd->vdev_attaching = B_FALSE;
2197		}
2198		if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
2199		    vd->vdev_ashift > ASHIFT_MAX)) {
2200			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2201			    VDEV_AUX_BAD_ASHIFT);
2202			return (SET_ERROR(EDOM));
2203		}
2204	} else {
2205		/*
2206		 * Make sure the alignment required hasn't increased.
2207		 */
2208		if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
2209		    vd->vdev_ops->vdev_op_leaf) {
2210			(void) zfs_ereport_post(
2211			    FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
2212			    spa, vd, NULL, NULL, 0);
2213			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2214			    VDEV_AUX_BAD_LABEL);
2215			return (SET_ERROR(EDOM));
2216		}
2217		vd->vdev_max_asize = max_asize;
2218	}
2219
2220	/*
2221	 * If all children are healthy, we update asize in either of two cases:
2222	 *
2223	 * 1. The asize has increased, due to a device expansion caused by
2224	 *    dynamic LUN growth or a vdev replacement, and automatic expansion
2225	 *    is enabled, making the additional space available.
2226	 *
2227	 * 2. The asize has decreased, due to a device shrink usually caused by
2228	 *    a vdev replace with a smaller device.  This keeps calculations
2229	 *    based on max_asize and asize (e.g. esize) valid.  It's safe to do
2230	 *    this as we've already validated asize against vdev_min_asize.
2231	 */
2232	if (vd->vdev_state == VDEV_STATE_HEALTHY &&
2233	    ((asize > vd->vdev_asize &&
2234	    (vd->vdev_expanding || spa->spa_autoexpand)) ||
2235	    (asize < vd->vdev_asize)))
2236		vd->vdev_asize = asize;
2237
2238	vdev_set_min_asize(vd);
2239
2240	/*
2241	 * Ensure we can issue some IO before declaring the
2242	 * vdev open for business.
2243	 */
2244	if (vd->vdev_ops->vdev_op_leaf &&
2245	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
2246		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
2247		    VDEV_AUX_ERR_EXCEEDED);
2248		return (error);
2249	}
2250
2251	/*
2252	 * Track the minimum allocation size.
2253	 */
2254	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
2255	    vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
2256		uint64_t min_alloc = vdev_get_min_alloc(vd);
2257		vdev_spa_set_alloc(spa, min_alloc);
2258	}
2259
2260	/*
2261	 * If this is a leaf vdev, assess whether a resilver is needed.
2262	 * But don't do this if we are doing a reopen for a scrub, since
2263	 * this would just restart the scrub we are already doing.
2264	 */
2265	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen)
2266		dsl_scan_assess_vdev(spa->spa_dsl_pool, vd);
2267
2268	return (0);
2269}
2270
2271static void
2272vdev_validate_child(void *arg)
2273{
2274	vdev_t *vd = arg;
2275
2276	vd->vdev_validate_thread = curthread;
2277	vd->vdev_validate_error = vdev_validate(vd);
2278	vd->vdev_validate_thread = NULL;
2279}
2280
2281/*
2282 * Called once the vdevs are all opened, this routine validates the label
2283 * contents. This needs to be done before vdev_load() so that we don't
2284 * inadvertently do repair I/Os to the wrong device.
2285 *
2286 * This function will only return failure if one of the vdevs indicates that it
2287 * has since been destroyed or exported.  This is only possible if
2288 * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
2289 * will be updated but the function will return 0.
2290 */
2291int
2292vdev_validate(vdev_t *vd)
2293{
2294	spa_t *spa = vd->vdev_spa;
2295	taskq_t *tq = NULL;
2296	nvlist_t *label;
2297	uint64_t guid = 0, aux_guid = 0, top_guid;
2298	uint64_t state;
2299	nvlist_t *nvl;
2300	uint64_t txg;
2301	int children = vd->vdev_children;
2302
2303	if (vdev_validate_skip)
2304		return (0);
2305
2306	if (children > 0) {
2307		tq = taskq_create("vdev_validate", children, minclsyspri,
2308		    children, children, TASKQ_PREPOPULATE);
2309	}
2310
2311	for (uint64_t c = 0; c < children; c++) {
2312		vdev_t *cvd = vd->vdev_child[c];
2313
2314		if (tq == NULL || vdev_uses_zvols(cvd)) {
2315			vdev_validate_child(cvd);
2316		} else {
2317			VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd,
2318			    TQ_SLEEP) != TASKQID_INVALID);
2319		}
2320	}
2321	if (tq != NULL) {
2322		taskq_wait(tq);
2323		taskq_destroy(tq);
2324	}
2325	for (int c = 0; c < children; c++) {
2326		int error = vd->vdev_child[c]->vdev_validate_error;
2327
2328		if (error != 0)
2329			return (SET_ERROR(EBADF));
2330	}
2331
2333	/*
2334	 * If the device has already failed, or was marked offline, don't do
2335	 * any further validation.  Otherwise, label I/O will fail and we will
2336	 * overwrite the previous state.
2337	 */
2338	if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
2339		return (0);
2340
2341	/*
2342	 * If we are performing an extreme rewind, we allow for a label that
2343	 * was modified at a point after the current txg.
2344	 * If the config lock is not held, do not check the txg.  spa_sync could
2345	 * be updating the vdev's label before updating spa_last_synced_txg.
2346	 */
2347	if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
2348	    spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
2349		txg = UINT64_MAX;
2350	else
2351		txg = spa_last_synced_txg(spa);
2352
2353	if ((label = vdev_label_read_config(vd, txg)) == NULL) {
2354		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2355		    VDEV_AUX_BAD_LABEL);
2356		vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
2357		    "txg %llu", (u_longlong_t)txg);
2358		return (0);
2359	}
2360
2361	/*
2362	 * Determine if this vdev has been split off into another
2363	 * pool.  If so, then refuse to open it.
2364	 */
2365	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
2366	    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
2367		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2368		    VDEV_AUX_SPLIT_POOL);
2369		nvlist_free(label);
2370		vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
2371		return (0);
2372	}
2373
2374	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
2375		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2376		    VDEV_AUX_CORRUPT_DATA);
2377		nvlist_free(label);
2378		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2379		    ZPOOL_CONFIG_POOL_GUID);
2380		return (0);
2381	}
2382
2383	/*
2384	 * If config is not trusted then ignore the spa guid check. This is
2385	 * necessary because if the machine crashed during a re-guid the new
2386	 * guid might have been written to all of the vdev labels, but not the
2387	 * cached config. The check will be performed again once we have the
2388	 * trusted config from the MOS.
2389	 */
2390	if (spa->spa_trust_config && guid != spa_guid(spa)) {
2391		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2392		    VDEV_AUX_CORRUPT_DATA);
2393		nvlist_free(label);
2394		vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
2395		    "match config (%llu != %llu)", (u_longlong_t)guid,
2396		    (u_longlong_t)spa_guid(spa));
2397		return (0);
2398	}
2399
2400	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
2401	    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
2402	    &aux_guid) != 0)
2403		aux_guid = 0;
2404
2405	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
2406		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2407		    VDEV_AUX_CORRUPT_DATA);
2408		nvlist_free(label);
2409		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2410		    ZPOOL_CONFIG_GUID);
2411		return (0);
2412	}
2413
2414	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
2415	    != 0) {
2416		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2417		    VDEV_AUX_CORRUPT_DATA);
2418		nvlist_free(label);
2419		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2420		    ZPOOL_CONFIG_TOP_GUID);
2421		return (0);
2422	}
2423
2424	/*
2425	 * If this vdev just became a top-level vdev because its sibling was
2426	 * detached, it will have adopted the parent's vdev guid -- but the
2427	 * label may or may not be on disk yet. Fortunately, either version
2428	 * of the label will have the same top guid, so if we're a top-level
2429	 * vdev, we can safely compare to that instead.
2430	 * However, if the config comes from a cachefile that failed to update
2431	 * after the detach, a top-level vdev will appear as a non top-level
2432	 * vdev in the config. Also relax the constraints if we perform an
2433	 * extreme rewind.
2434	 *
2435	 * If we split this vdev off instead, then we also check the
2436	 * original pool's guid. We don't want to consider the vdev
2437	 * corrupt if it is partway through a split operation.
2438	 */
2439	if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
2440		boolean_t mismatch = B_FALSE;
2441		if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
2442			if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
2443				mismatch = B_TRUE;
2444		} else {
2445			if (vd->vdev_guid != top_guid &&
2446			    vd->vdev_top->vdev_guid != guid)
2447				mismatch = B_TRUE;
2448		}
2449
2450		if (mismatch) {
2451			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2452			    VDEV_AUX_CORRUPT_DATA);
2453			nvlist_free(label);
2454			vdev_dbgmsg(vd, "vdev_validate: config guid "
2455			    "doesn't match label guid");
2456			vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
2457			    (u_longlong_t)vd->vdev_guid,
2458			    (u_longlong_t)vd->vdev_top->vdev_guid);
2459			vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
2460			    "aux_guid %llu", (u_longlong_t)guid,
2461			    (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
2462			return (0);
2463		}
2464	}
2465
2466	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
2467	    &state) != 0) {
2468		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2469		    VDEV_AUX_CORRUPT_DATA);
2470		nvlist_free(label);
2471		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2472		    ZPOOL_CONFIG_POOL_STATE);
2473		return (0);
2474	}
2475
2476	nvlist_free(label);
2477
2478	/*
2479	 * If this is a verbatim import, no need to check the
2480	 * state of the pool.
2481	 */
2482	if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
2483	    spa_load_state(spa) == SPA_LOAD_OPEN &&
2484	    state != POOL_STATE_ACTIVE) {
2485		vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
2486		    "for spa %s", (u_longlong_t)state, spa->spa_name);
2487		return (SET_ERROR(EBADF));
2488	}
2489
2490	/*
2491	 * If we were able to open and validate a vdev that was
2492	 * previously marked permanently unavailable, clear that state
2493	 * now.
2494	 */
2495	if (vd->vdev_not_present)
2496		vd->vdev_not_present = 0;
2497
2498	return (0);
2499}
2500
2501static void
2502vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid)
2503{
2504	if (svd != NULL && *dvd != NULL) {
2505		if (strcmp(svd, *dvd) != 0) {
2506			zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed "
2507			    "from '%s' to '%s'", (u_longlong_t)guid, prefix,
2508			    *dvd, svd);
2509			spa_strfree(*dvd);
2510			*dvd = spa_strdup(svd);
2511		}
2512	} else if (svd != NULL) {
2513		*dvd = spa_strdup(svd);
2514		zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
2515		    (u_longlong_t)guid, *dvd);
2516	}
2517}
2518
2519static void
2520vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
2521{
2522	char *old, *new;
2523
2524	vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path,
2525	    dvd->vdev_guid);
2526
2527	vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid,
2528	    dvd->vdev_guid);
2529
2530	vdev_update_path("vdev_physpath", svd->vdev_physpath,
2531	    &dvd->vdev_physpath, dvd->vdev_guid);
2532
2533	/*
2534	 * Our enclosure sysfs path may have changed between imports
2535	 */
2536	old = dvd->vdev_enc_sysfs_path;
2537	new = svd->vdev_enc_sysfs_path;
2538	if ((old != NULL && new == NULL) ||
2539	    (old == NULL && new != NULL) ||
2540	    ((old != NULL && new != NULL) && strcmp(new, old) != 0)) {
2541		zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path "
2542		    "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
2543		    old, new);
2544
2545		if (dvd->vdev_enc_sysfs_path)
2546			spa_strfree(dvd->vdev_enc_sysfs_path);
2547
2548		if (svd->vdev_enc_sysfs_path) {
2549			dvd->vdev_enc_sysfs_path = spa_strdup(
2550			    svd->vdev_enc_sysfs_path);
2551		} else {
2552			dvd->vdev_enc_sysfs_path = NULL;
2553		}
2554	}
2555}
2556
2557/*
2558 * Recursively copy vdev paths from one vdev to another.  The source and
2559 * destination vdev trees must have the same geometry; otherwise an error is
2560 * returned.  Intended to copy paths from userland config into MOS config.
2561 */
2562int
2563vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
2564{
2565	if ((svd->vdev_ops == &vdev_missing_ops) ||
2566	    (svd->vdev_ishole && dvd->vdev_ishole) ||
2567	    (dvd->vdev_ops == &vdev_indirect_ops))
2568		return (0);
2569
2570	if (svd->vdev_ops != dvd->vdev_ops) {
2571		vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
2572		    svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
2573		return (SET_ERROR(EINVAL));
2574	}
2575
2576	if (svd->vdev_guid != dvd->vdev_guid) {
2577		vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
2578		    "%llu)", (u_longlong_t)svd->vdev_guid,
2579		    (u_longlong_t)dvd->vdev_guid);
2580		return (SET_ERROR(EINVAL));
2581	}
2582
2583	if (svd->vdev_children != dvd->vdev_children) {
2584		vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
2585		    "%llu != %llu", (u_longlong_t)svd->vdev_children,
2586		    (u_longlong_t)dvd->vdev_children);
2587		return (SET_ERROR(EINVAL));
2588	}
2589
2590	for (uint64_t i = 0; i < svd->vdev_children; i++) {
2591		int error = vdev_copy_path_strict(svd->vdev_child[i],
2592		    dvd->vdev_child[i]);
2593		if (error != 0)
2594			return (error);
2595	}
2596
2597	if (svd->vdev_ops->vdev_op_leaf)
2598		vdev_copy_path_impl(svd, dvd);
2599
2600	return (0);
2601}
2602
2603static void
2604vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
2605{
2606	ASSERT(stvd->vdev_top == stvd);
2607	ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
2608
2609	for (uint64_t i = 0; i < dvd->vdev_children; i++) {
2610		vdev_copy_path_search(stvd, dvd->vdev_child[i]);
2611	}
2612
2613	if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd))
2614		return;
2615
2616	/*
2617	 * The idea here is that while a vdev can shift positions within
2618	 * a top vdev (when replacing, attaching a mirror, etc.), it cannot
2619	 * step outside of it.
2620	 */
2621	vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
2622
2623	if (vd == NULL || vd->vdev_ops != dvd->vdev_ops)
2624		return;
2625
2626	ASSERT(vd->vdev_ops->vdev_op_leaf);
2627
2628	vdev_copy_path_impl(vd, dvd);
2629}
2630
2631/*
2632 * Recursively copy vdev paths from one root vdev to another. Source and
2633 * destination vdev trees may differ in geometry. For each destination leaf
2634 * vdev, search for a vdev with the same guid and top vdev id in the source.
2635 * Intended to copy paths from userland config into MOS config.
2636 */
2637void
2638vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
2639{
2640	uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children);
2641	ASSERT(srvd->vdev_ops == &vdev_root_ops);
2642	ASSERT(drvd->vdev_ops == &vdev_root_ops);
2643
2644	for (uint64_t i = 0; i < children; i++) {
2645		vdev_copy_path_search(srvd->vdev_child[i],
2646		    drvd->vdev_child[i]);
2647	}
2648}
2649
2650/*
2651 * Close a virtual device.
2652 */
2653void
2654vdev_close(vdev_t *vd)
2655{
2656	vdev_t *pvd = vd->vdev_parent;
2657	spa_t *spa __maybe_unused = vd->vdev_spa;
2658
2659	ASSERT(vd != NULL);
2660	ASSERT(vd->vdev_open_thread == curthread ||
2661	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2662
2663	/*
2664	 * If our parent is reopening, then we are as well, unless we are
2665	 * going offline.
2666	 */
2667	if (pvd != NULL && pvd->vdev_reopening)
2668		vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
2669
2670	vd->vdev_ops->vdev_op_close(vd);
2671
2672	/*
2673	 * We record the previous state before we close it, so that if we are
2674	 * doing a reopen(), we don't generate FMA ereports if we notice that
2675	 * it's still faulted.
2676	 */
2677	vd->vdev_prevstate = vd->vdev_state;
2678
2679	if (vd->vdev_offline)
2680		vd->vdev_state = VDEV_STATE_OFFLINE;
2681	else
2682		vd->vdev_state = VDEV_STATE_CLOSED;
2683	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
2684}
2685
2686void
2687vdev_hold(vdev_t *vd)
2688{
2689	spa_t *spa = vd->vdev_spa;
2690
2691	ASSERT(spa_is_root(spa));
2692	if (spa->spa_state == POOL_STATE_UNINITIALIZED)
2693		return;
2694
2695	for (int c = 0; c < vd->vdev_children; c++)
2696		vdev_hold(vd->vdev_child[c]);
2697
2698	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL)
2699		vd->vdev_ops->vdev_op_hold(vd);
2700}
2701
2702void
2703vdev_rele(vdev_t *vd)
2704{
2705	ASSERT(spa_is_root(vd->vdev_spa));
2706	for (int c = 0; c < vd->vdev_children; c++)
2707		vdev_rele(vd->vdev_child[c]);
2708
2709	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL)
2710		vd->vdev_ops->vdev_op_rele(vd);
2711}
2712
2713/*
2714 * Reopen all interior vdevs and any unopened leaves.  We don't actually
2715 * reopen leaf vdevs which had previously been opened as they might deadlock
2716 * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
2717 * If the leaf has never been opened then open it, as usual.
2718 */
2719void
2720vdev_reopen(vdev_t *vd)
2721{
2722	spa_t *spa = vd->vdev_spa;
2723
2724	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2725
2726	/* set the reopening flag unless we're taking the vdev offline */
2727	vd->vdev_reopening = !vd->vdev_offline;
2728	vdev_close(vd);
2729	(void) vdev_open(vd);
2730
2731	/*
2732	 * Call vdev_validate() here to make sure we have the same device.
2733	 * Otherwise, a device with an invalid label could be successfully
2734	 * opened in response to vdev_reopen().
2735	 */
2736	if (vd->vdev_aux) {
2737		(void) vdev_validate_aux(vd);
2738		if (vdev_readable(vd) && vdev_writeable(vd) &&
2739		    vd->vdev_aux == &spa->spa_l2cache) {
2740			/*
2741			 * If the vdev is present, we should evict all ARC
2742			 * buffers and pointers to log blocks and reclaim their
2743			 * space before restoring its contents to L2ARC.
2744			 */
2745			if (l2arc_vdev_present(vd)) {
2746				l2arc_rebuild_vdev(vd, B_TRUE);
2747			} else {
2748				l2arc_add_vdev(spa, vd);
2749			}
2750			spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
2751			spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
2752		}
2753	} else {
2754		(void) vdev_validate(vd);
2755	}
2756
2757	/*
2758	 * Recheck whether a resilver is still needed and cancel any
2759	 * scheduled resilver if it is no longer needed.
2760	 */
2761	if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) &&
2762	    spa->spa_async_tasks & SPA_ASYNC_RESILVER) {
2763		mutex_enter(&spa->spa_async_lock);
2764		spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER;
2765		mutex_exit(&spa->spa_async_lock);
2766	}
2767
2768	/*
2769	 * Reassess parent vdev's health.
2770	 */
2771	vdev_propagate_state(vd);
2772}
2773
2774int
2775vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
2776{
2777	int error;
2778
2779	/*
2780	 * Normally, partial opens (e.g. of a mirror) are allowed.
2781	 * For a create, however, we want to fail the request if
2782	 * there are any components we can't open.
2783	 */
2784	error = vdev_open(vd);
2785
2786	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
2787		vdev_close(vd);
2788		return (error ? error : SET_ERROR(ENXIO));
2789	}
2790
2791	/*
2792	 * Recursively load DTLs and initialize all labels.
2793	 */
2794	if ((error = vdev_dtl_load(vd)) != 0 ||
2795	    (error = vdev_label_init(vd, txg, isreplacing ?
2796	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
2797		vdev_close(vd);
2798		return (error);
2799	}
2800
2801	return (0);
2802}
2803
2804void
2805vdev_metaslab_set_size(vdev_t *vd)
2806{
2807	uint64_t asize = vd->vdev_asize;
2808	uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
2809	uint64_t ms_shift;
2810
2811	/*
2812	 * There are two dimensions to the metaslab sizing calculation:
2813	 * the size of the metaslab and the count of metaslabs per vdev.
2814	 *
2815	 * The default values used below are a good balance between memory
2816	 * usage (larger metaslab size means more memory needed for loaded
2817	 * metaslabs; more metaslabs means more memory needed for the
2818	 * metaslab_t structs), metaslab load time (larger metaslabs take
2819	 * longer to load), and metaslab sync time (more metaslabs means
2820	 * more time spent syncing all of them).
2821	 *
2822	 * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
2823	 * The range of the dimensions are as follows:
2824	 *
2825	 *	2^29 <= ms_size  <= 2^34
2826	 *	  16 <= ms_count <= 131,072
2827	 *
2828	 * On the lower end of vdev sizes, we aim for metaslabs sizes of
2829	 * at least 512MB (2^29) to minimize fragmentation effects when
2830	 * testing with smaller devices.  However, the count constraint
2831	 * of at least 16 metaslabs will override this minimum size goal.
2832	 *
2833	 * On the upper end of vdev sizes, we aim for a maximum metaslab
2834	 * size of 16GB.  However, we will cap the total count to 2^17
2835	 * metaslabs to keep our memory footprint in check and let the
2836	 * metaslab size grow from there if that limit is hit.
2837	 *
2838	 * The net effect of applying the above constraints is summarized below.
2839	 *
2840	 *   vdev size       metaslab count
2841	 *  --------------|-----------------
2842	 *      < 8GB        ~16
2843	 *  8GB   - 100GB   one per 512MB
2844	 *  100GB - 3TB     ~200
2845	 *  3TB   - 2PB     one per 16GB
2846	 *      > 2PB       ~131,072
2847	 *  --------------------------------
2848	 *
2849	 *  Finally, note that all of the above calculations determine the
2850	 *  initial number of metaslabs. Expanding a top-level vdev will result
2851	 *  in additional metaslabs being allocated, making it possible
2852	 *  to exceed the zfs_vdev_ms_count_limit.
2853	 */
2854
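	/*
	 * A worked example, assuming the usual tunable defaults
	 * (zfs_vdev_default_ms_shift == 29, zfs_vdev_max_ms_shift == 34):
	 * for a 1 TiB vdev, asize >> 29 == 2048 exceeds the target of 200,
	 * so ms_shift becomes highbit64(2^40 / 200) == 33, i.e. 128 metaslabs
	 * of 8 GiB each.
	 */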
2855	if (ms_count < zfs_vdev_min_ms_count)
2856		ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
2857	else if (ms_count > zfs_vdev_default_ms_count)
2858		ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
2859	else
2860		ms_shift = zfs_vdev_default_ms_shift;
2861
2862	if (ms_shift < SPA_MAXBLOCKSHIFT) {
2863		ms_shift = SPA_MAXBLOCKSHIFT;
2864	} else if (ms_shift > zfs_vdev_max_ms_shift) {
2865		ms_shift = zfs_vdev_max_ms_shift;
2866		/* cap the total count to constrain memory footprint */
2867		if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
2868			ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
2869	}
2870
2871	vd->vdev_ms_shift = ms_shift;
2872	ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
2873}
2874
2875void
2876vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
2877{
2878	ASSERT(vd == vd->vdev_top);
2879	/* indirect vdevs don't have metaslabs or dtls */
2880	ASSERT(vdev_is_concrete(vd) || flags == 0);
2881	ASSERT(ISP2(flags));
2882	ASSERT(spa_writeable(vd->vdev_spa));
2883
2884	if (flags & VDD_METASLAB)
2885		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
2886
2887	if (flags & VDD_DTL)
2888		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
2889
2890	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
2891}
2892
2893void
2894vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
2895{
2896	for (int c = 0; c < vd->vdev_children; c++)
2897		vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
2898
2899	if (vd->vdev_ops->vdev_op_leaf)
2900		vdev_dirty(vd->vdev_top, flags, vd, txg);
2901}
2902
2903/*
2904 * DTLs.
2905 *
2906 * A vdev's DTL (dirty time log) is the set of transaction groups for which
2907 * the vdev has less than perfect replication.  There are four kinds of DTL:
2908 *
2909 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
2910 *
2911 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
2912 *
2913 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
2914 *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
2915 *	txgs that was scrubbed.
2916 *
2917 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
2918 *	persistent errors or just some device being offline.
2919 *	Unlike the other three, the DTL_OUTAGE map is not generally
2920 *	maintained; it's only computed when needed, typically to
2921 *	determine whether a device can be detached.
2922 *
2923 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
2924 * either has the data or it doesn't.
2925 *
2926 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
2927 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
2928 * if any child is less than fully replicated, then so is its parent.
2929 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
2930 * comprising only those txgs which appear in more than 'maxfaults' children;
2931 * those are the txgs we don't have enough replication to read.  For example,
2932 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
2933 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
2934 * two child DTL_MISSING maps.
2935 *
2936 * It should be clear from the above that to compute the DTLs and outage maps
2937 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
2938 * Therefore, that is all we keep on disk.  When loading the pool, or after
2939 * a configuration change, we generate all other DTLs from first principles.
2940 */
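/*
 * As a further example, a three-way mirror has maxfaults == 2, so a txg
 * lands in the parent's DTL_MISSING only when all three children are
 * missing it; as long as at least one child has the data, the parent can
 * still read it.
 */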
2941void
2942vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
2943{
2944	range_tree_t *rt = vd->vdev_dtl[t];
2945
2946	ASSERT(t < DTL_TYPES);
2947	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
2948	ASSERT(spa_writeable(vd->vdev_spa));
2949
2950	mutex_enter(&vd->vdev_dtl_lock);
2951	if (!range_tree_contains(rt, txg, size))
2952		range_tree_add(rt, txg, size);
2953	mutex_exit(&vd->vdev_dtl_lock);
2954}
2955
2956boolean_t
2957vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
2958{
2959	range_tree_t *rt = vd->vdev_dtl[t];
2960	boolean_t dirty = B_FALSE;
2961
2962	ASSERT(t < DTL_TYPES);
2963	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
2964
2965	/*
2966	 * While we are loading the pool, the DTLs have not been loaded yet.
2967	 * This isn't a problem, but it can result in devices being tried
2968	 * which are known not to have the data; in that case, the import
2969	 * relies on the checksum to ensure that we get the right data.
2970	 * Note that while importing we are only reading the MOS, which is
2971	 * always checksummed.
2972	 */
2973	mutex_enter(&vd->vdev_dtl_lock);
2974	if (!range_tree_is_empty(rt))
2975		dirty = range_tree_contains(rt, txg, size);
2976	mutex_exit(&vd->vdev_dtl_lock);
2977
2978	return (dirty);
2979}
2980
2981boolean_t
2982vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
2983{
2984	range_tree_t *rt = vd->vdev_dtl[t];
2985	boolean_t empty;
2986
2987	mutex_enter(&vd->vdev_dtl_lock);
2988	empty = range_tree_is_empty(rt);
2989	mutex_exit(&vd->vdev_dtl_lock);
2990
2991	return (empty);
2992}
2993
2994/*
2995 * Check if the txg falls within the range which must be
2996 * resilvered.  DVAs outside this range can always be skipped.
2997 */
2998boolean_t
2999vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3000    uint64_t phys_birth)
3001{
3002	(void) dva, (void) psize;
3003
3004	/* Set by sequential resilver. */
3005	if (phys_birth == TXG_UNKNOWN)
3006		return (B_TRUE);
3007
3008	return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
3009}
3010
3011/*
3012 * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
3013 */
3014boolean_t
3015vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3016    uint64_t phys_birth)
3017{
3018	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
3019
3020	if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
3021	    vd->vdev_ops->vdev_op_leaf)
3022		return (B_TRUE);
3023
3024	return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
3025	    phys_birth));
3026}
3027
3028/*
3029 * Returns the lowest txg in the DTL range.
3030 */
3031static uint64_t
3032vdev_dtl_min(vdev_t *vd)
3033{
3034	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
3035	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
3036	ASSERT0(vd->vdev_children);
3037
3038	return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1);
3039}
3040
3041/*
3042 * Returns the highest txg in the DTL.
3043 */
3044static uint64_t
3045vdev_dtl_max(vdev_t *vd)
3046{
3047	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
3048	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
3049	ASSERT0(vd->vdev_children);
3050
3051	return (range_tree_max(vd->vdev_dtl[DTL_MISSING]));
3052}
3053
3054/*
3055 * Determine if a resilvering vdev should remove any DTL entries from
3056 * its range. If the vdev was resilvering for the entire duration of the
3057 * scan then it should excise that range from its DTLs. Otherwise, this
3058 * vdev is considered partially resilvered and should leave its DTL
3059 * entries intact. The comment in vdev_dtl_reassess() describes how we
3060 * excise the DTLs.
3061 */
3062static boolean_t
3063vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
3064{
3065	ASSERT0(vd->vdev_children);
3066
3067	if (vd->vdev_state < VDEV_STATE_DEGRADED)
3068		return (B_FALSE);
3069
3070	if (vd->vdev_resilver_deferred)
3071		return (B_FALSE);
3072
3073	if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
3074		return (B_TRUE);
3075
3076	if (rebuild_done) {
3077		vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
3078		vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
3079
3080		/* Rebuild not initiated by attach */
3081		if (vd->vdev_rebuild_txg == 0)
3082			return (B_TRUE);
3083
3084		/*
3085		 * When a rebuild completes without error then all missing data
3086		 * up to the rebuild max txg has been reconstructed and the DTL
3087		 * is eligible for excision.
3088		 */
3089		if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE &&
3090		    vdev_dtl_max(vd) <= vrp->vrp_max_txg) {
3091			ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd));
3092			ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg);
3093			ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg);
3094			return (B_TRUE);
3095		}
3096	} else {
3097		dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
3098		dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys;
3099
3100		/* Resilver not initiated by attach */
3101		if (vd->vdev_resilver_txg == 0)
3102			return (B_TRUE);
3103
3104		/*
3105		 * When a resilver is initiated the scan will assign the
3106		 * scn_max_txg value to the highest txg value that exists
3107		 * in all DTLs. If this device's max DTL is not part of this
3108		 * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg]
3109		 * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg]),
3110		 * then it is not eligible for excision.
3111		if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
3112			ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd));
3113			ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg);
3114			ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg);
3115			return (B_TRUE);
3116		}
3117	}
3118
3119	return (B_FALSE);
3120}
3121
3122/*
3123 * Reassess DTLs after a config change or scrub completion. If txg == 0 no
3124 * write operations will be issued to the pool.
3125 */
3126void
3127vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
3128    boolean_t scrub_done, boolean_t rebuild_done)
3129{
3130	spa_t *spa = vd->vdev_spa;
3131	avl_tree_t reftree;
3132	int minref;
3133
3134	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
3135
3136	for (int c = 0; c < vd->vdev_children; c++)
3137		vdev_dtl_reassess(vd->vdev_child[c], txg,
3138		    scrub_txg, scrub_done, rebuild_done);
3139
3140	if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
3141		return;
3142
3143	if (vd->vdev_ops->vdev_op_leaf) {
3144		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
3145		vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
3146		boolean_t check_excise = B_FALSE;
3147		boolean_t wasempty = B_TRUE;
3148
3149		mutex_enter(&vd->vdev_dtl_lock);
3150
3151		/*
3152		 * If requested, pretend the scan or rebuild completed cleanly.
3153		 */
3154		if (zfs_scan_ignore_errors) {
3155			if (scn != NULL)
3156				scn->scn_phys.scn_errors = 0;
3157			if (vr != NULL)
3158				vr->vr_rebuild_phys.vrp_errors = 0;
3159		}
3160
3161		if (scrub_txg != 0 &&
3162		    !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
3163			wasempty = B_FALSE;
3164			zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d "
3165			    "dtl:%llu/%llu errors:%llu",
3166			    (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg,
3167			    (u_longlong_t)scrub_txg, spa->spa_scrub_started,
3168			    (u_longlong_t)vdev_dtl_min(vd),
3169			    (u_longlong_t)vdev_dtl_max(vd),
3170			    (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0));
3171		}
3172
3173		/*
3174		 * If we've completed a scrub/resilver or a rebuild cleanly
3175		 * then determine if this vdev should remove any DTLs. We
3176		 * only want to excise regions on vdevs that were available
3177		 * during the entire duration of this scan.
3178		 */
3179		if (rebuild_done &&
3180		    vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) {
3181			check_excise = B_TRUE;
3182		} else {
3183			if (spa->spa_scrub_started ||
3184			    (scn != NULL && scn->scn_phys.scn_errors == 0)) {
3185				check_excise = B_TRUE;
3186			}
3187		}
3188
3189		if (scrub_txg && check_excise &&
3190		    vdev_dtl_should_excise(vd, rebuild_done)) {
3191			/*
3192			 * We completed a scrub, resilver or rebuild up to
3193			 * scrub_txg.  If we did it without rebooting, then
3194			 * the scrub dtl will be valid, so excise the old
3195			 * region and fold in the scrub dtl.  Otherwise,
3196			 * leave the dtl as-is if there was an error.
3197			 *
3198			 * There's a little trick here: to excise the beginning
3199			 * of the DTL_MISSING map, we put it into a reference
3200			 * tree and then add a segment with refcnt -1 that
3201			 * covers the range [0, scrub_txg).  This means
3202			 * that each txg in that range has refcnt -1 or 0.
3203			 * We then add DTL_SCRUB with a refcnt of 2, so that
3204			 * entries in the range [0, scrub_txg) will have a
3205			 * positive refcnt -- either 1 or 2.  We then convert
3206			 * the reference tree into the new DTL_MISSING map.
3207			 */
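			/*
			 * A worked example: if DTL_MISSING covers txgs
			 * [80, 120], scrub_txg is 100, and DTL_SCRUB holds
			 * txg 90, then txgs 80-89 and 91-99 fall to refcnt 0
			 * and are excised, txg 90 ends up at refcnt 2 (still
			 * missing), and txgs 100-120 remain at refcnt 1.
			 */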
3208			space_reftree_create(&reftree);
3209			space_reftree_add_map(&reftree,
3210			    vd->vdev_dtl[DTL_MISSING], 1);
3211			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
3212			space_reftree_add_map(&reftree,
3213			    vd->vdev_dtl[DTL_SCRUB], 2);
3214			space_reftree_generate_map(&reftree,
3215			    vd->vdev_dtl[DTL_MISSING], 1);
3216			space_reftree_destroy(&reftree);
3217
3218			if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
3219				zfs_dbgmsg("update DTL_MISSING:%llu/%llu",
3220				    (u_longlong_t)vdev_dtl_min(vd),
3221				    (u_longlong_t)vdev_dtl_max(vd));
3222			} else if (!wasempty) {
3223				zfs_dbgmsg("DTL_MISSING is now empty");
3224			}
3225		}
3226		range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
3227		range_tree_walk(vd->vdev_dtl[DTL_MISSING],
3228		    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
3229		if (scrub_done)
3230			range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
3231		range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
3232		if (!vdev_readable(vd))
3233			range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
3234		else
3235			range_tree_walk(vd->vdev_dtl[DTL_MISSING],
3236			    range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
3237
3238		/*
3239		 * If the vdev was resilvering or rebuilding and no longer
3240		 * has any DTLs then reset the appropriate flag and dirty
3241		 * the top level so that we persist the change.
3242		 */
3243		if (txg != 0 &&
3244		    range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
3245		    range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
3246			if (vd->vdev_rebuild_txg != 0) {
3247				vd->vdev_rebuild_txg = 0;
3248				vdev_config_dirty(vd->vdev_top);
3249			} else if (vd->vdev_resilver_txg != 0) {
3250				vd->vdev_resilver_txg = 0;
3251				vdev_config_dirty(vd->vdev_top);
3252			}
3253		}
3254
3255		mutex_exit(&vd->vdev_dtl_lock);
3256
3257		if (txg != 0)
3258			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
3259	} else {
3260		mutex_enter(&vd->vdev_dtl_lock);
3261		for (int t = 0; t < DTL_TYPES; t++) {
3262			/* account for child's outage in parent's missing map */
3263			int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
3264			if (t == DTL_SCRUB) {
3265				/* leaf vdevs only */
3266				continue;
3267			}
3268			if (t == DTL_PARTIAL) {
3269				/* i.e. non-zero */
3270				minref = 1;
3271			} else if (vdev_get_nparity(vd) != 0) {
3272				/* RAIDZ, DRAID */
3273				minref = vdev_get_nparity(vd) + 1;
3274			} else {
3275				/* any kind of mirror */
3276				minref = vd->vdev_children;
3277			}
3278			space_reftree_create(&reftree);
3279			for (int c = 0; c < vd->vdev_children; c++) {
3280				vdev_t *cvd = vd->vdev_child[c];
3281				mutex_enter(&cvd->vdev_dtl_lock);
3282				space_reftree_add_map(&reftree,
3283				    cvd->vdev_dtl[s], 1);
3284				mutex_exit(&cvd->vdev_dtl_lock);
3285			}
3286			space_reftree_generate_map(&reftree,
3287			    vd->vdev_dtl[t], minref);
3288			space_reftree_destroy(&reftree);
3289		}
3290		mutex_exit(&vd->vdev_dtl_lock);
3291	}
3292
3293	if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) {
3294		raidz_dtl_reassessed(vd);
3295	}
3296}
3297
3298/*
3299 * Iterate over all the vdevs except spares, and post kobj events
3300 */
3301void
3302vdev_post_kobj_evt(vdev_t *vd)
3303{
3304	if (vd->vdev_ops->vdev_op_kobj_evt_post &&
3305	    vd->vdev_kobj_flag == B_FALSE) {
3306		vd->vdev_kobj_flag = B_TRUE;
3307		vd->vdev_ops->vdev_op_kobj_evt_post(vd);
3308	}
3309
3310	for (int c = 0; c < vd->vdev_children; c++)
3311		vdev_post_kobj_evt(vd->vdev_child[c]);
3312}
3313
3314/*
3315 * Iterate over all the vdevs except spares, and clear kobj events
3316 */
3317void
3318vdev_clear_kobj_evt(vdev_t *vd)
3319{
3320	vd->vdev_kobj_flag = B_FALSE;
3321
3322	for (int c = 0; c < vd->vdev_children; c++)
3323		vdev_clear_kobj_evt(vd->vdev_child[c]);
3324}
3325
3326int
3327vdev_dtl_load(vdev_t *vd)
3328{
3329	spa_t *spa = vd->vdev_spa;
3330	objset_t *mos = spa->spa_meta_objset;
3331	range_tree_t *rt;
3332	int error = 0;
3333
3334	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
3335		ASSERT(vdev_is_concrete(vd));
3336
3337		/*
3338		 * If the dtl cannot be sync'd there is no need to open it.
3339		 */
3340		if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps)
3341			return (0);
3342
3343		error = space_map_open(&vd->vdev_dtl_sm, mos,
3344		    vd->vdev_dtl_object, 0, -1ULL, 0);
3345		if (error)
3346			return (error);
3347		ASSERT(vd->vdev_dtl_sm != NULL);
3348
3349		rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
3350		error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC);
3351		if (error == 0) {
3352			mutex_enter(&vd->vdev_dtl_lock);
3353			range_tree_walk(rt, range_tree_add,
3354			    vd->vdev_dtl[DTL_MISSING]);
3355			mutex_exit(&vd->vdev_dtl_lock);
3356		}
3357
3358		range_tree_vacate(rt, NULL, NULL);
3359		range_tree_destroy(rt);
3360
3361		return (error);
3362	}
3363
3364	for (int c = 0; c < vd->vdev_children; c++) {
3365		error = vdev_dtl_load(vd->vdev_child[c]);
3366		if (error != 0)
3367			break;
3368	}
3369
3370	return (error);
3371}
3372
3373static void
3374vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
3375{
3376	spa_t *spa = vd->vdev_spa;
3377	objset_t *mos = spa->spa_meta_objset;
3378	vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
3379	const char *string;
3380
3381	ASSERT(alloc_bias != VDEV_BIAS_NONE);
3382
3383	string =
3384	    (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG :
3385	    (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
3386	    (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL;
3387
3388	ASSERT(string != NULL);
3389	VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
3390	    1, strlen(string) + 1, string, tx));
3391
3392	if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) {
3393		spa_activate_allocation_classes(spa, tx);
3394	}
3395}
3396
3397void
3398vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
3399{
3400	spa_t *spa = vd->vdev_spa;
3401
3402	VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx));
3403	VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
3404	    zapobj, tx));
3405}
3406
3407uint64_t
3408vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
3409{
3410	spa_t *spa = vd->vdev_spa;
3411	uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA,
3412	    DMU_OT_NONE, 0, tx);
3413
3414	ASSERT(zap != 0);
3415	VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
3416	    zap, tx));
3417
3418	return (zap);
3419}
3420
3421void
3422vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
3423{
3424	if (vd->vdev_ops != &vdev_hole_ops &&
3425	    vd->vdev_ops != &vdev_missing_ops &&
3426	    vd->vdev_ops != &vdev_root_ops &&
3427	    !vd->vdev_top->vdev_removing) {
3428		if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
3429			vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
3430		}
3431		if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
3432			vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
3433			if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
3434				vdev_zap_allocation_data(vd, tx);
3435		}
3436	}
3437	if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 &&
3438	    spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) {
3439		if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2))
3440			spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx);
3441		vd->vdev_root_zap = vdev_create_link_zap(vd, tx);
3442	}
3443
3444	for (uint64_t i = 0; i < vd->vdev_children; i++) {
3445		vdev_construct_zaps(vd->vdev_child[i], tx);
3446	}
3447}
3448
3449static void
3450vdev_dtl_sync(vdev_t *vd, uint64_t txg)
3451{
3452	spa_t *spa = vd->vdev_spa;
3453	range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
3454	objset_t *mos = spa->spa_meta_objset;
3455	range_tree_t *rtsync;
3456	dmu_tx_t *tx;
3457	uint64_t object = space_map_object(vd->vdev_dtl_sm);
3458
3459	ASSERT(vdev_is_concrete(vd));
3460	ASSERT(vd->vdev_ops->vdev_op_leaf);
3461
3462	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3463
3464	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
3465		mutex_enter(&vd->vdev_dtl_lock);
3466		space_map_free(vd->vdev_dtl_sm, tx);
3467		space_map_close(vd->vdev_dtl_sm);
3468		vd->vdev_dtl_sm = NULL;
3469		mutex_exit(&vd->vdev_dtl_lock);
3470
3471		/*
3472		 * We only destroy the leaf ZAP for detached leaves or for
3473		 * removed log devices. Removed data devices handle leaf ZAP
3474		 * cleanup later, once cancellation is no longer possible.
3475		 */
3476		if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
3477		    vd->vdev_top->vdev_islog)) {
3478			vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
3479			vd->vdev_leaf_zap = 0;
3480		}
3481
3482		dmu_tx_commit(tx);
3483		return;
3484	}
3485
3486	if (vd->vdev_dtl_sm == NULL) {
3487		uint64_t new_object;
3488
3489		new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx);
3490		VERIFY3U(new_object, !=, 0);
3491
3492		VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
3493		    0, -1ULL, 0));
3494		ASSERT(vd->vdev_dtl_sm != NULL);
3495	}
3496
3497	rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
3498
3499	mutex_enter(&vd->vdev_dtl_lock);
3500	range_tree_walk(rt, range_tree_add, rtsync);
3501	mutex_exit(&vd->vdev_dtl_lock);
3502
3503	space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx);
3504	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
3505	range_tree_vacate(rtsync, NULL, NULL);
3506
3507	range_tree_destroy(rtsync);
3508
3509	/*
3510	 * If the object for the space map has changed then dirty
3511	 * the top level so that we update the config.
3512	 */
3513	if (object != space_map_object(vd->vdev_dtl_sm)) {
3514		vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
3515		    "new object %llu", (u_longlong_t)txg, spa_name(spa),
3516		    (u_longlong_t)object,
3517		    (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
3518		vdev_config_dirty(vd->vdev_top);
3519	}
3520
3521	dmu_tx_commit(tx);
3522}
3523
3524/*
3525 * Determine whether the specified vdev can be offlined/detached/removed
3526 * without losing data.
3527 */
3528boolean_t
3529vdev_dtl_required(vdev_t *vd)
3530{
3531	spa_t *spa = vd->vdev_spa;
3532	vdev_t *tvd = vd->vdev_top;
3533	uint8_t cant_read = vd->vdev_cant_read;
3534	boolean_t required;
3535
3536	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
3537
3538	if (vd == spa->spa_root_vdev || vd == tvd)
3539		return (B_TRUE);
3540
3541	/*
3542	 * Temporarily mark the device as unreadable, and then determine
3543	 * whether this results in any DTL outages in the top-level vdev.
3544	 * If not, we can safely offline/detach/remove the device.
3545	 */
3546	vd->vdev_cant_read = B_TRUE;
3547	vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
3548	required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
3549	vd->vdev_cant_read = cant_read;
3550	vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
3551
3552	if (!required && zio_injection_enabled) {
3553		required = !!zio_handle_device_injection(vd, NULL,
3554		    SET_ERROR(ECHILD));
3555	}
3556
3557	return (required);
3558}
3559
3560/*
3561 * Determine if resilver is needed, and if so the txg range.
3562 */
3563boolean_t
3564vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
3565{
3566	boolean_t needed = B_FALSE;
3567	uint64_t thismin = UINT64_MAX;
3568	uint64_t thismax = 0;
3569
3570	if (vd->vdev_children == 0) {
3571		mutex_enter(&vd->vdev_dtl_lock);
3572		if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
3573		    vdev_writeable(vd)) {
3574
3575			thismin = vdev_dtl_min(vd);
3576			thismax = vdev_dtl_max(vd);
3577			needed = B_TRUE;
3578		}
3579		mutex_exit(&vd->vdev_dtl_lock);
3580	} else {
3581		for (int c = 0; c < vd->vdev_children; c++) {
3582			vdev_t *cvd = vd->vdev_child[c];
3583			uint64_t cmin, cmax;
3584
3585			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
3586				thismin = MIN(thismin, cmin);
3587				thismax = MAX(thismax, cmax);
3588				needed = B_TRUE;
3589			}
3590		}
3591	}
3592
3593	if (needed && minp) {
3594		*minp = thismin;
3595		*maxp = thismax;
3596	}
3597	return (needed);
3598}
3599
3600/*
3601 * Gets the checkpoint space map object from the vdev's ZAP.  On success sm_obj
3602 * will contain either the checkpoint spacemap object or zero if none exists.
3603 * All other errors are returned to the caller.
3604 */
3605int
3606vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
3607{
3608	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
3609
3610	if (vd->vdev_top_zap == 0) {
3611		*sm_obj = 0;
3612		return (0);
3613	}
3614
3615	int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap,
3616	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj);
3617	if (error == ENOENT) {
3618		*sm_obj = 0;
3619		error = 0;
3620	}
3621
3622	return (error);
3623}
3624
3625int
3626vdev_load(vdev_t *vd)
3627{
3628	int children = vd->vdev_children;
3629	int error = 0;
3630	taskq_t *tq = NULL;
3631
3632	/*
3633	 * It's only worthwhile to use the taskq for the root vdev, because the
3634	 * slow part is metaslab_init, and that only happens for top-level
3635	 * vdevs.
3636	 */
3637	if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
3638		tq = taskq_create("vdev_load", children, minclsyspri,
3639		    children, children, TASKQ_PREPOPULATE);
3640	}
3641
3642	/*
3643	 * Recursively load all children.
3644	 */
3645	for (int c = 0; c < vd->vdev_children; c++) {
3646		vdev_t *cvd = vd->vdev_child[c];
3647
3648		if (tq == NULL || vdev_uses_zvols(cvd)) {
3649			cvd->vdev_load_error = vdev_load(cvd);
3650		} else {
3651			VERIFY(taskq_dispatch(tq, vdev_load_child,
3652			    cvd, TQ_SLEEP) != TASKQID_INVALID);
3653		}
3654	}
3655
3656	if (tq != NULL) {
3657		taskq_wait(tq);
3658		taskq_destroy(tq);
3659	}
3660
3661	for (int c = 0; c < vd->vdev_children; c++) {
3662		int error = vd->vdev_child[c]->vdev_load_error;
3663
3664		if (error != 0)
3665			return (error);
3666	}
3667
3668	vdev_set_deflate_ratio(vd);
3669
3670	if (vd->vdev_ops == &vdev_raidz_ops) {
3671		error = vdev_raidz_load(vd);
3672		if (error != 0)
3673			return (error);
3674	}
3675
3676	/*
3677	 * On the spa_load path, grab the allocation bias from our zap
3678	 */
3679	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
3680		spa_t *spa = vd->vdev_spa;
3681		char bias_str[64];
3682
3683		error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
3684		    VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
3685		    bias_str);
3686		if (error == 0) {
3687			ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
3688			vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
3689		} else if (error != ENOENT) {
3690			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3691			    VDEV_AUX_CORRUPT_DATA);
3692			vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) "
3693			    "failed [error=%d]",
3694			    (u_longlong_t)vd->vdev_top_zap, error);
3695			return (error);
3696		}
3697	}
3698
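	/*
	 * Load the failfast property, if present, from the top-level vdev
	 * ZAP.  ENOENT means it was never set, so the property default is
	 * used; any other lookup error is only logged.
	 */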
3699	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
3700		spa_t *spa = vd->vdev_spa;
3701		uint64_t failfast;
3702
3703		error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
3704		    vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast),
3705		    1, &failfast);
3706		if (error == 0) {
3707			vd->vdev_failfast = failfast & 1;
3708		} else if (error == ENOENT) {
3709			vd->vdev_failfast = vdev_prop_default_numeric(
3710			    VDEV_PROP_FAILFAST);
3711		} else {
3712			vdev_dbgmsg(vd,
3713			    "vdev_load: zap_lookup(top_zap=%llu) "
3714			    "failed [error=%d]",
3715			    (u_longlong_t)vd->vdev_top_zap, error);
3716		}
3717	}
3718
3719	/*
3720	 * Load any rebuild state from the top-level vdev zap.
3721	 */
3722	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
3723		error = vdev_rebuild_load(vd);
3724		if (error && error != ENOTSUP) {
3725			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3726			    VDEV_AUX_CORRUPT_DATA);
3727			vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
3728			    "failed [error=%d]", error);
3729			return (error);
3730		}
3731	}
3732
3733	if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) {
3734		uint64_t zapobj;
3735
3736		if (vd->vdev_top_zap != 0)
3737			zapobj = vd->vdev_top_zap;
3738		else
3739			zapobj = vd->vdev_leaf_zap;
3740
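		/*
		 * Load the per-vdev fault thresholds (checksum_n/t, io_n/t,
		 * slow_io_n/t).  ENOENT simply means a property was never
		 * set; other lookup errors are logged but do not fail
		 * vdev_load().
		 */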
3741		error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N,
3742		    &vd->vdev_checksum_n);
3743		if (error && error != ENOENT)
3744			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3745			    "failed [error=%d]", (u_longlong_t)zapobj, error);
3746
3747		error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T,
3748		    &vd->vdev_checksum_t);
3749		if (error && error != ENOENT)
3750			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3751			    "failed [error=%d]", (u_longlong_t)zapobj, error);
3752
3753		error = vdev_prop_get_int(vd, VDEV_PROP_IO_N,
3754		    &vd->vdev_io_n);
3755		if (error && error != ENOENT)
3756			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3757			    "failed [error=%d]", (u_longlong_t)zapobj, error);
3758
3759		error = vdev_prop_get_int(vd, VDEV_PROP_IO_T,
3760		    &vd->vdev_io_t);
3761		if (error && error != ENOENT)
3762			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3763			    "failed [error=%d]", (u_longlong_t)zapobj, error);
3764
3765		error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N,
3766		    &vd->vdev_slow_io_n);
3767		if (error && error != ENOENT)
3768			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3769			    "failed [error=%d]", (u_longlong_t)zapobj, error);
3770
3771		error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T,
3772		    &vd->vdev_slow_io_t);
3773		if (error && error != ENOENT)
3774			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3775			    "failed [error=%d]", (u_longlong_t)zapobj, error);
3776	}
3777
3778	/*
3779	 * If this is a top-level vdev, initialize its metaslabs.
3780	 */
3781	if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
3782		vdev_metaslab_group_create(vd);
3783
3784		if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
3785			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3786			    VDEV_AUX_CORRUPT_DATA);
3787			vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
3788			    "asize=%llu", (u_longlong_t)vd->vdev_ashift,
3789			    (u_longlong_t)vd->vdev_asize);
3790			return (SET_ERROR(ENXIO));
3791		}
3792
3793		error = vdev_metaslab_init(vd, 0);
3794		if (error != 0) {
3795			vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
3796			    "[error=%d]", error);
3797			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3798			    VDEV_AUX_CORRUPT_DATA);
3799			return (error);
3800		}
3801
3802		uint64_t checkpoint_sm_obj;
3803		error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj);
3804		if (error == 0 && checkpoint_sm_obj != 0) {
3805			objset_t *mos = spa_meta_objset(vd->vdev_spa);
3806			ASSERT(vd->vdev_asize != 0);
3807			ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
3808
3809			error = space_map_open(&vd->vdev_checkpoint_sm,
3810			    mos, checkpoint_sm_obj, 0, vd->vdev_asize,
3811			    vd->vdev_ashift);
3812			if (error != 0) {
3813				vdev_dbgmsg(vd, "vdev_load: space_map_open "
3814				    "failed for checkpoint spacemap (obj %llu) "
3815				    "[error=%d]",
3816				    (u_longlong_t)checkpoint_sm_obj, error);
3817				return (error);
3818			}
3819			ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
3820
3821			/*
3822			 * Since the checkpoint_sm contains free entries
3823			 * exclusively we can use space_map_allocated() to
3824			 * indicate the cumulative checkpointed space that
3825			 * has been freed.
3826			 */
3827			vd->vdev_stat.vs_checkpoint_space =
3828			    -space_map_allocated(vd->vdev_checkpoint_sm);
3829			vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
3830			    vd->vdev_stat.vs_checkpoint_space;
3831		} else if (error != 0) {
3832			vdev_dbgmsg(vd, "vdev_load: failed to retrieve "
3833			    "checkpoint space map object from vdev ZAP "
3834			    "[error=%d]", error);
3835			return (error);
3836		}
3837	}
3838
3839	/*
3840	 * If this is a leaf vdev, load its DTL.
3841	 */
3842	if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
3843		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3844		    VDEV_AUX_CORRUPT_DATA);
3845		vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
3846		    "[error=%d]", error);
3847		return (error);
3848	}
3849
3850	uint64_t obsolete_sm_object;
3851	error = vdev_obsolete_sm_object(vd, &obsolete_sm_object);
3852	if (error == 0 && obsolete_sm_object != 0) {
3853		objset_t *mos = vd->vdev_spa->spa_meta_objset;
3854		ASSERT(vd->vdev_asize != 0);
3855		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
3856
3857		if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
3858		    obsolete_sm_object, 0, vd->vdev_asize, 0))) {
3859			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3860			    VDEV_AUX_CORRUPT_DATA);
3861			vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
3862			    "obsolete spacemap (obj %llu) [error=%d]",
3863			    (u_longlong_t)obsolete_sm_object, error);
3864			return (error);
3865		}
3866	} else if (error != 0) {
3867		vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
3868		    "space map object from vdev ZAP [error=%d]", error);
3869		return (error);
3870	}
3871
3872	return (0);
3873}
3874
3875/*
3876 * The special vdev case is used for hot spares and l2cache devices.  Its
3877 * sole purpose is to set the vdev state for the associated vdev.  To do this,
3878 * we make sure that we can open the underlying device, then try to read the
3879 * label, and make sure that the label is sane and that it hasn't been
3880 * repurposed to another pool.
3881 */
3882int
3883vdev_validate_aux(vdev_t *vd)
3884{
3885	nvlist_t *label;
3886	uint64_t guid, version;
3887	uint64_t state;
3888
3889	if (!vdev_readable(vd))
3890		return (0);
3891
3892	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
3893		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
3894		    VDEV_AUX_CORRUPT_DATA);
3895		return (-1);
3896	}
3897
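	/*
	 * The label must carry a supported SPA version, the guid we expect,
	 * and a pool state; anything else means the label is damaged or the
	 * device has been repurposed.
	 */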
3898	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
3899	    !SPA_VERSION_IS_SUPPORTED(version) ||
3900	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
3901	    guid != vd->vdev_guid ||
3902	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
3903		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
3904		    VDEV_AUX_CORRUPT_DATA);
3905		nvlist_free(label);
3906		return (-1);
3907	}
3908
3909	/*
3910	 * We don't actually check the pool state here.  If it's in fact in
3911	 * use by another pool, we update this fact on the fly when requested.
3912	 */
3913	nvlist_free(label);
3914	return (0);
3915}
3916
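/*
 * Free the object holding this vdev's unflushed metaslab TXG data (part of
 * the log spacemap machinery) and remove its entry from the top-level vdev
 * ZAP, if one exists.
 */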
3917static void
3918vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx)
3919{
3920	objset_t *mos = spa_meta_objset(vd->vdev_spa);
3921
3922	if (vd->vdev_top_zap == 0)
3923		return;
3924
3925	uint64_t object = 0;
3926	int err = zap_lookup(mos, vd->vdev_top_zap,
3927	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object);
3928	if (err == ENOENT)
3929		return;
3930	VERIFY0(err);
3931
3932	VERIFY0(dmu_object_free(mos, object, tx));
3933	VERIFY0(zap_remove(mos, vd->vdev_top_zap,
3934	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx));
3935}
3936
3937/*
3938 * Free the objects used to store this vdev's spacemaps, and the array
3939 * that points to them.
3940 */
3941void
3942vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
3943{
3944	if (vd->vdev_ms_array == 0)
3945		return;
3946
3947	objset_t *mos = vd->vdev_spa->spa_meta_objset;
3948	uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift;
3949	size_t array_bytes = array_count * sizeof (uint64_t);
3950	uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP);
3951	VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0,
3952	    array_bytes, smobj_array, 0));
3953
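	/*
	 * Each entry in the array is the space map object for one metaslab;
	 * a zero entry means that metaslab has no space map to free.
	 */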
3954	for (uint64_t i = 0; i < array_count; i++) {
3955		uint64_t smobj = smobj_array[i];
3956		if (smobj == 0)
3957			continue;
3958
3959		space_map_free_obj(mos, smobj, tx);
3960	}
3961
3962	kmem_free(smobj_array, array_bytes);
3963	VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
3964	vdev_destroy_ms_flush_data(vd, tx);
3965	vd->vdev_ms_array = 0;
3966}
3967
3968static void
3969vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
3970{
3971	spa_t *spa = vd->vdev_spa;
3972
3973	ASSERT(vd->vdev_islog);
3974	ASSERT(vd == vd->vdev_top);
3975	ASSERT3U(txg, ==, spa_syncing_txg(spa));
3976
3977	dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
3978
3979	vdev_destroy_spacemaps(vd, tx);
3980	if (vd->vdev_top_zap != 0) {
3981		vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
3982		vd->vdev_top_zap = 0;
3983	}
3984
3985	dmu_tx_commit(tx);
3986}
3987
3988void
3989vdev_sync_done(vdev_t *vd, uint64_t txg)
3990{
3991	metaslab_t *msp;
3992	boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
3993
3994	ASSERT(vdev_is_concrete(vd));
3995
3996	while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
3997	    != NULL)
3998		metaslab_sync_done(msp, txg);
3999
4000	if (reassess) {
4001		metaslab_sync_reassess(vd->vdev_mg);
4002		if (vd->vdev_log_mg != NULL)
4003			metaslab_sync_reassess(vd->vdev_log_mg);
4004	}
4005}
4006
4007void
4008vdev_sync(vdev_t *vd, uint64_t txg)
4009{
4010	spa_t *spa = vd->vdev_spa;
4011	vdev_t *lvd;
4012	metaslab_t *msp;
4013
4014	ASSERT3U(txg, ==, spa->spa_syncing_txg);
4015	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
4016	if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
4017		ASSERT(vd->vdev_removing ||
4018		    vd->vdev_ops == &vdev_indirect_ops);
4019
4020		vdev_indirect_sync_obsolete(vd, tx);
4021
4022		/*
4023		 * If the vdev is indirect, it can't have dirty
4024		 * metaslabs or DTLs.
4025		 */
4026		if (vd->vdev_ops == &vdev_indirect_ops) {
4027			ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
4028			ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
4029			dmu_tx_commit(tx);
4030			return;
4031		}
4032	}
4033
4034	ASSERT(vdev_is_concrete(vd));
4035
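	/*
	 * Lazily allocate the object array that holds this vdev's metaslab
	 * space map objects the first time a concrete top-level vdev is
	 * synced, unless the vdev is being removed.
	 */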
4036	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
4037	    !vd->vdev_removing) {
4038		ASSERT(vd == vd->vdev_top);
4039		ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
4040		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
4041		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
4042		ASSERT(vd->vdev_ms_array != 0);
4043		vdev_config_dirty(vd);
4044	}
4045
4046	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
4047		metaslab_sync(msp, txg);
4048		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
4049	}
4050
4051	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
4052		vdev_dtl_sync(lvd, txg);
4053
4054	/*
4055	 * If this is an empty log device being removed, destroy the
4056	 * metadata associated with it.
4057	 */
4058	if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
4059		vdev_remove_empty_log(vd, txg);
4060
4061	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
4062	dmu_tx_commit(tx);
4063}
4064
4065/*
4066 * Return the amount of space that should be (or was) allocated for the given
4067 * psize (compressed block size) in the given TXG. Note that for expanded
4068 * RAIDZ vdevs, the size allocated for older BPs may be larger. See
4069 * vdev_raidz_asize().
4070 */
4071uint64_t
4072vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg)
4073{
4074	return (vd->vdev_ops->vdev_op_asize(vd, psize, txg));
4075}
4076
4077uint64_t
4078vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
4079{
4080	return (vdev_psize_to_asize_txg(vd, psize, 0));
4081}
4082
4083/*
4084 * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
4085 * not be opened, and no I/O is attempted.
4086 */
4087int
4088vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
4089{
4090	vdev_t *vd, *tvd;
4091
4092	spa_vdev_state_enter(spa, SCL_NONE);
4093
4094	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4095		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4096
4097	if (!vd->vdev_ops->vdev_op_leaf)
4098		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
4099
4100	tvd = vd->vdev_top;
4101
4102	/*
4103	 * If the user did a 'zpool offline -f', then make the fault persist
4104	 * across reboots.
4105	 */
4106	if (aux == VDEV_AUX_EXTERNAL_PERSIST) {
4107		/*
4108		 * There are two kinds of forced faults: temporary and
4109		 * persistent.  Temporary faults go away at pool import, while
4110		 * persistent faults stay set.  Both types of faults can be
4111		 * cleared with a zpool clear.
4112		 *
4113		 * We tell if a vdev is persistently faulted by looking at the
4114		 * ZPOOL_CONFIG_AUX_STATE nvpair.  If it's set to "external" at
4115		 * import then it's a persistent fault.  Otherwise, it's
4116		 * temporary.  We get ZPOOL_CONFIG_AUX_STATE set to "external"
4117		 * by setting vd->vdev_stat.vs_aux to VDEV_AUX_EXTERNAL.  This
4118		 * tells vdev_config_generate() (which gets run later) to set
4119		 * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist.
4120		 */
4121		vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
4122		vd->vdev_tmpoffline = B_FALSE;
4123		aux = VDEV_AUX_EXTERNAL;
4124	} else {
4125		vd->vdev_tmpoffline = B_TRUE;
4126	}
4127
4128	/*
4129	 * We don't directly use the aux state here, but if we do a
4130	 * vdev_reopen(), we need this value to be present to remember why we
4131	 * were faulted.
4132	 */
4133	vd->vdev_label_aux = aux;
4134
4135	/*
4136	 * Faulted state takes precedence over degraded.
4137	 */
4138	vd->vdev_delayed_close = B_FALSE;
4139	vd->vdev_faulted = 1ULL;
4140	vd->vdev_degraded = 0ULL;
4141	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
4142
4143	/*
4144	 * If this device has the only valid copy of the data, then
4145	 * back off and simply mark the vdev as degraded instead.
4146	 */
4147	if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
4148		vd->vdev_degraded = 1ULL;
4149		vd->vdev_faulted = 0ULL;
4150
4151		/*
4152		 * If we reopen the device and it's not dead, only then do we
4153		 * mark it degraded.
4154		 */
4155		vdev_reopen(tvd);
4156
4157		if (vdev_readable(vd))
4158			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
4159	}
4160
4161	return (spa_vdev_state_exit(spa, vd, 0));
4162}
4163
4164/*
4165 * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
4166 * user that something is wrong.  The vdev continues to operate as normal as far
4167 * as I/O is concerned.
4168 */
4169int
4170vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
4171{
4172	vdev_t *vd;
4173
4174	spa_vdev_state_enter(spa, SCL_NONE);
4175
4176	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4177		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4178
4179	if (!vd->vdev_ops->vdev_op_leaf)
4180		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
4181
4182	/*
4183	 * If the vdev is already faulted, then don't do anything.
4184	 */
4185	if (vd->vdev_faulted || vd->vdev_degraded)
4186		return (spa_vdev_state_exit(spa, NULL, 0));
4187
4188	vd->vdev_degraded = 1ULL;
4189	if (!vdev_is_dead(vd))
4190		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
4191		    aux);
4192
4193	return (spa_vdev_state_exit(spa, vd, 0));
4194}
4195
4196int
4197vdev_remove_wanted(spa_t *spa, uint64_t guid)
4198{
4199	vdev_t *vd;
4200
4201	spa_vdev_state_enter(spa, SCL_NONE);
4202
4203	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4204		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4205
4206	/*
4207	 * If the vdev is already removed, or is expanding (which can trigger
4208	 * repartition add/remove events), then don't do anything.
4209	 */
4210	if (vd->vdev_removed || vd->vdev_expanding)
4211		return (spa_vdev_state_exit(spa, NULL, 0));
4212
4213	/*
4214	 * Confirm the vdev has been removed, otherwise don't do anything.
4215	 */
4216	if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL)))
4217		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST)));
4218
4219	vd->vdev_remove_wanted = B_TRUE;
4220	spa_async_request(spa, SPA_ASYNC_REMOVE);
4221
4222	return (spa_vdev_state_exit(spa, vd, 0));
4223}
4224
4226/*
4227 * Online the given vdev.
4228 *
4229 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
4230 * spare device should be detached when the device finishes resilvering.
4231 * Second, the online should be treated like a 'test' online case, so no FMA
4232 * events are generated if the device fails to open.
4233 */
4234int
4235vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
4236{
4237	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
4238	boolean_t wasoffline;
4239	vdev_state_t oldstate;
4240
4241	spa_vdev_state_enter(spa, SCL_NONE);
4242
4243	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4244		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4245
4246	wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
4247	oldstate = vd->vdev_state;
4248
4249	tvd = vd->vdev_top;
4250	vd->vdev_offline = B_FALSE;
4251	vd->vdev_tmpoffline = B_FALSE;
4252	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
4253	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
4254
4255	/* XXX - L2ARC 1.0 does not support expansion */
4256	if (!vd->vdev_aux) {
4257		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
4258			pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
4259			    spa->spa_autoexpand);
4260		vd->vdev_expansion_time = gethrestime_sec();
4261	}
4262
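	/*
	 * Reopen the top-level vdev so the new online state takes effect.
	 * The checkremove/forcefault hints only apply to this reopen and
	 * are cleared immediately afterwards.
	 */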
4263	vdev_reopen(tvd);
4264	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
4265
4266	if (!vd->vdev_aux) {
4267		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
4268			pvd->vdev_expanding = B_FALSE;
4269	}
4270
4271	if (newstate)
4272		*newstate = vd->vdev_state;
4273	if ((flags & ZFS_ONLINE_UNSPARE) &&
4274	    !vdev_is_dead(vd) && vd->vdev_parent &&
4275	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
4276	    vd->vdev_parent->vdev_child[0] == vd)
4277		vd->vdev_unspare = B_TRUE;
4278
4279	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
4280
4281		/* XXX - L2ARC 1.0 does not support expansion */
4282		if (vd->vdev_aux)
4283			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
4284		spa->spa_ccw_fail_time = 0;
4285		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
4286	}
4287
4288	/* Restart initializing if necessary */
4289	mutex_enter(&vd->vdev_initialize_lock);
4290	if (vdev_writeable(vd) &&
4291	    vd->vdev_initialize_thread == NULL &&
4292	    vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
4293		(void) vdev_initialize(vd);
4294	}
4295	mutex_exit(&vd->vdev_initialize_lock);
4296
4297	/*
4298	 * Restart trimming if necessary. We do not restart trimming for cache
4299	 * devices here. This is triggered by l2arc_rebuild_vdev()
4300	 * asynchronously for the whole device or in l2arc_evict() as it evicts
4301	 * space for upcoming writes.
4302	 */
4303	mutex_enter(&vd->vdev_trim_lock);
4304	if (vdev_writeable(vd) && !vd->vdev_isl2cache &&
4305	    vd->vdev_trim_thread == NULL &&
4306	    vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
4307		(void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
4308		    vd->vdev_trim_secure);
4309	}
4310	mutex_exit(&vd->vdev_trim_lock);
4311
4312	if (wasoffline ||
4313	    (oldstate < VDEV_STATE_DEGRADED &&
4314	    vd->vdev_state >= VDEV_STATE_DEGRADED)) {
4315		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
4316
4317		/*
4318		 * Asynchronously detach spare vdev if resilver or
4319		 * rebuild is not required
4320		 */
4321		if (vd->vdev_unspare &&
4322		    !dsl_scan_resilvering(spa->spa_dsl_pool) &&
4323		    !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) &&
4324		    !vdev_rebuild_active(tvd))
4325			spa_async_request(spa, SPA_ASYNC_DETACH_SPARE);
4326	}
4327	return (spa_vdev_state_exit(spa, vd, 0));
4328}
4329
4330static int
4331vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
4332{
4333	vdev_t *vd, *tvd;
4334	int error = 0;
4335	uint64_t generation;
4336	metaslab_group_t *mg;
4337
4338top:
4339	spa_vdev_state_enter(spa, SCL_ALLOC);
4340
4341	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4342		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4343
4344	if (!vd->vdev_ops->vdev_op_leaf)
4345		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
4346
4347	if (vd->vdev_ops == &vdev_draid_spare_ops)
4348		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
4349
4350	tvd = vd->vdev_top;
4351	mg = tvd->vdev_mg;
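	/*
	 * Record the config generation we expect after dropping the vdev
	 * state lock below; if it differs once the lock is re-acquired,
	 * the config changed underneath us and the offline is retried.
	 */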
4352	generation = spa->spa_config_generation + 1;
4353
4354	/*
4355	 * If the device isn't already offline, try to offline it.
4356	 */
4357	if (!vd->vdev_offline) {
4358		/*
4359		 * If this device has the only valid copy of some data,
4360		 * don't allow it to be offlined. Log devices are always
4361		 * expendable.
4362		 */
4363		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
4364		    vdev_dtl_required(vd))
4365			return (spa_vdev_state_exit(spa, NULL,
4366			    SET_ERROR(EBUSY)));
4367
4368		/*
4369		 * If the top-level is a slog and it has had allocations
4370		 * then proceed.  We check that the vdev's metaslab group
4371		 * is not NULL since it's possible that we may have just
4372		 * added this vdev but not yet initialized its metaslabs.
4373		 */
4374		if (tvd->vdev_islog && mg != NULL) {
4375			/*
4376			 * Prevent any future allocations.
4377			 */
4378			ASSERT3P(tvd->vdev_log_mg, ==, NULL);
4379			metaslab_group_passivate(mg);
4380			(void) spa_vdev_state_exit(spa, vd, 0);
4381
4382			error = spa_reset_logs(spa);
4383
4384			/*
4385			 * If the log device was successfully reset but has
4386			 * checkpointed data, do not offline it.
4387			 */
4388			if (error == 0 &&
4389			    tvd->vdev_checkpoint_sm != NULL) {
4390				ASSERT3U(space_map_allocated(
4391				    tvd->vdev_checkpoint_sm), !=, 0);
4392				error = ZFS_ERR_CHECKPOINT_EXISTS;
4393			}
4394
4395			spa_vdev_state_enter(spa, SCL_ALLOC);
4396
4397			/*
4398			 * Check to see if the config has changed.
4399			 */
4400			if (error || generation != spa->spa_config_generation) {
4401				metaslab_group_activate(mg);
4402				if (error)
4403					return (spa_vdev_state_exit(spa,
4404					    vd, error));
4405				(void) spa_vdev_state_exit(spa, vd, 0);
4406				goto top;
4407			}
4408			ASSERT0(tvd->vdev_stat.vs_alloc);
4409		}
4410
4411		/*
4412		 * Offline this device and reopen its top-level vdev.
4413		 * If the top-level vdev is a log device then just offline
4414		 * it. Otherwise, if this action results in the top-level
4415		 * vdev becoming unusable, undo it and fail the request.
4416		 */
4417		vd->vdev_offline = B_TRUE;
4418		vdev_reopen(tvd);
4419
4420		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
4421		    vdev_is_dead(tvd)) {
4422			vd->vdev_offline = B_FALSE;
4423			vdev_reopen(tvd);
4424			return (spa_vdev_state_exit(spa, NULL,
4425			    SET_ERROR(EBUSY)));
4426		}
4427
4428		/*
4429		 * Add the device back into the metaslab rotor so that
4430		 * once we online the device it's open for business.
4431		 */
4432		if (tvd->vdev_islog && mg != NULL)
4433			metaslab_group_activate(mg);
4434	}
4435
4436	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
4437
4438	return (spa_vdev_state_exit(spa, vd, 0));
4439}
4440
4441int
4442vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
4443{
4444	int error;
4445
4446	mutex_enter(&spa->spa_vdev_top_lock);
4447	error = vdev_offline_locked(spa, guid, flags);
4448	mutex_exit(&spa->spa_vdev_top_lock);
4449
4450	return (error);
4451}
4452
4453/*
4454 * Clear the error counts associated with this vdev.  Unlike vdev_online() and
4455 * vdev_offline(), we assume the spa config is locked.  We also clear all
4456 * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
4457 */
4458void
4459vdev_clear(spa_t *spa, vdev_t *vd)
4460{
4461	vdev_t *rvd = spa->spa_root_vdev;
4462
4463	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
4464
4465	if (vd == NULL)
4466		vd = rvd;
4467
4468	vd->vdev_stat.vs_read_errors = 0;
4469	vd->vdev_stat.vs_write_errors = 0;
4470	vd->vdev_stat.vs_checksum_errors = 0;
4471	vd->vdev_stat.vs_slow_ios = 0;
4472
4473	for (int c = 0; c < vd->vdev_children; c++)
4474		vdev_clear(spa, vd->vdev_child[c]);
4475
4476	/*
4477	 * It makes no sense to "clear" an indirect or removed vdev.
4478	 */
4479	if (!vdev_is_concrete(vd) || vd->vdev_removed)
4480		return;
4481
4482	/*
4483	 * If we're in the FAULTED state or have experienced failed I/O, then
4484	 * clear the persistent state and attempt to reopen the device.  We
4485	 * also mark the vdev config dirty, so that the new faulted state is
4486	 * written out to disk.
4487	 */
4488	if (vd->vdev_faulted || vd->vdev_degraded ||
4489	    !vdev_readable(vd) || !vdev_writeable(vd)) {
4490		/*
4491		 * When reopening in response to a clear event, it may be due to
4492		 * a fmadm repair request.  In this case, if the device is
4493		 * still broken, we still want to post the ereport again.
4494		 */
4495		vd->vdev_forcefault = B_TRUE;
4496
4497		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
4498		vd->vdev_cant_read = B_FALSE;
4499		vd->vdev_cant_write = B_FALSE;
4500		vd->vdev_stat.vs_aux = 0;
4501
4502		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
4503
4504		vd->vdev_forcefault = B_FALSE;
4505
4506		if (vd != rvd && vdev_writeable(vd->vdev_top))
4507			vdev_state_dirty(vd->vdev_top);
4508
4509		/* If a resilver isn't required, check if vdevs can be culled */
4510		if (vd->vdev_aux == NULL && !vdev_is_dead(vd) &&
4511		    !dsl_scan_resilvering(spa->spa_dsl_pool) &&
4512		    !dsl_scan_resilver_scheduled(spa->spa_dsl_pool))
4513			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
4514
4515		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
4516	}
4517
4518	/*
4519	 * When clearing a FMA-diagnosed fault, we always want to
4520	 * unspare the device, as we assume that the original spare was
4521	 * done in response to the FMA fault.
4522	 */
4523	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
4524	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
4525	    vd->vdev_parent->vdev_child[0] == vd)
4526		vd->vdev_unspare = B_TRUE;
4527
4528	/* Clear recent error events cache (i.e. duplicate events tracking) */
4529	zfs_ereport_clear(spa, vd);
4530}
4531
4532boolean_t
4533vdev_is_dead(vdev_t *vd)
4534{
4535	/*
4536	 * Holes and missing devices are always considered "dead".
4537	 * This simplifies the code since we don't have to check for
4538	 * these types of devices in the various code paths.
4539	 * Instead we rely on the fact that we skip over dead devices
4540	 * before issuing I/O to them.
4541	 */
4542	return (vd->vdev_state < VDEV_STATE_DEGRADED ||
4543	    vd->vdev_ops == &vdev_hole_ops ||
4544	    vd->vdev_ops == &vdev_missing_ops);
4545}
4546
4547boolean_t
4548vdev_readable(vdev_t *vd)
4549{
4550	return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
4551}
4552
4553boolean_t
4554vdev_writeable(vdev_t *vd)
4555{
4556	return (!vdev_is_dead(vd) && !vd->vdev_cant_write &&
4557	    vdev_is_concrete(vd));
4558}
4559
4560boolean_t
4561vdev_allocatable(vdev_t *vd)
4562{
4563	uint64_t state = vd->vdev_state;
4564
4565	/*
4566	 * We currently allow allocations from vdevs which may be in the
4567	 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
4568	 * fails to reopen then we'll catch it later when we're holding
4569	 * the proper locks.  Note that we have to get the vdev state
4570	 * in a local variable because although it changes atomically,
4571	 * we're asking two separate questions about it.
4572	 */
4573	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
4574	    !vd->vdev_cant_write && vdev_is_concrete(vd) &&
4575	    vd->vdev_mg->mg_initialized);
4576}
4577
4578boolean_t
4579vdev_accessible(vdev_t *vd, zio_t *zio)
4580{
4581	ASSERT(zio->io_vd == vd);
4582
4583	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
4584		return (B_FALSE);
4585
4586	if (zio->io_type == ZIO_TYPE_READ)
4587		return (!vd->vdev_cant_read);
4588
4589	if (zio->io_type == ZIO_TYPE_WRITE)
4590		return (!vd->vdev_cant_write);
4591
4592	return (B_TRUE);
4593}
4594
4595static void
4596vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
4597{
4598	/*
4599	 * Exclude the dRAID spare when aggregating to avoid double counting
4600	 * the ops and bytes.  These IOs are counted by the physical leaves.
4601	 */
4602	if (cvd->vdev_ops == &vdev_draid_spare_ops)
4603		return;
4604
4605	for (int t = 0; t < VS_ZIO_TYPES; t++) {
4606		vs->vs_ops[t] += cvs->vs_ops[t];
4607		vs->vs_bytes[t] += cvs->vs_bytes[t];
4608	}
4609
4610	cvs->vs_scan_removing = cvd->vdev_removing;
4611}
4612
4613/*
4614 * Get extended stats
4615 */
4616static void
4617vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
4618{
4619	(void) cvd;
4620
4621	int t, b;
4622	for (t = 0; t < ZIO_TYPES; t++) {
4623		for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++)
4624			vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b];
4625
4626		for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) {
4627			vsx->vsx_total_histo[t][b] +=
4628			    cvsx->vsx_total_histo[t][b];
4629		}
4630	}
4631
4632	for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
4633		for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) {
4634			vsx->vsx_queue_histo[t][b] +=
4635			    cvsx->vsx_queue_histo[t][b];
4636		}
4637		vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t];
4638		vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t];
4639
4640		for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++)
4641			vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b];
4642
4643		for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++)
4644			vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b];
4645	}
4646
4647}
4648
4649boolean_t
4650vdev_is_spacemap_addressable(vdev_t *vd)
4651{
4652	if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
4653		return (B_TRUE);
4654
4655	/*
4656	 * If double-word space map entries are not enabled we assume
4657	 * 47 bits of the space map entry are dedicated to the entry's
4658	 * offset (see SM_OFFSET_BITS in space_map.h). We then use that
4659	 * to calculate the maximum address that can be described by a
4660	 * space map entry for the given device.
4661	 */
4662	uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
4663
4664	if (shift >= 63) /* detect potential overflow */
4665		return (B_TRUE);
4666
4667	return (vd->vdev_asize < (1ULL << shift));
4668}
4669
4670/*
4671 * Get statistics for the given vdev.
4672 */
4673static void
4674vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
4675{
4676	int t;
4677	/*
4678	 * If we're getting stats on the root vdev, aggregate the I/O counts
4679	 * over all top-level vdevs (i.e. the direct children of the root).
4680	 */
4681	if (!vd->vdev_ops->vdev_op_leaf) {
4682		if (vs) {
4683			memset(vs->vs_ops, 0, sizeof (vs->vs_ops));
4684			memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes));
4685		}
4686		if (vsx)
4687			memset(vsx, 0, sizeof (*vsx));
4688
4689		for (int c = 0; c < vd->vdev_children; c++) {
4690			vdev_t *cvd = vd->vdev_child[c];
4691			vdev_stat_t *cvs = &cvd->vdev_stat;
4692			vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex;
4693
4694			vdev_get_stats_ex_impl(cvd, cvs, cvsx);
4695			if (vs)
4696				vdev_get_child_stat(cvd, vs, cvs);
4697			if (vsx)
4698				vdev_get_child_stat_ex(cvd, vsx, cvsx);
4699		}
4700	} else {
4701		/*
4702		 * We're a leaf.  Just copy our ZIO active queue stats in.  The
4703		 * other leaf stats are updated in vdev_stat_update().
4704		 */
4705		if (!vsx)
4706			return;
4707
4708		memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
4709
4710		for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
4711			vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
4712			vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
4713		}
4714	}
4715}
4716
4717void
4718vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
4719{
4720	vdev_t *tvd = vd->vdev_top;
4721	mutex_enter(&vd->vdev_stat_lock);
4722	if (vs) {
4723		memcpy(vs, &vd->vdev_stat, sizeof (*vs));
4724		vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
4725		vs->vs_state = vd->vdev_state;
4726		vs->vs_rsize = vdev_get_min_asize(vd);
4727
4728		if (vd->vdev_ops->vdev_op_leaf) {
4729			vs->vs_pspace = vd->vdev_psize;
4730			vs->vs_rsize += VDEV_LABEL_START_SIZE +
4731			    VDEV_LABEL_END_SIZE;
4732			/*
4733			 * Report initializing progress. Since we don't
4734			 * have the initializing locks held, this is only
4735			 * an estimate (although a fairly accurate one).
4736			 */
4737			vs->vs_initialize_bytes_done =
4738			    vd->vdev_initialize_bytes_done;
4739			vs->vs_initialize_bytes_est =
4740			    vd->vdev_initialize_bytes_est;
4741			vs->vs_initialize_state = vd->vdev_initialize_state;
4742			vs->vs_initialize_action_time =
4743			    vd->vdev_initialize_action_time;
4744
4745			/*
4746			 * Report manual TRIM progress. Since we don't have
4747			 * the manual TRIM locks held, this is only an
4748			 * estimate (although fairly accurate one).
4749			 * estimate (although a fairly accurate one).
4750			vs->vs_trim_notsup = !vd->vdev_has_trim;
4751			vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
4752			vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
4753			vs->vs_trim_state = vd->vdev_trim_state;
4754			vs->vs_trim_action_time = vd->vdev_trim_action_time;
4755
4756			/* Set when there is a deferred resilver. */
4757			vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
4758		}
4759
4760		/*
4761		 * Report expandable space on top-level, non-auxiliary devices
4762		 * only. The expandable space is reported in terms of metaslab
4763		 * sized units since that determines how much space the pool
4764		 * can expand.
4765		 */
4766		if (vd->vdev_aux == NULL && tvd != NULL) {
4767			vs->vs_esize = P2ALIGN(
4768			    vd->vdev_max_asize - vd->vdev_asize,
4769			    1ULL << tvd->vdev_ms_shift);
4770		}
4771
4772		vs->vs_configured_ashift = vd->vdev_top != NULL
4773		    ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
4774		vs->vs_logical_ashift = vd->vdev_logical_ashift;
4775		if (vd->vdev_physical_ashift <= ASHIFT_MAX)
4776			vs->vs_physical_ashift = vd->vdev_physical_ashift;
4777		else
4778			vs->vs_physical_ashift = 0;
4779
4780		/*
4781		 * Report fragmentation and rebuild progress for top-level,
4782		 * non-auxiliary, concrete devices.
4783		 */
4784		if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
4785		    vdev_is_concrete(vd)) {
4786			/*
4787			 * The vdev fragmentation rating doesn't take into
4788			 * account the embedded slog metaslab (vdev_log_mg).
4789			 * Since it's only one metaslab, it would have a tiny
4790			 * impact on the overall fragmentation.
4791			 */
4792			vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
4793			    vd->vdev_mg->mg_fragmentation : 0;
4794		}
4795		vs->vs_noalloc = MAX(vd->vdev_noalloc,
4796		    tvd ? tvd->vdev_noalloc : 0);
4797	}
4798
4799	vdev_get_stats_ex_impl(vd, vs, vsx);
4800	mutex_exit(&vd->vdev_stat_lock);
4801}
4802
4803void
4804vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
4805{
4806	return (vdev_get_stats_ex(vd, vs, NULL));
4807}
4808
4809void
4810vdev_clear_stats(vdev_t *vd)
4811{
4812	mutex_enter(&vd->vdev_stat_lock);
4813	vd->vdev_stat.vs_space = 0;
4814	vd->vdev_stat.vs_dspace = 0;
4815	vd->vdev_stat.vs_alloc = 0;
4816	mutex_exit(&vd->vdev_stat_lock);
4817}
4818
4819void
4820vdev_scan_stat_init(vdev_t *vd)
4821{
4822	vdev_stat_t *vs = &vd->vdev_stat;
4823
4824	for (int c = 0; c < vd->vdev_children; c++)
4825		vdev_scan_stat_init(vd->vdev_child[c]);
4826
4827	mutex_enter(&vd->vdev_stat_lock);
4828	vs->vs_scan_processed = 0;
4829	mutex_exit(&vd->vdev_stat_lock);
4830}
4831
4832void
4833vdev_stat_update(zio_t *zio, uint64_t psize)
4834{
4835	spa_t *spa = zio->io_spa;
4836	vdev_t *rvd = spa->spa_root_vdev;
4837	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
4838	vdev_t *pvd;
4839	uint64_t txg = zio->io_txg;
4840/* Suppress ASAN false positive */
4841#ifdef __SANITIZE_ADDRESS__
4842	vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL;
4843	vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL;
4844#else
4845	vdev_stat_t *vs = &vd->vdev_stat;
4846	vdev_stat_ex_t *vsx = &vd->vdev_stat_ex;
4847#endif
4848	zio_type_t type = zio->io_type;
4849	int flags = zio->io_flags;
4850
4851	/*
4852	 * If this i/o is a gang leader, it didn't do any actual work.
4853	 */
4854	if (zio->io_gang_tree)
4855		return;
4856
4857	if (zio->io_error == 0) {
4858		/*
4859		 * If this is a root i/o, don't count it -- we've already
4860		 * counted the top-level vdevs, and vdev_get_stats() will
4861		 * aggregate them when asked.  This reduces contention on
4862		 * the root vdev_stat_lock and implicitly handles blocks
4863		 * that compress away to holes, for which there is no i/o.
4864		 * (Holes never create vdev children, so all the counters
4865		 * remain zero, which is what we want.)
4866		 *
4867		 * Note: this only applies to successful i/o (io_error == 0)
4868		 * because unlike i/o counts, errors are not additive.
4869		 * When reading a ditto block, for example, failure of
4870		 * one top-level vdev does not imply a root-level error.
4871		 */
4872		if (vd == rvd)
4873			return;
4874
4875		ASSERT(vd == zio->io_vd);
4876
4877		if (flags & ZIO_FLAG_IO_BYPASS)
4878			return;
4879
4880		mutex_enter(&vd->vdev_stat_lock);
4881
4882		if (flags & ZIO_FLAG_IO_REPAIR) {
4883			/*
4884			 * Repair is the result of a resilver issued by the
4885			 * scan thread (spa_sync).
4886			 */
4887			if (flags & ZIO_FLAG_SCAN_THREAD) {
4888				dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
4889				dsl_scan_phys_t *scn_phys = &scn->scn_phys;
4890				uint64_t *processed = &scn_phys->scn_processed;
4891
4892				if (vd->vdev_ops->vdev_op_leaf)
4893					atomic_add_64(processed, psize);
4894				vs->vs_scan_processed += psize;
4895			}
4896
4897			/*
4898			 * Repair is the result of a rebuild issued by the
4899			 * rebuild thread (vdev_rebuild_thread).  To avoid
4900			 * double counting repaired bytes the virtual dRAID
4901			 * spare vdev is excluded from the processed bytes.
4902			 */
4903			if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
4904				vdev_t *tvd = vd->vdev_top;
4905				vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
4906				vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
4907				uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
4908
4909				if (vd->vdev_ops->vdev_op_leaf &&
4910				    vd->vdev_ops != &vdev_draid_spare_ops) {
4911					atomic_add_64(rebuilt, psize);
4912				}
4913				vs->vs_rebuild_processed += psize;
4914			}
4915
4916			if (flags & ZIO_FLAG_SELF_HEAL)
4917				vs->vs_self_healed += psize;
4918		}
4919
4920		/*
4921		 * The bytes/ops/histograms are recorded at the leaf level and
4922		 * aggregated into the higher level vdevs in vdev_get_stats().
4923		 */
4924		if (vd->vdev_ops->vdev_op_leaf &&
4925		    (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
4926			zio_type_t vs_type = type;
4927			zio_priority_t priority = zio->io_priority;
4928
4929			/*
4930			 * TRIM ops and bytes are reported to user space as
4931			 * ZIO_TYPE_FLUSH.  This is done to preserve the
4932			 * vdev_stat_t structure layout for user space.
4933			 */
4934			if (type == ZIO_TYPE_TRIM)
4935				vs_type = ZIO_TYPE_FLUSH;
4936
4937			/*
4938			 * Solely for the purposes of 'zpool iostat -lqrw'
4939			 * reporting, use the priority to categorize the IO.
4940			 * Only the following are reported to user space:
4941			 *
4942			 *   ZIO_PRIORITY_SYNC_READ,
4943			 *   ZIO_PRIORITY_SYNC_WRITE,
4944			 *   ZIO_PRIORITY_ASYNC_READ,
4945			 *   ZIO_PRIORITY_ASYNC_WRITE,
4946			 *   ZIO_PRIORITY_SCRUB,
4947			 *   ZIO_PRIORITY_TRIM,
4948			 *   ZIO_PRIORITY_REBUILD.
4949			 */
4950			if (priority == ZIO_PRIORITY_INITIALIZING) {
4951				ASSERT3U(type, ==, ZIO_TYPE_WRITE);
4952				priority = ZIO_PRIORITY_ASYNC_WRITE;
4953			} else if (priority == ZIO_PRIORITY_REMOVAL) {
4954				priority = ((type == ZIO_TYPE_WRITE) ?
4955				    ZIO_PRIORITY_ASYNC_WRITE :
4956				    ZIO_PRIORITY_ASYNC_READ);
4957			}
4958
4959			vs->vs_ops[vs_type]++;
4960			vs->vs_bytes[vs_type] += psize;
4961
4962			if (flags & ZIO_FLAG_DELEGATED) {
4963				vsx->vsx_agg_histo[priority]
4964				    [RQ_HISTO(zio->io_size)]++;
4965			} else {
4966				vsx->vsx_ind_histo[priority]
4967				    [RQ_HISTO(zio->io_size)]++;
4968			}
4969
4970			if (zio->io_delta && zio->io_delay) {
4971				vsx->vsx_queue_histo[priority]
4972				    [L_HISTO(zio->io_delta - zio->io_delay)]++;
4973				vsx->vsx_disk_histo[type]
4974				    [L_HISTO(zio->io_delay)]++;
4975				vsx->vsx_total_histo[type]
4976				    [L_HISTO(zio->io_delta)]++;
4977			}
4978		}
4979
4980		mutex_exit(&vd->vdev_stat_lock);
4981		return;
4982	}
4983
4984	if (flags & ZIO_FLAG_SPECULATIVE)
4985		return;
4986
4987	/*
4988	 * If this is an I/O error that is going to be retried, then ignore the
4989	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
4990	 * hard errors, when in reality they can happen for any number of
4991	 * innocuous reasons (bus resets, MPxIO link failure, etc).
4992	 */
4993	if (zio->io_error == EIO &&
4994	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
4995		return;
4996
4997	/*
4998	 * Intent log writes won't propagate their error to the root
4999	 * I/O so don't mark these types of failures as pool-level
5000	 * errors.
5001	 */
5002	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
5003		return;
5004
5005	if (type == ZIO_TYPE_WRITE && txg != 0 &&
5006	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
5007	    (flags & ZIO_FLAG_SCAN_THREAD) ||
5008	    spa->spa_claiming)) {
5009		/*
5010		 * This is either a normal write (not a repair), or it's
5011		 * a repair induced by the scrub thread, or it's a repair
5012		 * made by zil_claim() during spa_load() in the first txg.
5013		 * In the normal case, we commit the DTL change in the same
5014		 * txg as the block was born.  In the scrub-induced repair
5015		 * case, we know that scrubs run in first-pass syncing context,
5016		 * so we commit the DTL change in spa_syncing_txg(spa).
5017		 * In the zil_claim() case, we commit in spa_first_txg(spa).
5018		 *
5019		 * We currently do not make DTL entries for failed spontaneous
5020		 * self-healing writes triggered by normal (non-scrubbing)
5021		 * reads, because we have no transactional context in which to
5022		 * do so -- and it's not clear that it'd be desirable anyway.
5023		 */
5024		if (vd->vdev_ops->vdev_op_leaf) {
5025			uint64_t commit_txg = txg;
5026			if (flags & ZIO_FLAG_SCAN_THREAD) {
5027				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
5028				ASSERT(spa_sync_pass(spa) == 1);
5029				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
5030				commit_txg = spa_syncing_txg(spa);
5031			} else if (spa->spa_claiming) {
5032				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
5033				commit_txg = spa_first_txg(spa);
5034			}
5035			ASSERT(commit_txg >= spa_syncing_txg(spa));
5036			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
5037				return;
5038			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
5039				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
5040			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
5041		}
5042		if (vd != rvd)
5043			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
5044	}
5045}
5046
5047int64_t
5048vdev_deflated_space(vdev_t *vd, int64_t space)
5049{
5050	ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
5051	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
5052
5053	return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
5054}
5055
5056/*
5057 * Update the in-core space usage stats for this vdev, its metaslab class,
5058 * and the root vdev.
5059 */
5060void
5061vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
5062    int64_t space_delta)
5063{
5064	(void) defer_delta;
5065	int64_t dspace_delta;
5066	spa_t *spa = vd->vdev_spa;
5067	vdev_t *rvd = spa->spa_root_vdev;
5068
5069	ASSERT(vd == vd->vdev_top);
5070
5071	/*
5072	 * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion
5073	 * factor.  We must calculate this here and not at the root vdev
5074	 * because the root vdev's psize-to-asize is simply the max of its
5075	 * children's, thus not accurate enough for us.
5076	 */
5077	dspace_delta = vdev_deflated_space(vd, space_delta);
5078
5079	mutex_enter(&vd->vdev_stat_lock);
5080	/* ensure we won't underflow */
5081	if (alloc_delta < 0) {
5082		ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta);
5083	}
5084
5085	vd->vdev_stat.vs_alloc += alloc_delta;
5086	vd->vdev_stat.vs_space += space_delta;
5087	vd->vdev_stat.vs_dspace += dspace_delta;
5088	mutex_exit(&vd->vdev_stat_lock);
5089
5090	/* every class but log contributes to root space stats */
5091	if (vd->vdev_mg != NULL && !vd->vdev_islog) {
5092		ASSERT(!vd->vdev_isl2cache);
5093		mutex_enter(&rvd->vdev_stat_lock);
5094		rvd->vdev_stat.vs_alloc += alloc_delta;
5095		rvd->vdev_stat.vs_space += space_delta;
5096		rvd->vdev_stat.vs_dspace += dspace_delta;
5097		mutex_exit(&rvd->vdev_stat_lock);
5098	}
5099	/* Note: metaslab_class_space_update moved to metaslab_space_update */
5100}
5101
5102/*
5103 * Mark a top-level vdev's config as dirty, placing it on the dirty list
5104 * so that it will be written out next time the vdev configuration is synced.
5105 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
5106 */
5107void
5108vdev_config_dirty(vdev_t *vd)
5109{
5110	spa_t *spa = vd->vdev_spa;
5111	vdev_t *rvd = spa->spa_root_vdev;
5112	int c;
5113
5114	ASSERT(spa_writeable(spa));
5115
5116	/*
5117	 * If this is an aux vdev (as with l2cache and spare devices), then we
5118	 * update the vdev config manually and set the sync flag.
5119	 */
5120	if (vd->vdev_aux != NULL) {
5121		spa_aux_vdev_t *sav = vd->vdev_aux;
5122		nvlist_t **aux;
5123		uint_t naux;
5124
5125		for (c = 0; c < sav->sav_count; c++) {
5126			if (sav->sav_vdevs[c] == vd)
5127				break;
5128		}
5129
5130		if (c == sav->sav_count) {
5131			/*
5132			 * We're being removed.  There's nothing more to do.
5133			 */
5134			ASSERT(sav->sav_sync == B_TRUE);
5135			return;
5136		}
5137
5138		sav->sav_sync = B_TRUE;
5139
5140		if (nvlist_lookup_nvlist_array(sav->sav_config,
5141		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
5142			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
5143			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
5144		}
5145
5146		ASSERT(c < naux);
5147
5148		/*
5149		 * Setting the nvlist in the middle of the array is a little
5150		 * sketchy, but it will work.
5151		 */
5152		nvlist_free(aux[c]);
5153		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
5154
5155		return;
5156	}
5157
5158	/*
5159	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
5160	 * must either hold SCL_CONFIG as writer, or must be the sync thread
5161	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
5162	 * so this is sufficient to ensure mutual exclusion.
5163	 */
5164	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
5165	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
5166	    spa_config_held(spa, SCL_CONFIG, RW_READER)));
5167
5168	if (vd == rvd) {
5169		for (c = 0; c < rvd->vdev_children; c++)
5170			vdev_config_dirty(rvd->vdev_child[c]);
5171	} else {
5172		ASSERT(vd == vd->vdev_top);
5173
5174		if (!list_link_active(&vd->vdev_config_dirty_node) &&
5175		    vdev_is_concrete(vd)) {
5176			list_insert_head(&spa->spa_config_dirty_list, vd);
5177		}
5178	}
5179}
5180
5181void
5182vdev_config_clean(vdev_t *vd)
5183{
5184	spa_t *spa = vd->vdev_spa;
5185
5186	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
5187	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
5188	    spa_config_held(spa, SCL_CONFIG, RW_READER)));
5189
5190	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
5191	list_remove(&spa->spa_config_dirty_list, vd);
5192}
5193
5194/*
5195 * Mark a top-level vdev's state as dirty, so that the next pass of
5196 * spa_sync() can convert this into vdev_config_dirty().  We distinguish
5197 * the state changes from larger config changes because they require
5198 * much less locking, and are often needed for administrative actions.
5199 */
5200void
5201vdev_state_dirty(vdev_t *vd)
5202{
5203	spa_t *spa = vd->vdev_spa;
5204
5205	ASSERT(spa_writeable(spa));
5206	ASSERT(vd == vd->vdev_top);
5207
5208	/*
5209	 * The state list is protected by the SCL_STATE lock.  The caller
5210	 * must either hold SCL_STATE as writer, or must be the sync thread
5211	 * (which holds SCL_STATE as reader).  There's only one sync thread,
5212	 * so this is sufficient to ensure mutual exclusion.
5213	 */
5214	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
5215	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
5216	    spa_config_held(spa, SCL_STATE, RW_READER)));
5217
5218	if (!list_link_active(&vd->vdev_state_dirty_node) &&
5219	    vdev_is_concrete(vd))
5220		list_insert_head(&spa->spa_state_dirty_list, vd);
5221}
5222
5223void
5224vdev_state_clean(vdev_t *vd)
5225{
5226	spa_t *spa = vd->vdev_spa;
5227
5228	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
5229	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
5230	    spa_config_held(spa, SCL_STATE, RW_READER)));
5231
5232	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
5233	list_remove(&spa->spa_state_dirty_list, vd);
5234}
5235
5236/*
5237 * Propagate vdev state up from children to parent.
5238 */
5239void
5240vdev_propagate_state(vdev_t *vd)
5241{
5242	spa_t *spa = vd->vdev_spa;
5243	vdev_t *rvd = spa->spa_root_vdev;
5244	int degraded = 0, faulted = 0;
5245	int corrupted = 0;
5246	vdev_t *child;
5247
5248	if (vd->vdev_children > 0) {
5249		for (int c = 0; c < vd->vdev_children; c++) {
5250			child = vd->vdev_child[c];
5251
5252			/*
5253			 * Don't factor holes or indirect vdevs into the
5254			 * decision.
5255			 */
5256			if (!vdev_is_concrete(child))
5257				continue;
5258
5259			if (!vdev_readable(child) ||
5260			    (!vdev_writeable(child) && spa_writeable(spa))) {
5261				/*
5262				 * Root special: if there is a top-level log
5263				 * device, treat the root vdev as if it were
5264				 * degraded.
5265				 */
5266				if (child->vdev_islog && vd == rvd)
5267					degraded++;
5268				else
5269					faulted++;
5270			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
5271				degraded++;
5272			}
5273
5274			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
5275				corrupted++;
5276		}
5277
5278		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
5279
5280		/*
5281		 * Root special: if there is a top-level vdev that cannot be
5282		 * opened due to corrupted metadata, then propagate the root
5283		 * vdev's aux state as 'corrupt' rather than 'insufficient
5284		 * replicas'.
5285		 */
5286		if (corrupted && vd == rvd &&
5287		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
5288			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
5289			    VDEV_AUX_CORRUPT_DATA);
5290	}
5291
5292	if (vd->vdev_parent)
5293		vdev_propagate_state(vd->vdev_parent);
5294}
5295
5296/*
5297 * Set a vdev's state.  If this is during an open, we don't update the parent
5298 * state, because we're in the process of opening children depth-first.
5299 * Otherwise, we propagate the change to the parent.
5300 *
5301 * If this routine places a device in a faulted state, an appropriate ereport is
5302 * generated.
5303 */
5304void
5305vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
5306{
5307	uint64_t save_state;
5308	spa_t *spa = vd->vdev_spa;
5309
5310	if (state == vd->vdev_state) {
5311		/*
5312		 * Since the vdev_offline() code path has already put the vdev
5313		 * in an offline state, we can miss a statechange event to
5314		 * OFFLINE.  Check the previous state to catch this condition.
5315		 */
5316		if (vd->vdev_ops->vdev_op_leaf &&
5317		    (state == VDEV_STATE_OFFLINE) &&
5318		    (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) {
5319			/* post an offline state change */
5320			zfs_post_state_change(spa, vd, vd->vdev_prevstate);
5321		}
5322		vd->vdev_stat.vs_aux = aux;
5323		return;
5324	}
5325
5326	save_state = vd->vdev_state;
5327
5328	vd->vdev_state = state;
5329	vd->vdev_stat.vs_aux = aux;
5330
5331	/*
5332	 * If we are setting the vdev state to anything but an open state, then
5333	 * always close the underlying device unless the device has requested
5334	 * a delayed close (i.e. we're about to remove or fault the device).
5335	 * Otherwise, we keep accessible but invalid devices open forever.
5336	 * We don't call vdev_close() itself, because that implies some extra
5337	 * checks (offline, etc) that we don't want here.  This is limited to
5338	 * leaf devices, because otherwise closing the device will affect other
5339	 * children.
5340	 */
5341	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
5342	    vd->vdev_ops->vdev_op_leaf)
5343		vd->vdev_ops->vdev_op_close(vd);
5344
5345	if (vd->vdev_removed &&
5346	    state == VDEV_STATE_CANT_OPEN &&
5347	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
		/*
		 * If the previous state is set to VDEV_STATE_REMOVED, then this
		 * device was previously marked removed and someone attempted to
		 * reopen it.  If this failed due to a nonexistent device, then
		 * keep the device in the REMOVED state.  We do the same if this
		 * is one of our special test online cases, which are only
		 * attempting to online the device and shouldn't generate an FMA
		 * fault.
		 */
5357		vd->vdev_state = VDEV_STATE_REMOVED;
5358		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
5359	} else if (state == VDEV_STATE_REMOVED) {
5360		vd->vdev_removed = B_TRUE;
5361	} else if (state == VDEV_STATE_CANT_OPEN) {
5362		/*
5363		 * If we fail to open a vdev during an import or recovery, we
5364		 * mark it as "not available", which signifies that it was
5365		 * never there to begin with.  Failure to open such a device
5366		 * is not considered an error.
5367		 */
5368		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
5369		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
5370		    vd->vdev_ops->vdev_op_leaf)
5371			vd->vdev_not_present = 1;
5372
5373		/*
5374		 * Post the appropriate ereport.  If the 'prevstate' field is
5375		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
5376		 * that this is part of a vdev_reopen().  In this case, we don't
5377		 * want to post the ereport if the device was already in the
5378		 * CANT_OPEN state beforehand.
5379		 *
5380		 * If the 'checkremove' flag is set, then this is an attempt to
5381		 * online the device in response to an insertion event.  If we
5382		 * hit this case, then we have detected an insertion event for a
5383		 * faulted or offline device that wasn't in the removed state.
5384		 * In this scenario, we don't post an ereport because we are
5385		 * about to replace the device, or attempt an online with
5386		 * vdev_forcefault, which will generate the fault for us.
5387		 */
5388		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
5389		    !vd->vdev_not_present && !vd->vdev_checkremove &&
5390		    vd != spa->spa_root_vdev) {
5391			const char *class;
5392
5393			switch (aux) {
5394			case VDEV_AUX_OPEN_FAILED:
5395				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
5396				break;
5397			case VDEV_AUX_CORRUPT_DATA:
5398				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
5399				break;
5400			case VDEV_AUX_NO_REPLICAS:
5401				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
5402				break;
5403			case VDEV_AUX_BAD_GUID_SUM:
5404				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
5405				break;
5406			case VDEV_AUX_TOO_SMALL:
5407				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
5408				break;
5409			case VDEV_AUX_BAD_LABEL:
5410				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
5411				break;
5412			case VDEV_AUX_BAD_ASHIFT:
5413				class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT;
5414				break;
5415			default:
5416				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
5417			}
5418
5419			(void) zfs_ereport_post(class, spa, vd, NULL, NULL,
5420			    save_state);
5421		}
5422
5423		/* Erase any notion of persistent removed state */
5424		vd->vdev_removed = B_FALSE;
5425	} else {
5426		vd->vdev_removed = B_FALSE;
5427	}
5428
	/*
	 * Notify ZED of any significant state-change on a leaf vdev.
	 */
5433	if (vd->vdev_ops->vdev_op_leaf) {
5434		/* preserve original state from a vdev_reopen() */
5435		if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
5436		    (vd->vdev_prevstate != vd->vdev_state) &&
5437		    (save_state <= VDEV_STATE_CLOSED))
5438			save_state = vd->vdev_prevstate;
5439
5440		/* filter out state change due to initial vdev_open */
5441		if (save_state > VDEV_STATE_CLOSED)
5442			zfs_post_state_change(spa, vd, save_state);
5443	}
5444
5445	if (!isopen && vd->vdev_parent)
5446		vdev_propagate_state(vd->vdev_parent);
5447}
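
/*
 * Hypothetical usage sketch (for illustration only): a caller that wants to
 * fault a leaf device in response to excessive errors does essentially what
 * the administrative entry points earlier in this file do:
 *
 *	vd->vdev_faulted = 1ULL;
 *	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED,
 *	    VDEV_AUX_ERR_EXCEEDED);
 */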
5448
5449boolean_t
5450vdev_children_are_offline(vdev_t *vd)
5451{
5452	ASSERT(!vd->vdev_ops->vdev_op_leaf);
5453
5454	for (uint64_t i = 0; i < vd->vdev_children; i++) {
5455		if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
5456			return (B_FALSE);
5457	}
5458
5459	return (B_TRUE);
5460}
5461
5462/*
5463 * Check the vdev configuration to ensure that it's capable of supporting
5464 * a root pool. We do not support partial configuration.
5465 */
5466boolean_t
5467vdev_is_bootable(vdev_t *vd)
5468{
5469	if (!vd->vdev_ops->vdev_op_leaf) {
5470		const char *vdev_type = vd->vdev_ops->vdev_op_type;
5471
5472		if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0)
5473			return (B_FALSE);
5474	}
5475
5476	for (int c = 0; c < vd->vdev_children; c++) {
5477		if (!vdev_is_bootable(vd->vdev_child[c]))
5478			return (B_FALSE);
5479	}
5480	return (B_TRUE);
5481}
5482
5483boolean_t
5484vdev_is_concrete(vdev_t *vd)
5485{
5486	vdev_ops_t *ops = vd->vdev_ops;
5487	if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops ||
5488	    ops == &vdev_missing_ops || ops == &vdev_root_ops) {
5489		return (B_FALSE);
5490	} else {
5491		return (B_TRUE);
5492	}
5493}
5494
5495/*
5496 * Determine if a log device has valid content.  If the vdev was
5497 * removed or faulted in the MOS config then we know that
5498 * the content on the log device has already been written to the pool.
5499 */
5500boolean_t
5501vdev_log_state_valid(vdev_t *vd)
5502{
5503	if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
5504	    !vd->vdev_removed)
5505		return (B_TRUE);
5506
5507	for (int c = 0; c < vd->vdev_children; c++)
5508		if (vdev_log_state_valid(vd->vdev_child[c]))
5509			return (B_TRUE);
5510
5511	return (B_FALSE);
5512}
5513
5514/*
5515 * Expand a vdev if possible.
5516 */
5517void
5518vdev_expand(vdev_t *vd, uint64_t txg)
5519{
5520	ASSERT(vd->vdev_top == vd);
5521	ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5522	ASSERT(vdev_is_concrete(vd));
5523
5524	vdev_set_deflate_ratio(vd);
5525
5526	if ((vd->vdev_spa->spa_raidz_expand == NULL ||
5527	    vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) &&
5528	    (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
5529	    vdev_is_concrete(vd)) {
5530		vdev_metaslab_group_create(vd);
5531		VERIFY(vdev_metaslab_init(vd, txg) == 0);
5532		vdev_config_dirty(vd);
5533	}
5534}
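
/*
 * Worked example with assumed numbers: with vdev_ms_shift = 34 (16 GiB
 * metaslabs) and vdev_ms_count = 100, the existing metaslabs cover
 * 100 << 34 bytes (1600 GiB).  vdev_expand() only creates additional
 * metaslabs once (vdev_asize >> 34) > 100, i.e. once the expanded device
 * has room for at least one more whole 16 GiB metaslab.
 */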
5535
5536/*
5537 * Split a vdev.
5538 */
5539void
5540vdev_split(vdev_t *vd)
5541{
5542	vdev_t *cvd, *pvd = vd->vdev_parent;
5543
5544	VERIFY3U(pvd->vdev_children, >, 1);
5545
5546	vdev_remove_child(pvd, vd);
5547	vdev_compact_children(pvd);
5548
5549	ASSERT3P(pvd->vdev_child, !=, NULL);
5550
5551	cvd = pvd->vdev_child[0];
5552	if (pvd->vdev_children == 1) {
5553		vdev_remove_parent(cvd);
5554		cvd->vdev_splitting = B_TRUE;
5555	}
5556	vdev_propagate_state(cvd);
5557}
5558
5559void
5560vdev_deadman(vdev_t *vd, const char *tag)
5561{
5562	for (int c = 0; c < vd->vdev_children; c++) {
5563		vdev_t *cvd = vd->vdev_child[c];
5564
5565		vdev_deadman(cvd, tag);
5566	}
5567
5568	if (vd->vdev_ops->vdev_op_leaf) {
5569		vdev_queue_t *vq = &vd->vdev_queue;
5570
5571		mutex_enter(&vq->vq_lock);
5572		if (vq->vq_active > 0) {
5573			spa_t *spa = vd->vdev_spa;
5574			zio_t *fio;
5575			uint64_t delta;
5576
5577			zfs_dbgmsg("slow vdev: %s has %u active IOs",
5578			    vd->vdev_path, vq->vq_active);
5579
			/*
			 * Look at the I/O at the head of the active queue;
			 * if it has been outstanding for longer than
			 * spa_deadman_synctime, invoke the deadman logic.
			 */
5585			fio = list_head(&vq->vq_active_list);
5586			delta = gethrtime() - fio->io_timestamp;
5587			if (delta > spa_deadman_synctime(spa))
5588				zio_deadman(fio, tag);
5589		}
5590		mutex_exit(&vq->vq_lock);
5591	}
5592}
5593
5594void
5595vdev_defer_resilver(vdev_t *vd)
5596{
5597	ASSERT(vd->vdev_ops->vdev_op_leaf);
5598
5599	vd->vdev_resilver_deferred = B_TRUE;
5600	vd->vdev_spa->spa_resilver_deferred = B_TRUE;
5601}
5602
5603/*
5604 * Clears the resilver deferred flag on all leaf devs under vd. Returns
5605 * B_TRUE if we have devices that need to be resilvered and are available to
5606 * accept resilver I/Os.
5607 */
5608boolean_t
5609vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
5610{
5611	boolean_t resilver_needed = B_FALSE;
5612	spa_t *spa = vd->vdev_spa;
5613
5614	for (int c = 0; c < vd->vdev_children; c++) {
5615		vdev_t *cvd = vd->vdev_child[c];
5616		resilver_needed |= vdev_clear_resilver_deferred(cvd, tx);
5617	}
5618
5619	if (vd == spa->spa_root_vdev &&
5620	    spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
5621		spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
5622		vdev_config_dirty(vd);
5623		spa->spa_resilver_deferred = B_FALSE;
5624		return (resilver_needed);
5625	}
5626
5627	if (!vdev_is_concrete(vd) || vd->vdev_aux ||
5628	    !vd->vdev_ops->vdev_op_leaf)
5629		return (resilver_needed);
5630
5631	vd->vdev_resilver_deferred = B_FALSE;
5632
5633	return (!vdev_is_dead(vd) && !vd->vdev_offline &&
5634	    vdev_resilver_needed(vd, NULL, NULL));
5635}
5636
5637boolean_t
5638vdev_xlate_is_empty(range_seg64_t *rs)
5639{
5640	return (rs->rs_start == rs->rs_end);
5641}
5642
/*
 * Translate a logical range to the first contiguous physical range for the
 * specified vdev_t.  This function is initially called with a leaf vdev and
 * will walk each parent vdev until it reaches a top-level vdev.  Once the
 * top-level vdev is reached, the physical range is initialized and the
 * recursive function begins to unwind.  As it unwinds, it calls the parent's
 * vdev-specific translation function to do the real conversion.
 */
5651void
5652vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
5653    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
5654{
5655	/*
5656	 * Walk up the vdev tree
5657	 */
5658	if (vd != vd->vdev_top) {
5659		vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
5660		    remain_rs);
5661	} else {
		/*
		 * We've reached the top-level vdev.  Initialize the physical
		 * range to the logical range, set an empty remaining range,
		 * and start to unwind.
		 */
5667		physical_rs->rs_start = logical_rs->rs_start;
5668		physical_rs->rs_end = logical_rs->rs_end;
5669
5670		remain_rs->rs_start = logical_rs->rs_start;
5671		remain_rs->rs_end = logical_rs->rs_start;
5672
5673		return;
5674	}
5675
5676	vdev_t *pvd = vd->vdev_parent;
5677	ASSERT3P(pvd, !=, NULL);
5678	ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
5679
5680	/*
5681	 * As this recursive function unwinds, translate the logical
5682	 * range into its physical and any remaining components by calling
5683	 * the vdev specific translate function.
5684	 */
5685	range_seg64_t intermediate = { 0 };
5686	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
5687
5688	physical_rs->rs_start = intermediate.rs_start;
5689	physical_rs->rs_end = intermediate.rs_end;
5690}
5691
5692void
5693vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
5694    vdev_xlate_func_t *func, void *arg)
5695{
5696	range_seg64_t iter_rs = *logical_rs;
5697	range_seg64_t physical_rs;
5698	range_seg64_t remain_rs;
5699
5700	while (!vdev_xlate_is_empty(&iter_rs)) {
5701
5702		vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
5703
		/*
		 * With raidz and dRAID, it's possible that the logical range
		 * does not live on this leaf vdev.  Only call the provided
		 * function when the translated physical range is non-empty.
		 */
5709		if (!vdev_xlate_is_empty(&physical_rs))
5710			func(arg, &physical_rs);
5711
5712		iter_rs = remain_rs;
5713	}
5714}
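
/*
 * Hypothetical caller sketch (names are illustrative, not from this file):
 * consumers such as trim and initialize pass a callback that accumulates
 * each translated physical segment.  A minimal callback that just sums the
 * physical bytes could look like:
 *
 *	static void
 *	example_xlate_cb(void *arg, range_seg64_t *physical_rs)
 *	{
 *		uint64_t *total = arg;
 *		*total += physical_rs->rs_end - physical_rs->rs_start;
 *	}
 *
 *	uint64_t total = 0;
 *	vdev_xlate_walk(vd, &logical_rs, example_xlate_cb, &total);
 */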
5715
5716static char *
5717vdev_name(vdev_t *vd, char *buf, int buflen)
5718{
5719	if (vd->vdev_path == NULL) {
5720		if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) {
5721			strlcpy(buf, vd->vdev_spa->spa_name, buflen);
5722		} else if (!vd->vdev_ops->vdev_op_leaf) {
5723			snprintf(buf, buflen, "%s-%llu",
5724			    vd->vdev_ops->vdev_op_type,
5725			    (u_longlong_t)vd->vdev_id);
5726		}
5727	} else {
5728		strlcpy(buf, vd->vdev_path, buflen);
5729	}
5730	return (buf);
5731}
5732
5733/*
5734 * Look at the vdev tree and determine whether any devices are currently being
5735 * replaced.
5736 */
5737boolean_t
5738vdev_replace_in_progress(vdev_t *vdev)
5739{
5740	ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0);
5741
5742	if (vdev->vdev_ops == &vdev_replacing_ops)
5743		return (B_TRUE);
5744
5745	/*
5746	 * A 'spare' vdev indicates that we have a replace in progress, unless
5747	 * it has exactly two children, and the second, the hot spare, has
5748	 * finished being resilvered.
5749	 */
5750	if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 ||
5751	    !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING)))
5752		return (B_TRUE);
5753
5754	for (int i = 0; i < vdev->vdev_children; i++) {
5755		if (vdev_replace_in_progress(vdev->vdev_child[i]))
5756			return (B_TRUE);
5757	}
5758
5759	return (B_FALSE);
5760}
5761
/*
 * Add a property to an nvlist: the entry is keyed by 'propname' and contains
 * a nested nvlist holding the value (strval or intval) and its source.
 */
5765static void
5766vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval,
5767    uint64_t intval, zprop_source_t src)
5768{
5769	nvlist_t *propval;
5770
5771	propval = fnvlist_alloc();
5772	fnvlist_add_uint64(propval, ZPROP_SOURCE, src);
5773
5774	if (strval != NULL)
5775		fnvlist_add_string(propval, ZPROP_VALUE, strval);
5776	else
5777		fnvlist_add_uint64(propval, ZPROP_VALUE, intval);
5778
5779	fnvlist_add_nvlist(nvl, propname, propval);
5780	nvlist_free(propval);
5781}
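
/*
 * Resulting layout in 'nvl' (illustrative):
 *
 *	"<propname>" -> {
 *		ZPROP_SOURCE ("source") = src
 *		ZPROP_VALUE ("value")   = strval or intval
 *	}
 */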
5782
5783static void
5784vdev_props_set_sync(void *arg, dmu_tx_t *tx)
5785{
5786	vdev_t *vd;
5787	nvlist_t *nvp = arg;
5788	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
5789	objset_t *mos = spa->spa_meta_objset;
5790	nvpair_t *elem = NULL;
5791	uint64_t vdev_guid;
5792	uint64_t objid;
5793	nvlist_t *nvprops;
5794
5795	vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV);
5796	nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS);
5797	vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
5798
5799	/* this vdev could get removed while waiting for this sync task */
5800	if (vd == NULL)
5801		return;
5802
	/*
	 * Set vdev property values in the vdev props MOS object.
	 */
5806	if (vd->vdev_root_zap != 0) {
5807		objid = vd->vdev_root_zap;
5808	} else if (vd->vdev_top_zap != 0) {
5809		objid = vd->vdev_top_zap;
5810	} else if (vd->vdev_leaf_zap != 0) {
5811		objid = vd->vdev_leaf_zap;
5812	} else {
5813		panic("unexpected vdev type");
5814	}
5815
5816	mutex_enter(&spa->spa_props_lock);
5817
5818	while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
5819		uint64_t intval;
5820		const char *strval;
5821		vdev_prop_t prop;
5822		const char *propname = nvpair_name(elem);
5823		zprop_type_t proptype;
5824
5825		switch (prop = vdev_name_to_prop(propname)) {
5826		case VDEV_PROP_USERPROP:
5827			if (vdev_prop_user(propname)) {
5828				strval = fnvpair_value_string(elem);
5829				if (strlen(strval) == 0) {
5830					/* remove the property if value == "" */
5831					(void) zap_remove(mos, objid, propname,
5832					    tx);
5833				} else {
5834					VERIFY0(zap_update(mos, objid, propname,
5835					    1, strlen(strval) + 1, strval, tx));
5836				}
5837				spa_history_log_internal(spa, "vdev set", tx,
5838				    "vdev_guid=%llu: %s=%s",
5839				    (u_longlong_t)vdev_guid, nvpair_name(elem),
5840				    strval);
5841			}
5842			break;
5843		default:
5844			/* normalize the property name */
5845			propname = vdev_prop_to_name(prop);
5846			proptype = vdev_prop_get_type(prop);
5847
5848			if (nvpair_type(elem) == DATA_TYPE_STRING) {
5849				ASSERT(proptype == PROP_TYPE_STRING);
5850				strval = fnvpair_value_string(elem);
5851				VERIFY0(zap_update(mos, objid, propname,
5852				    1, strlen(strval) + 1, strval, tx));
5853				spa_history_log_internal(spa, "vdev set", tx,
5854				    "vdev_guid=%llu: %s=%s",
5855				    (u_longlong_t)vdev_guid, nvpair_name(elem),
5856				    strval);
5857			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5858				intval = fnvpair_value_uint64(elem);
5859
5860				if (proptype == PROP_TYPE_INDEX) {
5861					const char *unused;
5862					VERIFY0(vdev_prop_index_to_string(
5863					    prop, intval, &unused));
5864				}
5865				VERIFY0(zap_update(mos, objid, propname,
5866				    sizeof (uint64_t), 1, &intval, tx));
5867				spa_history_log_internal(spa, "vdev set", tx,
5868				    "vdev_guid=%llu: %s=%lld",
5869				    (u_longlong_t)vdev_guid,
5870				    nvpair_name(elem), (longlong_t)intval);
5871			} else {
5872				panic("invalid vdev property type %u",
5873				    nvpair_type(elem));
5874			}
5875		}
5876
5877	}
5878
5879	mutex_exit(&spa->spa_props_lock);
5880}
5881
5882int
5883vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
5884{
5885	spa_t *spa = vd->vdev_spa;
5886	nvpair_t *elem = NULL;
5887	uint64_t vdev_guid;
5888	nvlist_t *nvprops;
5889	int error = 0;
5890
5891	ASSERT(vd != NULL);
5892
5893	/* Check that vdev has a zap we can use */
5894	if (vd->vdev_root_zap == 0 &&
5895	    vd->vdev_top_zap == 0 &&
5896	    vd->vdev_leaf_zap == 0)
5897		return (SET_ERROR(EINVAL));
5898
5899	if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV,
5900	    &vdev_guid) != 0)
5901		return (SET_ERROR(EINVAL));
5902
5903	if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS,
5904	    &nvprops) != 0)
5905		return (SET_ERROR(EINVAL));
5906
5907	if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL)
5908		return (SET_ERROR(EINVAL));
5909
5910	while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
5911		const char *propname = nvpair_name(elem);
5912		vdev_prop_t prop = vdev_name_to_prop(propname);
5913		uint64_t intval = 0;
5914		const char *strval = NULL;
5915
5916		if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) {
5917			error = EINVAL;
5918			goto end;
5919		}
5920
5921		if (vdev_prop_readonly(prop)) {
5922			error = EROFS;
5923			goto end;
5924		}
5925
5926		/* Special Processing */
5927		switch (prop) {
5928		case VDEV_PROP_PATH:
5929			if (vd->vdev_path == NULL) {
5930				error = EROFS;
5931				break;
5932			}
5933			if (nvpair_value_string(elem, &strval) != 0) {
5934				error = EINVAL;
5935				break;
5936			}
5937			/* New path must start with /dev/ */
5938			if (strncmp(strval, "/dev/", 5)) {
5939				error = EINVAL;
5940				break;
5941			}
5942			error = spa_vdev_setpath(spa, vdev_guid, strval);
5943			break;
5944		case VDEV_PROP_ALLOCATING:
5945			if (nvpair_value_uint64(elem, &intval) != 0) {
5946				error = EINVAL;
5947				break;
5948			}
5949			if (intval != vd->vdev_noalloc)
5950				break;
5951			if (intval == 0)
5952				error = spa_vdev_noalloc(spa, vdev_guid);
5953			else
5954				error = spa_vdev_alloc(spa, vdev_guid);
5955			break;
5956		case VDEV_PROP_FAILFAST:
5957			if (nvpair_value_uint64(elem, &intval) != 0) {
5958				error = EINVAL;
5959				break;
5960			}
5961			vd->vdev_failfast = intval & 1;
5962			break;
5963		case VDEV_PROP_CHECKSUM_N:
5964			if (nvpair_value_uint64(elem, &intval) != 0) {
5965				error = EINVAL;
5966				break;
5967			}
5968			vd->vdev_checksum_n = intval;
5969			break;
5970		case VDEV_PROP_CHECKSUM_T:
5971			if (nvpair_value_uint64(elem, &intval) != 0) {
5972				error = EINVAL;
5973				break;
5974			}
5975			vd->vdev_checksum_t = intval;
5976			break;
5977		case VDEV_PROP_IO_N:
5978			if (nvpair_value_uint64(elem, &intval) != 0) {
5979				error = EINVAL;
5980				break;
5981			}
5982			vd->vdev_io_n = intval;
5983			break;
5984		case VDEV_PROP_IO_T:
5985			if (nvpair_value_uint64(elem, &intval) != 0) {
5986				error = EINVAL;
5987				break;
5988			}
5989			vd->vdev_io_t = intval;
5990			break;
5991		case VDEV_PROP_SLOW_IO_N:
5992			if (nvpair_value_uint64(elem, &intval) != 0) {
5993				error = EINVAL;
5994				break;
5995			}
5996			vd->vdev_slow_io_n = intval;
5997			break;
5998		case VDEV_PROP_SLOW_IO_T:
5999			if (nvpair_value_uint64(elem, &intval) != 0) {
6000				error = EINVAL;
6001				break;
6002			}
6003			vd->vdev_slow_io_t = intval;
6004			break;
6005		default:
6006			/* Most processing is done in vdev_props_set_sync */
6007			break;
6008		}
6009end:
6010		if (error != 0) {
6011			intval = error;
6012			vdev_prop_add_list(outnvl, propname, strval, intval, 0);
6013			return (error);
6014		}
6015	}
6016
6017	return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync,
6018	    innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED));
6019}
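
/*
 * Hypothetical sketch of the 'innvl' a caller would construct (real requests
 * arrive through the zfs ioctl layer; "failfast" is used here only as an
 * example of a settable numeric property):
 *
 *	nvlist_t *props = fnvlist_alloc();
 *	fnvlist_add_uint64(props, "failfast", 1);
 *
 *	nvlist_t *innvl = fnvlist_alloc();
 *	fnvlist_add_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, vdev_guid);
 *	fnvlist_add_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS, props);
 *
 *	error = vdev_prop_set(vd, innvl, outnvl);
 */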
6020
6021int
6022vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
6023{
6024	spa_t *spa = vd->vdev_spa;
6025	objset_t *mos = spa->spa_meta_objset;
6026	int err = 0;
6027	uint64_t objid;
6028	uint64_t vdev_guid;
6029	nvpair_t *elem = NULL;
6030	nvlist_t *nvprops = NULL;
6031	uint64_t intval = 0;
6032	char *strval = NULL;
6033	const char *propname = NULL;
6034	vdev_prop_t prop;
6035
6036	ASSERT(vd != NULL);
6037	ASSERT(mos != NULL);
6038
6039	if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV,
6040	    &vdev_guid) != 0)
6041		return (SET_ERROR(EINVAL));
6042
6043	nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops);
6044
6045	if (vd->vdev_root_zap != 0) {
6046		objid = vd->vdev_root_zap;
6047	} else if (vd->vdev_top_zap != 0) {
6048		objid = vd->vdev_top_zap;
6049	} else if (vd->vdev_leaf_zap != 0) {
6050		objid = vd->vdev_leaf_zap;
6051	} else {
6052		return (SET_ERROR(EINVAL));
6053	}
6054	ASSERT(objid != 0);
6055
6056	mutex_enter(&spa->spa_props_lock);
6057
6058	if (nvprops != NULL) {
6059		char namebuf[64] = { 0 };
6060
6061		while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
6062			intval = 0;
6063			strval = NULL;
6064			propname = nvpair_name(elem);
6065			prop = vdev_name_to_prop(propname);
6066			zprop_source_t src = ZPROP_SRC_DEFAULT;
6067			uint64_t integer_size, num_integers;
6068
6069			switch (prop) {
6070			/* Special Read-only Properties */
6071			case VDEV_PROP_NAME:
6072				strval = vdev_name(vd, namebuf,
6073				    sizeof (namebuf));
6074				if (strval == NULL)
6075					continue;
6076				vdev_prop_add_list(outnvl, propname, strval, 0,
6077				    ZPROP_SRC_NONE);
6078				continue;
6079			case VDEV_PROP_CAPACITY:
6080				/* percent used */
6081				intval = (vd->vdev_stat.vs_dspace == 0) ? 0 :
6082				    (vd->vdev_stat.vs_alloc * 100 /
6083				    vd->vdev_stat.vs_dspace);
6084				vdev_prop_add_list(outnvl, propname, NULL,
6085				    intval, ZPROP_SRC_NONE);
6086				continue;
6087			case VDEV_PROP_STATE:
6088				vdev_prop_add_list(outnvl, propname, NULL,
6089				    vd->vdev_state, ZPROP_SRC_NONE);
6090				continue;
6091			case VDEV_PROP_GUID:
6092				vdev_prop_add_list(outnvl, propname, NULL,
6093				    vd->vdev_guid, ZPROP_SRC_NONE);
6094				continue;
6095			case VDEV_PROP_ASIZE:
6096				vdev_prop_add_list(outnvl, propname, NULL,
6097				    vd->vdev_asize, ZPROP_SRC_NONE);
6098				continue;
6099			case VDEV_PROP_PSIZE:
6100				vdev_prop_add_list(outnvl, propname, NULL,
6101				    vd->vdev_psize, ZPROP_SRC_NONE);
6102				continue;
6103			case VDEV_PROP_ASHIFT:
6104				vdev_prop_add_list(outnvl, propname, NULL,
6105				    vd->vdev_ashift, ZPROP_SRC_NONE);
6106				continue;
6107			case VDEV_PROP_SIZE:
6108				vdev_prop_add_list(outnvl, propname, NULL,
6109				    vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE);
6110				continue;
6111			case VDEV_PROP_FREE:
6112				vdev_prop_add_list(outnvl, propname, NULL,
6113				    vd->vdev_stat.vs_dspace -
6114				    vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
6115				continue;
6116			case VDEV_PROP_ALLOCATED:
6117				vdev_prop_add_list(outnvl, propname, NULL,
6118				    vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
6119				continue;
6120			case VDEV_PROP_EXPANDSZ:
6121				vdev_prop_add_list(outnvl, propname, NULL,
6122				    vd->vdev_stat.vs_esize, ZPROP_SRC_NONE);
6123				continue;
6124			case VDEV_PROP_FRAGMENTATION:
6125				vdev_prop_add_list(outnvl, propname, NULL,
6126				    vd->vdev_stat.vs_fragmentation,
6127				    ZPROP_SRC_NONE);
6128				continue;
6129			case VDEV_PROP_PARITY:
6130				vdev_prop_add_list(outnvl, propname, NULL,
6131				    vdev_get_nparity(vd), ZPROP_SRC_NONE);
6132				continue;
6133			case VDEV_PROP_PATH:
6134				if (vd->vdev_path == NULL)
6135					continue;
6136				vdev_prop_add_list(outnvl, propname,
6137				    vd->vdev_path, 0, ZPROP_SRC_NONE);
6138				continue;
6139			case VDEV_PROP_DEVID:
6140				if (vd->vdev_devid == NULL)
6141					continue;
6142				vdev_prop_add_list(outnvl, propname,
6143				    vd->vdev_devid, 0, ZPROP_SRC_NONE);
6144				continue;
6145			case VDEV_PROP_PHYS_PATH:
6146				if (vd->vdev_physpath == NULL)
6147					continue;
6148				vdev_prop_add_list(outnvl, propname,
6149				    vd->vdev_physpath, 0, ZPROP_SRC_NONE);
6150				continue;
6151			case VDEV_PROP_ENC_PATH:
6152				if (vd->vdev_enc_sysfs_path == NULL)
6153					continue;
6154				vdev_prop_add_list(outnvl, propname,
6155				    vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE);
6156				continue;
6157			case VDEV_PROP_FRU:
6158				if (vd->vdev_fru == NULL)
6159					continue;
6160				vdev_prop_add_list(outnvl, propname,
6161				    vd->vdev_fru, 0, ZPROP_SRC_NONE);
6162				continue;
6163			case VDEV_PROP_PARENT:
6164				if (vd->vdev_parent != NULL) {
6165					strval = vdev_name(vd->vdev_parent,
6166					    namebuf, sizeof (namebuf));
6167					vdev_prop_add_list(outnvl, propname,
6168					    strval, 0, ZPROP_SRC_NONE);
6169				}
6170				continue;
6171			case VDEV_PROP_CHILDREN:
6172				if (vd->vdev_children > 0)
6173					strval = kmem_zalloc(ZAP_MAXVALUELEN,
6174					    KM_SLEEP);
6175				for (uint64_t i = 0; i < vd->vdev_children;
6176				    i++) {
6177					const char *vname;
6178
6179					vname = vdev_name(vd->vdev_child[i],
6180					    namebuf, sizeof (namebuf));
6181					if (vname == NULL)
6182						vname = "(unknown)";
6183					if (strlen(strval) > 0)
6184						strlcat(strval, ",",
6185						    ZAP_MAXVALUELEN);
6186					strlcat(strval, vname, ZAP_MAXVALUELEN);
6187				}
6188				if (strval != NULL) {
6189					vdev_prop_add_list(outnvl, propname,
6190					    strval, 0, ZPROP_SRC_NONE);
6191					kmem_free(strval, ZAP_MAXVALUELEN);
6192				}
6193				continue;
6194			case VDEV_PROP_NUMCHILDREN:
6195				vdev_prop_add_list(outnvl, propname, NULL,
6196				    vd->vdev_children, ZPROP_SRC_NONE);
6197				continue;
6198			case VDEV_PROP_READ_ERRORS:
6199				vdev_prop_add_list(outnvl, propname, NULL,
6200				    vd->vdev_stat.vs_read_errors,
6201				    ZPROP_SRC_NONE);
6202				continue;
6203			case VDEV_PROP_WRITE_ERRORS:
6204				vdev_prop_add_list(outnvl, propname, NULL,
6205				    vd->vdev_stat.vs_write_errors,
6206				    ZPROP_SRC_NONE);
6207				continue;
6208			case VDEV_PROP_CHECKSUM_ERRORS:
6209				vdev_prop_add_list(outnvl, propname, NULL,
6210				    vd->vdev_stat.vs_checksum_errors,
6211				    ZPROP_SRC_NONE);
6212				continue;
6213			case VDEV_PROP_INITIALIZE_ERRORS:
6214				vdev_prop_add_list(outnvl, propname, NULL,
6215				    vd->vdev_stat.vs_initialize_errors,
6216				    ZPROP_SRC_NONE);
6217				continue;
6218			case VDEV_PROP_OPS_NULL:
6219				vdev_prop_add_list(outnvl, propname, NULL,
6220				    vd->vdev_stat.vs_ops[ZIO_TYPE_NULL],
6221				    ZPROP_SRC_NONE);
6222				continue;
6223			case VDEV_PROP_OPS_READ:
6224				vdev_prop_add_list(outnvl, propname, NULL,
6225				    vd->vdev_stat.vs_ops[ZIO_TYPE_READ],
6226				    ZPROP_SRC_NONE);
6227				continue;
6228			case VDEV_PROP_OPS_WRITE:
6229				vdev_prop_add_list(outnvl, propname, NULL,
6230				    vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE],
6231				    ZPROP_SRC_NONE);
6232				continue;
6233			case VDEV_PROP_OPS_FREE:
6234				vdev_prop_add_list(outnvl, propname, NULL,
6235				    vd->vdev_stat.vs_ops[ZIO_TYPE_FREE],
6236				    ZPROP_SRC_NONE);
6237				continue;
6238			case VDEV_PROP_OPS_CLAIM:
6239				vdev_prop_add_list(outnvl, propname, NULL,
6240				    vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM],
6241				    ZPROP_SRC_NONE);
6242				continue;
6243			case VDEV_PROP_OPS_TRIM:
6244				/*
6245				 * TRIM ops and bytes are reported to user
6246				 * space as ZIO_TYPE_FLUSH.  This is done to
6247				 * preserve the vdev_stat_t structure layout
6248				 * for user space.
6249				 */
6250				vdev_prop_add_list(outnvl, propname, NULL,
6251				    vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH],
6252				    ZPROP_SRC_NONE);
6253				continue;
6254			case VDEV_PROP_BYTES_NULL:
6255				vdev_prop_add_list(outnvl, propname, NULL,
6256				    vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL],
6257				    ZPROP_SRC_NONE);
6258				continue;
6259			case VDEV_PROP_BYTES_READ:
6260				vdev_prop_add_list(outnvl, propname, NULL,
6261				    vd->vdev_stat.vs_bytes[ZIO_TYPE_READ],
6262				    ZPROP_SRC_NONE);
6263				continue;
6264			case VDEV_PROP_BYTES_WRITE:
6265				vdev_prop_add_list(outnvl, propname, NULL,
6266				    vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE],
6267				    ZPROP_SRC_NONE);
6268				continue;
6269			case VDEV_PROP_BYTES_FREE:
6270				vdev_prop_add_list(outnvl, propname, NULL,
6271				    vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE],
6272				    ZPROP_SRC_NONE);
6273				continue;
6274			case VDEV_PROP_BYTES_CLAIM:
6275				vdev_prop_add_list(outnvl, propname, NULL,
6276				    vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM],
6277				    ZPROP_SRC_NONE);
6278				continue;
6279			case VDEV_PROP_BYTES_TRIM:
6280				/*
6281				 * TRIM ops and bytes are reported to user
6282				 * space as ZIO_TYPE_FLUSH.  This is done to
6283				 * preserve the vdev_stat_t structure layout
6284				 * for user space.
6285				 */
6286				vdev_prop_add_list(outnvl, propname, NULL,
6287				    vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH],
6288				    ZPROP_SRC_NONE);
6289				continue;
6290			case VDEV_PROP_REMOVING:
6291				vdev_prop_add_list(outnvl, propname, NULL,
6292				    vd->vdev_removing, ZPROP_SRC_NONE);
6293				continue;
6294			case VDEV_PROP_RAIDZ_EXPANDING:
6295				/* Only expose this for raidz */
6296				if (vd->vdev_ops == &vdev_raidz_ops) {
6297					vdev_prop_add_list(outnvl, propname,
6298					    NULL, vd->vdev_rz_expanding,
6299					    ZPROP_SRC_NONE);
6300				}
6301				continue;
			/* Numeric Properties */
6303			case VDEV_PROP_ALLOCATING:
6304				/* Leaf vdevs cannot have this property */
6305				if (vd->vdev_mg == NULL &&
6306				    vd->vdev_top != NULL) {
6307					src = ZPROP_SRC_NONE;
6308					intval = ZPROP_BOOLEAN_NA;
6309				} else {
6310					err = vdev_prop_get_int(vd, prop,
6311					    &intval);
6312					if (err && err != ENOENT)
6313						break;
6314
6315					if (intval ==
6316					    vdev_prop_default_numeric(prop))
6317						src = ZPROP_SRC_DEFAULT;
6318					else
6319						src = ZPROP_SRC_LOCAL;
6320				}
6321
6322				vdev_prop_add_list(outnvl, propname, NULL,
6323				    intval, src);
6324				break;
6325			case VDEV_PROP_FAILFAST:
6326				src = ZPROP_SRC_LOCAL;
6327				strval = NULL;
6328
6329				err = zap_lookup(mos, objid, nvpair_name(elem),
6330				    sizeof (uint64_t), 1, &intval);
6331				if (err == ENOENT) {
6332					intval = vdev_prop_default_numeric(
6333					    prop);
6334					err = 0;
6335				} else if (err) {
6336					break;
6337				}
6338				if (intval == vdev_prop_default_numeric(prop))
6339					src = ZPROP_SRC_DEFAULT;
6340
6341				vdev_prop_add_list(outnvl, propname, strval,
6342				    intval, src);
6343				break;
6344			case VDEV_PROP_CHECKSUM_N:
6345			case VDEV_PROP_CHECKSUM_T:
6346			case VDEV_PROP_IO_N:
6347			case VDEV_PROP_IO_T:
6348			case VDEV_PROP_SLOW_IO_N:
6349			case VDEV_PROP_SLOW_IO_T:
6350				err = vdev_prop_get_int(vd, prop, &intval);
6351				if (err && err != ENOENT)
6352					break;
6353
6354				if (intval == vdev_prop_default_numeric(prop))
6355					src = ZPROP_SRC_DEFAULT;
6356				else
6357					src = ZPROP_SRC_LOCAL;
6358
6359				vdev_prop_add_list(outnvl, propname, NULL,
6360				    intval, src);
6361				break;
6362			/* Text Properties */
6363			case VDEV_PROP_COMMENT:
6364				/* Exists in the ZAP below */
6365				/* FALLTHRU */
6366			case VDEV_PROP_USERPROP:
				/* User Properties */
6368				src = ZPROP_SRC_LOCAL;
6369
6370				err = zap_length(mos, objid, nvpair_name(elem),
6371				    &integer_size, &num_integers);
6372				if (err)
6373					break;
6374
6375				switch (integer_size) {
6376				case 8:
6377					/* User properties cannot be integers */
6378					err = EINVAL;
6379					break;
6380				case 1:
6381					/* string property */
6382					strval = kmem_alloc(num_integers,
6383					    KM_SLEEP);
6384					err = zap_lookup(mos, objid,
6385					    nvpair_name(elem), 1,
6386					    num_integers, strval);
6387					if (err) {
6388						kmem_free(strval,
6389						    num_integers);
6390						break;
6391					}
6392					vdev_prop_add_list(outnvl, propname,
6393					    strval, 0, src);
6394					kmem_free(strval, num_integers);
6395					break;
6396				}
6397				break;
6398			default:
6399				err = ENOENT;
6400				break;
6401			}
6402			if (err)
6403				break;
6404		}
6405	} else {
6406		/*
6407		 * Get all properties from the MOS vdev property object.
6408		 */
6409		zap_cursor_t zc;
6410		zap_attribute_t za;
6411		for (zap_cursor_init(&zc, mos, objid);
6412		    (err = zap_cursor_retrieve(&zc, &za)) == 0;
6413		    zap_cursor_advance(&zc)) {
6414			intval = 0;
6415			strval = NULL;
6416			zprop_source_t src = ZPROP_SRC_DEFAULT;
6417			propname = za.za_name;
6418
6419			switch (za.za_integer_length) {
6420			case 8:
6421				/* We do not allow integer user properties */
6422				/* This is likely an internal value */
6423				break;
6424			case 1:
6425				/* string property */
6426				strval = kmem_alloc(za.za_num_integers,
6427				    KM_SLEEP);
6428				err = zap_lookup(mos, objid, za.za_name, 1,
6429				    za.za_num_integers, strval);
6430				if (err) {
6431					kmem_free(strval, za.za_num_integers);
6432					break;
6433				}
6434				vdev_prop_add_list(outnvl, propname, strval, 0,
6435				    src);
6436				kmem_free(strval, za.za_num_integers);
6437				break;
6438
6439			default:
6440				break;
6441			}
6442		}
6443		zap_cursor_fini(&zc);
6444	}
6445
6446	mutex_exit(&spa->spa_props_lock);
6447	if (err && err != ENOENT) {
6448		return (err);
6449	}
6450
6451	return (0);
6452}
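
/*
 * Hypothetical sketch of a 'get' request (illustrative only): asking for a
 * single named property.  If ZPOOL_VDEV_PROPS_GET_PROPS is omitted, the
 * function instead walks the vdev's props ZAP and returns what it finds
 * there.
 *
 *	nvlist_t *props = fnvlist_alloc();
 *	fnvlist_add_boolean(props, "state");
 *
 *	nvlist_t *innvl = fnvlist_alloc();
 *	fnvlist_add_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, vdev_guid);
 *	fnvlist_add_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, props);
 *
 *	error = vdev_prop_get(vd, innvl, outnvl);
 */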
6453
6454EXPORT_SYMBOL(vdev_fault);
6455EXPORT_SYMBOL(vdev_degrade);
6456EXPORT_SYMBOL(vdev_online);
6457EXPORT_SYMBOL(vdev_offline);
6458EXPORT_SYMBOL(vdev_clear);
6459
6460ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW,
6461	"Target number of metaslabs per top-level vdev");
6462
6463ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW,
6464	"Default lower limit for metaslab size");
6465
6466ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW,
6467	"Default upper limit for metaslab size");
6468
6469ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW,
6470	"Minimum number of metaslabs per top-level vdev");
6471
6472ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW,
6473	"Practical upper limit of total metaslabs per top-level vdev");
6474
6475ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
6476	"Rate limit slow IO (delay) events to this many per second");
6477
6478/* BEGIN CSTYLED */
6479ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
6480	"Rate limit checksum events to this many checksum errors per second "
6481	"(do not set below ZED threshold).");
6482/* END CSTYLED */
6483
6484ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW,
6485	"Ignore errors during resilver/scrub");
6486
6487ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
6488	"Bypass vdev_validate()");
6489
6490ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
6491	"Disable cache flushes");
6492
6493ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW,
6494	"Minimum number of metaslabs required to dedicate one for log blocks");
6495
6496/* BEGIN CSTYLED */
6497ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
6498	param_set_min_auto_ashift, param_get_uint, ZMOD_RW,
6499	"Minimum ashift used when creating new top-level vdevs");
6500
6501ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift,
6502	param_set_max_auto_ashift, param_get_uint, ZMOD_RW,
6503	"Maximum ashift used when optimizing for logical -> physical sector "
6504	"size on new top-level vdevs");
6505/* END CSTYLED */
6506