metaslab.c revision 307268
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/dmu.h>
30#include <sys/dmu_tx.h>
31#include <sys/space_map.h>
32#include <sys/metaslab_impl.h>
33#include <sys/vdev_impl.h>
34#include <sys/zio.h>
35#include <sys/spa_impl.h>
36#include <sys/zfeature.h>
37
38SYSCTL_DECL(_vfs_zfs);
39SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
40
41/*
42 * Allow allocations to switch to gang blocks quickly. We do this to
43 * avoid having to load lots of space_maps in a given txg. There are,
44 * however, some cases where we want to avoid "fast" ganging and instead
45 * we want to do an exhaustive search of all metaslabs on this device.
46 * Currently we don't allow any gang, slog, or dump device related allocations
47 * to "fast" gang.
48 */
49#define	CAN_FASTGANG(flags) \
50	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
51	METASLAB_GANG_AVOID)))
52
53#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
54#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
55#define	METASLAB_ACTIVE_MASK		\
56	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
57
58uint64_t metaslab_aliquot = 512ULL << 10;
59uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
60TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang);
61SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
62    &metaslab_gang_bang, 0,
63    "Force gang block allocation for blocks larger than or equal to this value");
64
65/*
66 * The in-core space map representation is more compact than its on-disk form.
67 * The zfs_condense_pct determines how much more compact the in-core
68 * space_map representation must be before we compact it on-disk.
69 * Values should be greater than or equal to 100.
70 */
71int zfs_condense_pct = 200;
72TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct);
73SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
74    &zfs_condense_pct, 0,
75    "Condense on-disk spacemap when it is more than this many percent"
76    " of in-memory counterpart");
77
78/*
79 * Condensing a metaslab is not guaranteed to actually reduce the amount of
80 * space used on disk. In particular, a space map uses data in increments of
81 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
82 * same number of blocks after condensing. Since the goal of condensing is to
83 * reduce the number of IOPs required to read the space map, we only want to
84 * condense when we can be sure we will reduce the number of blocks used by the
85 * space map. Unfortunately, we cannot precisely compute whether or not this is
86 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
87 * we apply the following heuristic: do not condense a spacemap unless the
88 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
89 * blocks.
90 */
91int zfs_metaslab_condense_block_threshold = 4;
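
/*
 * Illustrative example (not part of the heuristic itself): with the default
 * threshold of 4 and a space map block size of, say, 4KB, a space map whose
 * on-disk length is 16KB or less is never condensed, since condensing it
 * could not reduce the number of blocks (and hence IOPs) needed to read it.
 */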
92
93/*
94 * The zfs_mg_noalloc_threshold defines which metaslab groups should
95 * be eligible for allocation. The value is defined as a percentage of
96 * free space. Metaslab groups that have more free space than
97 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
98 * a metaslab group's free space is less than or equal to the
99 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
100 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
101 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
102 * groups are allowed to accept allocations. Gang blocks are always
103 * eligible to allocate on any metaslab group. The default value of 0 means
104 * no metaslab group will be excluded based on this criterion.
105 */
106int zfs_mg_noalloc_threshold = 0;
107TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold);
108SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
109    &zfs_mg_noalloc_threshold, 0,
110    "Percentage of metaslab group size that should be free"
111    " to make it eligible for allocation");
112
113/*
114 * Metaslab groups are considered eligible for allocations if their
115 * fragmentation metric (measured as a percentage) is less than or equal to
116 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
117 * then it will be skipped unless all metaslab groups within the metaslab
118 * class have also crossed this threshold.
119 */
120int zfs_mg_fragmentation_threshold = 85;
121TUNABLE_INT("vfs.zfs.mg_fragmentation_threshold", &zfs_mg_fragmentation_threshold);
122SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
123    &zfs_mg_fragmentation_threshold, 0,
124    "Maximum fragmentation percentage of a metaslab group beyond which it is "
125    "skipped for allocations unless all metaslab groups within the metaslab class "
126    "have also crossed this threshold");
127
128/*
129 * Allow metaslabs to keep their active state as long as their fragmentation
130 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
131 * active metaslab that exceeds this threshold will no longer keep its active
132 * status allowing better metaslabs to be selected.
133 */
134int zfs_metaslab_fragmentation_threshold = 70;
135TUNABLE_INT("vfs.zfs.metaslab.fragmentation_threshold",
136    &zfs_metaslab_fragmentation_threshold);
137SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
138    &zfs_metaslab_fragmentation_threshold, 0,
139    "Maximum fragmentation percentage a metaslab may have and still keep its active state");
140
141/*
142 * When set will load all metaslabs when pool is first opened.
143 */
144int metaslab_debug_load = 0;
145TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load);
146SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
147    &metaslab_debug_load, 0,
148    "Load all metaslabs when pool is first opened");
149
150/*
151 * When set will prevent metaslabs from being unloaded.
152 */
153int metaslab_debug_unload = 0;
154TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload);
155SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
156    &metaslab_debug_unload, 0,
157    "Prevent metaslabs from being unloaded");
158
159/*
160 * Minimum size which forces the dynamic allocator to change
161 * its allocation strategy.  Once the space map cannot satisfy
162 * an allocation of this size then it switches to using a more
163 * aggressive strategy (i.e. search by size rather than offset).
164 */
165uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
166TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold",
167    &metaslab_df_alloc_threshold);
168SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
169    &metaslab_df_alloc_threshold, 0,
170    "Minimum size which forces the dynamic allocator to change its allocation strategy");
171
172/*
173 * The minimum free space, in percent, which must be available
174 * in a space map to continue allocations in a first-fit fashion.
175 * Once the space_map's free space drops below this level we dynamically
176 * switch to using best-fit allocations.
177 */
178int metaslab_df_free_pct = 4;
179TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct);
180SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
181    &metaslab_df_free_pct, 0,
182    "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion");
183
184/*
185 * A metaslab is considered "free" if it contains a contiguous
186 * segment which is greater than metaslab_min_alloc_size.
187 */
188uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
189TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size",
190    &metaslab_min_alloc_size);
191SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
192    &metaslab_min_alloc_size, 0,
193    "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size");
194
195/*
196 * Percentage of all cpus that can be used by the metaslab taskq.
197 */
198int metaslab_load_pct = 50;
199TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct);
200SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
201    &metaslab_load_pct, 0,
202    "Percentage of cpus that can be used by the metaslab taskq");
203
204/*
205 * Determines how many txgs a metaslab may remain loaded without having any
206 * allocations from it. As long as a metaslab continues to be used we will
207 * keep it loaded.
208 */
209int metaslab_unload_delay = TXG_SIZE * 2;
210TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay);
211SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
212    &metaslab_unload_delay, 0,
213    "Number of TXGs that an unused metaslab can be kept in memory");
214
215/*
216 * Max number of metaslabs per group to preload.
217 */
218int metaslab_preload_limit = SPA_DVAS_PER_BP;
219TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit);
220SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
221    &metaslab_preload_limit, 0,
222    "Max number of metaslabs per group to preload");
223
224/*
225 * Enable/disable preloading of metaslabs.
226 */
227boolean_t metaslab_preload_enabled = B_TRUE;
228TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled);
229SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
230    &metaslab_preload_enabled, 0,
231    "Enable/disable preloading of metaslabs");
232
233/*
234 * Enable/disable fragmentation weighting on metaslabs.
235 */
236boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
237TUNABLE_INT("vfs.zfs.metaslab_fragmentation_factor_enabled",
238    &metaslab_fragmentation_factor_enabled);
239SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
240    &metaslab_fragmentation_factor_enabled, 0,
241    "Enable fragmentation weighting on metaslabs");
242
243/*
244 * Enable/disable lba weighting (i.e. outer tracks are given preference).
245 */
246boolean_t metaslab_lba_weighting_enabled = B_TRUE;
247TUNABLE_INT("vfs.zfs.metaslab.lba_weighting_enabled",
248    &metaslab_lba_weighting_enabled);
249SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
250    &metaslab_lba_weighting_enabled, 0,
251    "Enable LBA weighting (i.e. outer tracks are given preference)");
252
253/*
254 * Enable/disable metaslab group biasing.
255 */
256boolean_t metaslab_bias_enabled = B_TRUE;
257TUNABLE_INT("vfs.zfs.metaslab.bias_enabled",
258    &metaslab_bias_enabled);
259SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
260    &metaslab_bias_enabled, 0,
261    "Enable metaslab group biasing");
262
263static uint64_t metaslab_fragmentation(metaslab_t *);
264
265/*
266 * ==========================================================================
267 * Metaslab classes
268 * ==========================================================================
269 */
270metaslab_class_t *
271metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
272{
273	metaslab_class_t *mc;
274
275	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
276
277	mc->mc_spa = spa;
278	mc->mc_rotor = NULL;
279	mc->mc_ops = ops;
280
281	return (mc);
282}
283
284void
285metaslab_class_destroy(metaslab_class_t *mc)
286{
287	ASSERT(mc->mc_rotor == NULL);
288	ASSERT(mc->mc_alloc == 0);
289	ASSERT(mc->mc_deferred == 0);
290	ASSERT(mc->mc_space == 0);
291	ASSERT(mc->mc_dspace == 0);
292
293	kmem_free(mc, sizeof (metaslab_class_t));
294}
295
296int
297metaslab_class_validate(metaslab_class_t *mc)
298{
299	metaslab_group_t *mg;
300	vdev_t *vd;
301
302	/*
303	 * Must hold one of the spa_config locks.
304	 */
305	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
306	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
307
308	if ((mg = mc->mc_rotor) == NULL)
309		return (0);
310
311	do {
312		vd = mg->mg_vd;
313		ASSERT(vd->vdev_mg != NULL);
314		ASSERT3P(vd->vdev_top, ==, vd);
315		ASSERT3P(mg->mg_class, ==, mc);
316		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
317	} while ((mg = mg->mg_next) != mc->mc_rotor);
318
319	return (0);
320}
321
322void
323metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
324    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
325{
326	atomic_add_64(&mc->mc_alloc, alloc_delta);
327	atomic_add_64(&mc->mc_deferred, defer_delta);
328	atomic_add_64(&mc->mc_space, space_delta);
329	atomic_add_64(&mc->mc_dspace, dspace_delta);
330}
331
332void
333metaslab_class_minblocksize_update(metaslab_class_t *mc)
334{
335	metaslab_group_t *mg;
336	vdev_t *vd;
337	uint64_t minashift = UINT64_MAX;
338
339	if ((mg = mc->mc_rotor) == NULL) {
340		mc->mc_minblocksize = SPA_MINBLOCKSIZE;
341		return;
342	}
343
344	do {
345		vd = mg->mg_vd;
346		if (vd->vdev_ashift < minashift)
347			minashift = vd->vdev_ashift;
348	} while ((mg = mg->mg_next) != mc->mc_rotor);
349
350	mc->mc_minblocksize = 1ULL << minashift;
351}
352
353uint64_t
354metaslab_class_get_alloc(metaslab_class_t *mc)
355{
356	return (mc->mc_alloc);
357}
358
359uint64_t
360metaslab_class_get_deferred(metaslab_class_t *mc)
361{
362	return (mc->mc_deferred);
363}
364
365uint64_t
366metaslab_class_get_space(metaslab_class_t *mc)
367{
368	return (mc->mc_space);
369}
370
371uint64_t
372metaslab_class_get_dspace(metaslab_class_t *mc)
373{
374	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
375}
376
377uint64_t
378metaslab_class_get_minblocksize(metaslab_class_t *mc)
379{
380	return (mc->mc_minblocksize);
381}
382
383void
384metaslab_class_histogram_verify(metaslab_class_t *mc)
385{
386	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
387	uint64_t *mc_hist;
388	int i;
389
390	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
391		return;
392
393	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
394	    KM_SLEEP);
395
396	for (int c = 0; c < rvd->vdev_children; c++) {
397		vdev_t *tvd = rvd->vdev_child[c];
398		metaslab_group_t *mg = tvd->vdev_mg;
399
400		/*
401		 * Skip any holes, uninitialized top-levels, or
402		 * vdevs that are not in this metaslab class.
403		 */
404		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
405		    mg->mg_class != mc) {
406			continue;
407		}
408
409		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
410			mc_hist[i] += mg->mg_histogram[i];
411	}
412
413	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
414		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
415
416	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
417}
418
419/*
420 * Calculate the metaslab class's fragmentation metric. The metric
421 * is weighted based on the space contribution of each metaslab group.
422 * The return value will be a number between 0 and 100 (inclusive), or
423 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
424 * zfs_frag_table for more information about the metric.
425 */
426uint64_t
427metaslab_class_fragmentation(metaslab_class_t *mc)
428{
429	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
430	uint64_t fragmentation = 0;
431
432	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
433
434	for (int c = 0; c < rvd->vdev_children; c++) {
435		vdev_t *tvd = rvd->vdev_child[c];
436		metaslab_group_t *mg = tvd->vdev_mg;
437
438		/*
439		 * Skip any holes, uninitialized top-levels, or
440		 * vdevs that are not in this metaslab class.
441		 */
442		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
443		    mg->mg_class != mc) {
444			continue;
445		}
446
447		/*
448		 * If a metaslab group does not contain a fragmentation
449		 * metric then just bail out.
450		 */
451		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
452			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
453			return (ZFS_FRAG_INVALID);
454		}
455
456		/*
457		 * Determine how much this metaslab_group is contributing
458		 * to the overall pool fragmentation metric.
459		 */
460		fragmentation += mg->mg_fragmentation *
461		    metaslab_group_get_space(mg);
462	}
463	fragmentation /= metaslab_class_get_space(mc);
464
465	ASSERT3U(fragmentation, <=, 100);
466	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
467	return (fragmentation);
468}
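
/*
 * Worked example (illustrative numbers only): a class with a 1TB group at
 * 20% fragmentation and a 3TB group at 60% fragmentation reports
 * (1 * 20 + 3 * 60) / 4 = 50% fragmentation for the class as a whole.
 */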
469
470/*
471 * Calculate the amount of expandable space that is available in
472 * this metaslab class. If a device is expanded then its expandable
473 * space will be the amount of allocatable space that is currently not
474 * part of this metaslab class.
475 */
476uint64_t
477metaslab_class_expandable_space(metaslab_class_t *mc)
478{
479	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
480	uint64_t space = 0;
481
482	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
483	for (int c = 0; c < rvd->vdev_children; c++) {
484		vdev_t *tvd = rvd->vdev_child[c];
485		metaslab_group_t *mg = tvd->vdev_mg;
486
487		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
488		    mg->mg_class != mc) {
489			continue;
490		}
491
492		/*
493		 * Calculate if we have enough space to add additional
494		 * metaslabs. We report the expandable space in terms
495		 * of the metaslab size since that's the unit of expansion.
496		 */
497		space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
498		    1ULL << tvd->vdev_ms_shift);
499	}
500	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
501	return (space);
502}
503
504/*
505 * ==========================================================================
506 * Metaslab groups
507 * ==========================================================================
508 */
509static int
510metaslab_compare(const void *x1, const void *x2)
511{
512	const metaslab_t *m1 = x1;
513	const metaslab_t *m2 = x2;
514
515	if (m1->ms_weight < m2->ms_weight)
516		return (1);
517	if (m1->ms_weight > m2->ms_weight)
518		return (-1);
519
520	/*
521	 * If the weights are identical, use the offset to force uniqueness.
522	 */
523	if (m1->ms_start < m2->ms_start)
524		return (-1);
525	if (m1->ms_start > m2->ms_start)
526		return (1);
527
528	ASSERT3P(m1, ==, m2);
529
530	return (0);
531}
532
533/*
534 * Update the allocatable flag and the metaslab group's capacity.
535 * The allocatable flag is set to true if the free capacity is greater
536 * than the zfs_mg_noalloc_threshold. If a metaslab group transitions
537 * from allocatable to non-allocatable or vice versa then the metaslab
538 * group's class is updated to reflect the transition.
539 */
540static void
541metaslab_group_alloc_update(metaslab_group_t *mg)
542{
543	vdev_t *vd = mg->mg_vd;
544	metaslab_class_t *mc = mg->mg_class;
545	vdev_stat_t *vs = &vd->vdev_stat;
546	boolean_t was_allocatable;
547
548	ASSERT(vd == vd->vdev_top);
549
550	mutex_enter(&mg->mg_lock);
551	was_allocatable = mg->mg_allocatable;
552
553	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
554	    (vs->vs_space + 1);
555
556	/*
557	 * A metaslab group is considered allocatable if it has plenty
558	 * of free space or is not heavily fragmented. We only take
559	 * fragmentation into account if the metaslab group has a valid
560	 * fragmentation metric (i.e. a value between 0 and 100).
561	 */
562	mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
563	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
564	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
565
566	/*
567	 * The mc_alloc_groups maintains a count of the number of
568	 * groups in this metaslab class that are still above the
569	 * zfs_mg_noalloc_threshold. This is used by the allocating
570	 * threads to determine if they should avoid allocations to
571	 * a given group. The allocator will avoid allocations to a group
572	 * if that group has reached or is below the zfs_mg_noalloc_threshold
573	 * and there are still other groups that are above the threshold.
574	 * When a group transitions from allocatable to non-allocatable or
575	 * vice versa we update the metaslab class to reflect that change.
576	 * When the mc_alloc_groups value drops to 0 that means that all
577	 * groups have reached the zfs_mg_noalloc_threshold making all groups
578	 * eligible for allocations. This effectively means that all devices
579	 * are balanced again.
580	 */
581	if (was_allocatable && !mg->mg_allocatable)
582		mc->mc_alloc_groups--;
583	else if (!was_allocatable && mg->mg_allocatable)
584		mc->mc_alloc_groups++;
585
586	mutex_exit(&mg->mg_lock);
587}
588
589metaslab_group_t *
590metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
591{
592	metaslab_group_t *mg;
593
594	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
595	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
596	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
597	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
598	mg->mg_vd = vd;
599	mg->mg_class = mc;
600	mg->mg_activation_count = 0;
601
602	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
603	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
604
605	return (mg);
606}
607
608void
609metaslab_group_destroy(metaslab_group_t *mg)
610{
611	ASSERT(mg->mg_prev == NULL);
612	ASSERT(mg->mg_next == NULL);
613	/*
614	 * We may have gone below zero with the activation count
615	 * either because we never activated in the first place or
616	 * because we're done, and possibly removing the vdev.
617	 */
618	ASSERT(mg->mg_activation_count <= 0);
619
620	taskq_destroy(mg->mg_taskq);
621	avl_destroy(&mg->mg_metaslab_tree);
622	mutex_destroy(&mg->mg_lock);
623	kmem_free(mg, sizeof (metaslab_group_t));
624}
625
626void
627metaslab_group_activate(metaslab_group_t *mg)
628{
629	metaslab_class_t *mc = mg->mg_class;
630	metaslab_group_t *mgprev, *mgnext;
631
632	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
633
634	ASSERT(mc->mc_rotor != mg);
635	ASSERT(mg->mg_prev == NULL);
636	ASSERT(mg->mg_next == NULL);
637	ASSERT(mg->mg_activation_count <= 0);
638
639	if (++mg->mg_activation_count <= 0)
640		return;
641
642	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
643	metaslab_group_alloc_update(mg);
644
645	if ((mgprev = mc->mc_rotor) == NULL) {
646		mg->mg_prev = mg;
647		mg->mg_next = mg;
648	} else {
649		mgnext = mgprev->mg_next;
650		mg->mg_prev = mgprev;
651		mg->mg_next = mgnext;
652		mgprev->mg_next = mg;
653		mgnext->mg_prev = mg;
654	}
655	mc->mc_rotor = mg;
656	metaslab_class_minblocksize_update(mc);
657}
658
659void
660metaslab_group_passivate(metaslab_group_t *mg)
661{
662	metaslab_class_t *mc = mg->mg_class;
663	metaslab_group_t *mgprev, *mgnext;
664
665	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
666
667	if (--mg->mg_activation_count != 0) {
668		ASSERT(mc->mc_rotor != mg);
669		ASSERT(mg->mg_prev == NULL);
670		ASSERT(mg->mg_next == NULL);
671		ASSERT(mg->mg_activation_count < 0);
672		return;
673	}
674
675	taskq_wait(mg->mg_taskq);
676	metaslab_group_alloc_update(mg);
677
678	mgprev = mg->mg_prev;
679	mgnext = mg->mg_next;
680
681	if (mg == mgnext) {
682		mc->mc_rotor = NULL;
683	} else {
684		mc->mc_rotor = mgnext;
685		mgprev->mg_next = mgnext;
686		mgnext->mg_prev = mgprev;
687	}
688
689	mg->mg_prev = NULL;
690	mg->mg_next = NULL;
691	metaslab_class_minblocksize_update(mc);
692}
693
694uint64_t
695metaslab_group_get_space(metaslab_group_t *mg)
696{
697	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
698}
699
700void
701metaslab_group_histogram_verify(metaslab_group_t *mg)
702{
703	uint64_t *mg_hist;
704	vdev_t *vd = mg->mg_vd;
705	uint64_t ashift = vd->vdev_ashift;
706	int i;
707
708	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
709		return;
710
711	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
712	    KM_SLEEP);
713
714	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
715	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
716
717	for (int m = 0; m < vd->vdev_ms_count; m++) {
718		metaslab_t *msp = vd->vdev_ms[m];
719
720		if (msp->ms_sm == NULL)
721			continue;
722
723		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
724			mg_hist[i + ashift] +=
725			    msp->ms_sm->sm_phys->smp_histogram[i];
726	}
727
728	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
729		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
730
731	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
732}
733
734static void
735metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
736{
737	metaslab_class_t *mc = mg->mg_class;
738	uint64_t ashift = mg->mg_vd->vdev_ashift;
739
740	ASSERT(MUTEX_HELD(&msp->ms_lock));
741	if (msp->ms_sm == NULL)
742		return;
743
744	mutex_enter(&mg->mg_lock);
745	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
746		mg->mg_histogram[i + ashift] +=
747		    msp->ms_sm->sm_phys->smp_histogram[i];
748		mc->mc_histogram[i + ashift] +=
749		    msp->ms_sm->sm_phys->smp_histogram[i];
750	}
751	mutex_exit(&mg->mg_lock);
752}
753
754void
755metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
756{
757	metaslab_class_t *mc = mg->mg_class;
758	uint64_t ashift = mg->mg_vd->vdev_ashift;
759
760	ASSERT(MUTEX_HELD(&msp->ms_lock));
761	if (msp->ms_sm == NULL)
762		return;
763
764	mutex_enter(&mg->mg_lock);
765	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
766		ASSERT3U(mg->mg_histogram[i + ashift], >=,
767		    msp->ms_sm->sm_phys->smp_histogram[i]);
768		ASSERT3U(mc->mc_histogram[i + ashift], >=,
769		    msp->ms_sm->sm_phys->smp_histogram[i]);
770
771		mg->mg_histogram[i + ashift] -=
772		    msp->ms_sm->sm_phys->smp_histogram[i];
773		mc->mc_histogram[i + ashift] -=
774		    msp->ms_sm->sm_phys->smp_histogram[i];
775	}
776	mutex_exit(&mg->mg_lock);
777}
778
779static void
780metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
781{
782	ASSERT(msp->ms_group == NULL);
783	mutex_enter(&mg->mg_lock);
784	msp->ms_group = mg;
785	msp->ms_weight = 0;
786	avl_add(&mg->mg_metaslab_tree, msp);
787	mutex_exit(&mg->mg_lock);
788
789	mutex_enter(&msp->ms_lock);
790	metaslab_group_histogram_add(mg, msp);
791	mutex_exit(&msp->ms_lock);
792}
793
794static void
795metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
796{
797	mutex_enter(&msp->ms_lock);
798	metaslab_group_histogram_remove(mg, msp);
799	mutex_exit(&msp->ms_lock);
800
801	mutex_enter(&mg->mg_lock);
802	ASSERT(msp->ms_group == mg);
803	avl_remove(&mg->mg_metaslab_tree, msp);
804	msp->ms_group = NULL;
805	mutex_exit(&mg->mg_lock);
806}
807
808static void
809metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
810{
811	/*
812	 * Although in principle the weight can be any value, in
813	 * practice we do not use values in the range [1, 511].
814	 */
815	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
816	ASSERT(MUTEX_HELD(&msp->ms_lock));
817
818	mutex_enter(&mg->mg_lock);
819	ASSERT(msp->ms_group == mg);
820	avl_remove(&mg->mg_metaslab_tree, msp);
821	msp->ms_weight = weight;
822	avl_add(&mg->mg_metaslab_tree, msp);
823	mutex_exit(&mg->mg_lock);
824}
825
826/*
827 * Calculate the fragmentation for a given metaslab group. We can use
828 * a simple average here since all metaslabs within the group must have
829 * the same size. The return value will be a value between 0 and 100
830 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
831 * group have a fragmentation metric.
832 */
833uint64_t
834metaslab_group_fragmentation(metaslab_group_t *mg)
835{
836	vdev_t *vd = mg->mg_vd;
837	uint64_t fragmentation = 0;
838	uint64_t valid_ms = 0;
839
840	for (int m = 0; m < vd->vdev_ms_count; m++) {
841		metaslab_t *msp = vd->vdev_ms[m];
842
843		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
844			continue;
845
846		valid_ms++;
847		fragmentation += msp->ms_fragmentation;
848	}
849
850	if (valid_ms <= vd->vdev_ms_count / 2)
851		return (ZFS_FRAG_INVALID);
852
853	fragmentation /= valid_ms;
854	ASSERT3U(fragmentation, <=, 100);
855	return (fragmentation);
856}
857
858/*
859 * Determine if a given metaslab group should skip allocations. A metaslab
860 * group should avoid allocations if its free capacity is less than the
861 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
862 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
863 * that can still handle allocations.
864 */
865static boolean_t
866metaslab_group_allocatable(metaslab_group_t *mg)
867{
868	vdev_t *vd = mg->mg_vd;
869	spa_t *spa = vd->vdev_spa;
870	metaslab_class_t *mc = mg->mg_class;
871
872	/*
873	 * We use two key metrics to determine if a metaslab group is
874	 * considered allocatable -- free space and fragmentation. If
875	 * the free space is greater than the free space threshold and
876	 * the fragmentation is less than the fragmentation threshold then
877 * consider the group allocatable. There are two cases when we will
878	 * not consider these key metrics. The first is if the group is
879	 * associated with a slog device and the second is if all groups
880 * in this metaslab class have already been considered ineligible
881	 * for allocations.
882	 */
883	return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
884	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
885	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) ||
886	    mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
887}
888
889/*
890 * ==========================================================================
891 * Range tree callbacks
892 * ==========================================================================
893 */
894
895/*
896 * Comparison function for the private size-ordered tree. Tree is sorted
897 * by size, larger sizes at the end of the tree.
898 */
899static int
900metaslab_rangesize_compare(const void *x1, const void *x2)
901{
902	const range_seg_t *r1 = x1;
903	const range_seg_t *r2 = x2;
904	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
905	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
906
907	if (rs_size1 < rs_size2)
908		return (-1);
909	if (rs_size1 > rs_size2)
910		return (1);
911
912	if (r1->rs_start < r2->rs_start)
913		return (-1);
914
915	if (r1->rs_start > r2->rs_start)
916		return (1);
917
918	return (0);
919}
920
921/*
922 * Create any block allocator specific components. The current allocators
923 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
924 */
925static void
926metaslab_rt_create(range_tree_t *rt, void *arg)
927{
928	metaslab_t *msp = arg;
929
930	ASSERT3P(rt->rt_arg, ==, msp);
931	ASSERT(msp->ms_tree == NULL);
932
933	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
934	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
935}
936
937/*
938 * Destroy the block allocator specific components.
939 */
940static void
941metaslab_rt_destroy(range_tree_t *rt, void *arg)
942{
943	metaslab_t *msp = arg;
944
945	ASSERT3P(rt->rt_arg, ==, msp);
946	ASSERT3P(msp->ms_tree, ==, rt);
947	ASSERT0(avl_numnodes(&msp->ms_size_tree));
948
949	avl_destroy(&msp->ms_size_tree);
950}
951
952static void
953metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
954{
955	metaslab_t *msp = arg;
956
957	ASSERT3P(rt->rt_arg, ==, msp);
958	ASSERT3P(msp->ms_tree, ==, rt);
959	VERIFY(!msp->ms_condensing);
960	avl_add(&msp->ms_size_tree, rs);
961}
962
963static void
964metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
965{
966	metaslab_t *msp = arg;
967
968	ASSERT3P(rt->rt_arg, ==, msp);
969	ASSERT3P(msp->ms_tree, ==, rt);
970	VERIFY(!msp->ms_condensing);
971	avl_remove(&msp->ms_size_tree, rs);
972}
973
974static void
975metaslab_rt_vacate(range_tree_t *rt, void *arg)
976{
977	metaslab_t *msp = arg;
978
979	ASSERT3P(rt->rt_arg, ==, msp);
980	ASSERT3P(msp->ms_tree, ==, rt);
981
982	/*
983	 * Normally one would walk the tree freeing nodes along the way.
984	 * Since the nodes are shared with the range trees we can avoid
985	 * walking all nodes and just reinitialize the avl tree. The nodes
986	 * will be freed by the range tree, so we don't want to free them here.
987	 */
988	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
989	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
990}
991
992static range_tree_ops_t metaslab_rt_ops = {
993	metaslab_rt_create,
994	metaslab_rt_destroy,
995	metaslab_rt_add,
996	metaslab_rt_remove,
997	metaslab_rt_vacate
998};
999
1000/*
1001 * ==========================================================================
1002 * Metaslab block operations
1003 * ==========================================================================
1004 */
1005
1006/*
1007 * Return the maximum contiguous segment within the metaslab.
1008 */
1009uint64_t
1010metaslab_block_maxsize(metaslab_t *msp)
1011{
1012	avl_tree_t *t = &msp->ms_size_tree;
1013	range_seg_t *rs;
1014
1015	if (t == NULL || (rs = avl_last(t)) == NULL)
1016		return (0ULL);
1017
1018	return (rs->rs_end - rs->rs_start);
1019}
1020
1021uint64_t
1022metaslab_block_alloc(metaslab_t *msp, uint64_t size)
1023{
1024	uint64_t start;
1025	range_tree_t *rt = msp->ms_tree;
1026
1027	VERIFY(!msp->ms_condensing);
1028
1029	start = msp->ms_ops->msop_alloc(msp, size);
1030	if (start != -1ULL) {
1031		vdev_t *vd = msp->ms_group->mg_vd;
1032
1033		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
1034		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
1035		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
1036		range_tree_remove(rt, start, size);
1037	}
1038	return (start);
1039}
1040
1041/*
1042 * ==========================================================================
1043 * Common allocator routines
1044 * ==========================================================================
1045 */
1046
1047/*
1048 * This is a helper function that can be used by the allocator to find
1049 * a suitable block to allocate. This will search the specified AVL
1050 * tree looking for a block that matches the specified criteria.
1051 */
1052static uint64_t
1053metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1054    uint64_t align)
1055{
1056	range_seg_t *rs, rsearch;
1057	avl_index_t where;
1058
1059	rsearch.rs_start = *cursor;
1060	rsearch.rs_end = *cursor + size;
1061
1062	rs = avl_find(t, &rsearch, &where);
1063	if (rs == NULL)
1064		rs = avl_nearest(t, where, AVL_AFTER);
1065
1066	while (rs != NULL) {
1067		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1068
1069		if (offset + size <= rs->rs_end) {
1070			*cursor = offset + size;
1071			return (offset);
1072		}
1073		rs = AVL_NEXT(t, rs);
1074	}
1075
1076	/*
1077	 * If we know we've searched the whole map (*cursor == 0), give up.
1078	 * Otherwise, reset the cursor to the beginning and try again.
1079	 */
1080	if (*cursor == 0)
1081		return (-1ULL);
1082
1083	*cursor = 0;
1084	return (metaslab_block_picker(t, cursor, size, align));
1085}
1086
1087/*
1088 * ==========================================================================
1089 * The first-fit block allocator
1090 * ==========================================================================
1091 */
1092static uint64_t
1093metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1094{
1095	/*
1096	 * Find the largest power of 2 block size that evenly divides the
1097	 * requested size. This is used to try to allocate blocks with similar
1098	 * alignment from the same area of the metaslab (i.e. same cursor
1099	 * bucket) but it does not guarantee that other allocation sizes
1100	 * will not exist in the same region.
1101	 */
1102	uint64_t align = size & -size;
1103	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1104	avl_tree_t *t = &msp->ms_tree->rt_root;
1105
1106	return (metaslab_block_picker(t, cursor, size, align));
1107}
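
/*
 * Illustrative example of the cursor buckets above: a 24KB (0x6000) request
 * yields align = 0x6000 & -0x6000 = 0x2000 (8KB), so it is served from the
 * 8KB-alignment cursor (ms_lbas[highbit64(0x2000) - 1]), while a 32KB
 * request uses the 32KB-alignment cursor.
 */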
1108
1109static metaslab_ops_t metaslab_ff_ops = {
1110	metaslab_ff_alloc
1111};
1112
1113/*
1114 * ==========================================================================
1115 * Dynamic block allocator -
1116 * Uses the first-fit allocation scheme until space gets low and then
1117 * adjusts to a best-fit allocation method. Uses metaslab_df_alloc_threshold
1118 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
1119 * ==========================================================================
1120 */
1121static uint64_t
1122metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1123{
1124	/*
1125	 * Find the largest power of 2 block size that evenly divides the
1126	 * requested size. This is used to try to allocate blocks with similar
1127	 * alignment from the same area of the metaslab (i.e. same cursor
1128	 * bucket) but it does not guarantee that other allocation sizes
1129	 * will not exist in the same region.
1130	 */
1131	uint64_t align = size & -size;
1132	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1133	range_tree_t *rt = msp->ms_tree;
1134	avl_tree_t *t = &rt->rt_root;
1135	uint64_t max_size = metaslab_block_maxsize(msp);
1136	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1137
1138	ASSERT(MUTEX_HELD(&msp->ms_lock));
1139	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1140
1141	if (max_size < size)
1142		return (-1ULL);
1143
1144	/*
1145	 * If we're running low on space switch to using the size
1146	 * sorted AVL tree (best-fit).
1147	 */
1148	if (max_size < metaslab_df_alloc_threshold ||
1149	    free_pct < metaslab_df_free_pct) {
1150		t = &msp->ms_size_tree;
1151		*cursor = 0;
1152	}
1153
1154	return (metaslab_block_picker(t, cursor, size, 1ULL));
1155}
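
/*
 * Illustrative example with the default tunables: once the largest free
 * segment drops below SPA_OLD_MAXBLOCKSIZE (128KB) or less than 4% of the
 * metaslab is free, the search above switches from the offset-ordered tree
 * (first-fit) to the size-ordered tree (best-fit) and resets the cursor to 0.
 */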
1156
1157static metaslab_ops_t metaslab_df_ops = {
1158	metaslab_df_alloc
1159};
1160
1161/*
1162 * ==========================================================================
1163 * Cursor fit block allocator -
1164 * Select the largest region in the metaslab, set the cursor to the beginning
1165 * of the range and the cursor_end to the end of the range. As allocations
1166 * are made advance the cursor. Continue allocating from the cursor until
1167 * the range is exhausted and then find a new range.
1168 * ==========================================================================
1169 */
1170static uint64_t
1171metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1172{
1173	range_tree_t *rt = msp->ms_tree;
1174	avl_tree_t *t = &msp->ms_size_tree;
1175	uint64_t *cursor = &msp->ms_lbas[0];
1176	uint64_t *cursor_end = &msp->ms_lbas[1];
1177	uint64_t offset = 0;
1178
1179	ASSERT(MUTEX_HELD(&msp->ms_lock));
1180	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1181
1182	ASSERT3U(*cursor_end, >=, *cursor);
1183
1184	if ((*cursor + size) > *cursor_end) {
1185		range_seg_t *rs;
1186
1187		rs = avl_last(&msp->ms_size_tree);
1188		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1189			return (-1ULL);
1190
1191		*cursor = rs->rs_start;
1192		*cursor_end = rs->rs_end;
1193	}
1194
1195	offset = *cursor;
1196	*cursor += size;
1197
1198	return (offset);
1199}
1200
1201static metaslab_ops_t metaslab_cf_ops = {
1202	metaslab_cf_alloc
1203};
1204
1205/*
1206 * ==========================================================================
1207 * New dynamic fit allocator -
1208 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1209 * contiguous blocks. If no region is found then just use the largest segment
1210 * that remains.
1211 * ==========================================================================
1212 */
1213
1214/*
1215 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1216 * to request from the allocator.
1217 */
1218uint64_t metaslab_ndf_clump_shift = 4;
1219
1220static uint64_t
1221metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1222{
1223	avl_tree_t *t = &msp->ms_tree->rt_root;
1224	avl_index_t where;
1225	range_seg_t *rs, rsearch;
1226	uint64_t hbit = highbit64(size);
1227	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1228	uint64_t max_size = metaslab_block_maxsize(msp);
1229
1230	ASSERT(MUTEX_HELD(&msp->ms_lock));
1231	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1232
1233	if (max_size < size)
1234		return (-1ULL);
1235
1236	rsearch.rs_start = *cursor;
1237	rsearch.rs_end = *cursor + size;
1238
1239	rs = avl_find(t, &rsearch, &where);
1240	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
1241		t = &msp->ms_size_tree;
1242
1243		rsearch.rs_start = 0;
1244		rsearch.rs_end = MIN(max_size,
1245		    1ULL << (hbit + metaslab_ndf_clump_shift));
1246		rs = avl_find(t, &rsearch, &where);
1247		if (rs == NULL)
1248			rs = avl_nearest(t, where, AVL_AFTER);
1249		ASSERT(rs != NULL);
1250	}
1251
1252	if ((rs->rs_end - rs->rs_start) >= size) {
1253		*cursor = rs->rs_start + size;
1254		return (rs->rs_start);
1255	}
1256	return (-1ULL);
1257}
1258
1259static metaslab_ops_t metaslab_ndf_ops = {
1260	metaslab_ndf_alloc
1261};
1262
1263metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1264
1265/*
1266 * ==========================================================================
1267 * Metaslabs
1268 * ==========================================================================
1269 */
1270
1271/*
1272 * Wait for any in-progress metaslab loads to complete.
1273 */
1274void
1275metaslab_load_wait(metaslab_t *msp)
1276{
1277	ASSERT(MUTEX_HELD(&msp->ms_lock));
1278
1279	while (msp->ms_loading) {
1280		ASSERT(!msp->ms_loaded);
1281		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1282	}
1283}
1284
1285int
1286metaslab_load(metaslab_t *msp)
1287{
1288	int error = 0;
1289
1290	ASSERT(MUTEX_HELD(&msp->ms_lock));
1291	ASSERT(!msp->ms_loaded);
1292	ASSERT(!msp->ms_loading);
1293
1294	msp->ms_loading = B_TRUE;
1295
1296	/*
1297	 * If the space map has not been allocated yet, then treat
1298	 * all the space in the metaslab as free and add it to the
1299	 * ms_tree.
1300	 */
1301	if (msp->ms_sm != NULL)
1302		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
1303	else
1304		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
1305
1306	msp->ms_loaded = (error == 0);
1307	msp->ms_loading = B_FALSE;
1308
1309	if (msp->ms_loaded) {
1310		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1311			range_tree_walk(msp->ms_defertree[t],
1312			    range_tree_remove, msp->ms_tree);
1313		}
1314	}
1315	cv_broadcast(&msp->ms_load_cv);
1316	return (error);
1317}
1318
1319void
1320metaslab_unload(metaslab_t *msp)
1321{
1322	ASSERT(MUTEX_HELD(&msp->ms_lock));
1323	range_tree_vacate(msp->ms_tree, NULL, NULL);
1324	msp->ms_loaded = B_FALSE;
1325	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1326}
1327
1328int
1329metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1330    metaslab_t **msp)
1331{
1332	vdev_t *vd = mg->mg_vd;
1333	objset_t *mos = vd->vdev_spa->spa_meta_objset;
1334	metaslab_t *ms;
1335	int error;
1336
1337	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1338	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1339	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1340	ms->ms_id = id;
1341	ms->ms_start = id << vd->vdev_ms_shift;
1342	ms->ms_size = 1ULL << vd->vdev_ms_shift;
1343
1344	/*
1345	 * We only open space map objects that already exist. All others
1346	 * will be opened when we finally allocate an object for them.
1347	 */
1348	if (object != 0) {
1349		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1350		    ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
1351
1352		if (error != 0) {
1353			kmem_free(ms, sizeof (metaslab_t));
1354			return (error);
1355		}
1356
1357		ASSERT(ms->ms_sm != NULL);
1358	}
1359
1360	/*
1361	 * We create the main range tree here, but we don't create the
1362	 * alloctree and freetree until metaslab_sync_done().  This serves
1363	 * two purposes: it allows metaslab_sync_done() to detect the
1364	 * addition of new space; and for debugging, it ensures that we'd
1365	 * data fault on any attempt to use this metaslab before it's ready.
1366	 */
1367	ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
1368	metaslab_group_add(mg, ms);
1369
1370	ms->ms_fragmentation = metaslab_fragmentation(ms);
1371	ms->ms_ops = mg->mg_class->mc_ops;
1372
1373	/*
1374	 * If we're opening an existing pool (txg == 0) or creating
1375	 * a new one (txg == TXG_INITIAL), all space is available now.
1376	 * If we're adding space to an existing pool, the new space
1377	 * does not become available until after this txg has synced.
1378	 */
1379	if (txg <= TXG_INITIAL)
1380		metaslab_sync_done(ms, 0);
1381
1382	/*
1383	 * If metaslab_debug_load is set and we're initializing a metaslab
1384 * that has an allocated space_map object then load its space
1385 * map so that we can verify frees.
1386	 */
1387	if (metaslab_debug_load && ms->ms_sm != NULL) {
1388		mutex_enter(&ms->ms_lock);
1389		VERIFY0(metaslab_load(ms));
1390		mutex_exit(&ms->ms_lock);
1391	}
1392
1393	if (txg != 0) {
1394		vdev_dirty(vd, 0, NULL, txg);
1395		vdev_dirty(vd, VDD_METASLAB, ms, txg);
1396	}
1397
1398	*msp = ms;
1399
1400	return (0);
1401}
1402
1403void
1404metaslab_fini(metaslab_t *msp)
1405{
1406	metaslab_group_t *mg = msp->ms_group;
1407
1408	metaslab_group_remove(mg, msp);
1409
1410	mutex_enter(&msp->ms_lock);
1411
1412	VERIFY(msp->ms_group == NULL);
1413	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1414	    0, -msp->ms_size);
1415	space_map_close(msp->ms_sm);
1416
1417	metaslab_unload(msp);
1418	range_tree_destroy(msp->ms_tree);
1419
1420	for (int t = 0; t < TXG_SIZE; t++) {
1421		range_tree_destroy(msp->ms_alloctree[t]);
1422		range_tree_destroy(msp->ms_freetree[t]);
1423	}
1424
1425	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1426		range_tree_destroy(msp->ms_defertree[t]);
1427	}
1428
1429	ASSERT0(msp->ms_deferspace);
1430
1431	mutex_exit(&msp->ms_lock);
1432	cv_destroy(&msp->ms_load_cv);
1433	mutex_destroy(&msp->ms_lock);
1434
1435	kmem_free(msp, sizeof (metaslab_t));
1436}
1437
1438#define	FRAGMENTATION_TABLE_SIZE	17
1439
1440/*
1441 * This table defines a segment size based fragmentation metric that will
1442 * allow each metaslab to derive its own fragmentation value. This is done
1443 * by calculating the space in each bucket of the spacemap histogram and
1444 * multiplying that by the fragmentation metric in this table. Doing
1445 * this for all buckets and dividing it by the total amount of free
1446 * space in this metaslab (i.e. the total free space in all buckets) gives
1447 * us the fragmentation metric. This means that a high fragmentation metric
1448 * equates to most of the free space being comprised of small segments.
1449 * Conversely, if the metric is low, then most of the free space is in
1450 * large segments. A 10% change in fragmentation equates to approximately
1451 * double the number of segments.
1452 *
1453 * This table defines 0% fragmented space using 16MB segments. Testing has
1454 * shown that segments that are greater than or equal to 16MB do not suffer
1455 * from drastic performance problems. Using this value, we derive the rest
1456 * of the table. Since the fragmentation value is never stored on disk, it
1457 * is possible to change these calculations in the future.
1458 */
1459int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1460	100,	/* 512B	*/
1461	100,	/* 1K	*/
1462	98,	/* 2K	*/
1463	95,	/* 4K	*/
1464	90,	/* 8K	*/
1465	80,	/* 16K	*/
1466	70,	/* 32K	*/
1467	60,	/* 64K	*/
1468	50,	/* 128K	*/
1469	40,	/* 256K	*/
1470	30,	/* 512K	*/
1471	20,	/* 1M	*/
1472	15,	/* 2M	*/
1473	10,	/* 4M	*/
1474	5,	/* 8M	*/
1475	0	/* 16M	*/
1476};
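
/*
 * Worked example (illustrative numbers only): a metaslab whose free space is
 * 100MB of 512KB segments plus 100MB of 16MB segments scores
 * (100 * 30 + 100 * 0) / 200 = 15% fragmented, reflecting that half of the
 * free space already sits in large, easily allocated segments.
 */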
1477
1478/*
1479 * Calculate the metaslab's fragmentation metric. A return value
1480 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1481 * not support this metric. Otherwise, the return value should be in the
1482 * range [0, 100].
1483 */
1484static uint64_t
1485metaslab_fragmentation(metaslab_t *msp)
1486{
1487	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1488	uint64_t fragmentation = 0;
1489	uint64_t total = 0;
1490	boolean_t feature_enabled = spa_feature_is_enabled(spa,
1491	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
1492
1493	if (!feature_enabled)
1494		return (ZFS_FRAG_INVALID);
1495
1496	/*
1497	 * A null space map means that the entire metaslab is free
1498	 * and thus is not fragmented.
1499	 */
1500	if (msp->ms_sm == NULL)
1501		return (0);
1502
1503	/*
1504	 * If this metaslab's space_map has not been upgraded, flag it
1505	 * so that we upgrade next time we encounter it.
1506	 */
1507	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1508		uint64_t txg = spa_syncing_txg(spa);
1509		vdev_t *vd = msp->ms_group->mg_vd;
1510
1511		if (spa_writeable(spa)) {
1512			msp->ms_condense_wanted = B_TRUE;
1513			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1514			spa_dbgmsg(spa, "txg %llu, requesting force condense: "
1515			    "msp %p, vd %p", txg, msp, vd);
1516		}
1517		return (ZFS_FRAG_INVALID);
1518	}
1519
1520	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1521		uint64_t space = 0;
1522		uint8_t shift = msp->ms_sm->sm_shift;
1523		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1524		    FRAGMENTATION_TABLE_SIZE - 1);
1525
1526		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1527			continue;
1528
1529		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1530		total += space;
1531
1532		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1533		fragmentation += space * zfs_frag_table[idx];
1534	}
1535
1536	if (total > 0)
1537		fragmentation /= total;
1538	ASSERT3U(fragmentation, <=, 100);
1539	return (fragmentation);
1540}
1541
1542/*
1543 * Compute a weight -- a selection preference value -- for the given metaslab.
1544 * This is based on the amount of free space, the level of fragmentation,
1545 * the LBA range, and whether the metaslab is loaded.
1546 */
1547static uint64_t
1548metaslab_weight(metaslab_t *msp)
1549{
1550	metaslab_group_t *mg = msp->ms_group;
1551	vdev_t *vd = mg->mg_vd;
1552	uint64_t weight, space;
1553
1554	ASSERT(MUTEX_HELD(&msp->ms_lock));
1555
1556	/*
1557	 * This vdev is in the process of being removed so there is nothing
1558	 * for us to do here.
1559	 */
1560	if (vd->vdev_removing) {
1561		ASSERT0(space_map_allocated(msp->ms_sm));
1562		ASSERT0(vd->vdev_ms_shift);
1563		return (0);
1564	}
1565
1566	/*
1567	 * The baseline weight is the metaslab's free space.
1568	 */
1569	space = msp->ms_size - space_map_allocated(msp->ms_sm);
1570
1571	msp->ms_fragmentation = metaslab_fragmentation(msp);
1572	if (metaslab_fragmentation_factor_enabled &&
1573	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1574		/*
1575		 * Use the fragmentation information to inversely scale
1576		 * down the baseline weight. We need to ensure that we
1577		 * don't exclude this metaslab completely when it's 100%
1578		 * fragmented. To avoid this we reduce the fragmented value
1579		 * by 1.
1580		 */
1581		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1582
1583		/*
1584		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1585		 * this metaslab again. The fragmentation metric may have
1586		 * decreased the space to something smaller than
1587		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1588		 * so that we can consume any remaining space.
1589		 */
1590		if (space > 0 && space < SPA_MINBLOCKSIZE)
1591			space = SPA_MINBLOCKSIZE;
1592	}
1593	weight = space;
1594
1595	/*
1596	 * Modern disks have uniform bit density and constant angular velocity.
1597	 * Therefore, the outer recording zones are faster (higher bandwidth)
1598	 * than the inner zones by the ratio of outer to inner track diameter,
1599	 * which is typically around 2:1.  We account for this by assigning
1600	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1601	 * In effect, this means that we'll select the metaslab with the most
1602	 * free bandwidth rather than simply the one with the most free space.
1603	 */
1604	if (metaslab_lba_weighting_enabled) {
1605		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1606		ASSERT(weight >= space && weight <= 2 * space);
1607	}
1608
1609	/*
1610	 * If this metaslab is one we're actively using, adjust its
1611	 * weight to make it preferable to any inactive metaslab so
1612	 * we'll polish it off. If the fragmentation on this metaslab
1613 * has exceeded our threshold, then don't mark it active.
1614	 */
1615	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1616	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1617		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1618	}
1619
1620	return (weight);
1621}
1622
1623static int
1624metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1625{
1626	ASSERT(MUTEX_HELD(&msp->ms_lock));
1627
1628	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1629		metaslab_load_wait(msp);
1630		if (!msp->ms_loaded) {
1631			int error = metaslab_load(msp);
1632			if (error) {
1633				metaslab_group_sort(msp->ms_group, msp, 0);
1634				return (error);
1635			}
1636		}
1637
1638		metaslab_group_sort(msp->ms_group, msp,
1639		    msp->ms_weight | activation_weight);
1640	}
1641	ASSERT(msp->ms_loaded);
1642	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1643
1644	return (0);
1645}
1646
1647static void
1648metaslab_passivate(metaslab_t *msp, uint64_t size)
1649{
1650	/*
1651	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1652	 * this metaslab again.  In that case, it had better be empty,
1653	 * or we would be leaving space on the table.
1654	 */
1655	ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1656	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1657	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1658}
1659
1660static void
1661metaslab_preload(void *arg)
1662{
1663	metaslab_t *msp = arg;
1664	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1665
1666	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
1667
1668	mutex_enter(&msp->ms_lock);
1669	metaslab_load_wait(msp);
1670	if (!msp->ms_loaded)
1671		(void) metaslab_load(msp);
1672
1673	/*
1674	 * Set the ms_access_txg value so that we don't unload it right away.
1675	 */
1676	msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
1677	mutex_exit(&msp->ms_lock);
1678}
1679
1680static void
1681metaslab_group_preload(metaslab_group_t *mg)
1682{
1683	spa_t *spa = mg->mg_vd->vdev_spa;
1684	metaslab_t *msp;
1685	avl_tree_t *t = &mg->mg_metaslab_tree;
1686	int m = 0;
1687
1688	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1689		taskq_wait(mg->mg_taskq);
1690		return;
1691	}
1692
1693	mutex_enter(&mg->mg_lock);
1694	/*
1695	 * Load the next potential metaslabs
1696	 */
1697	msp = avl_first(t);
1698	while (msp != NULL) {
1699		metaslab_t *msp_next = AVL_NEXT(t, msp);
1700
1701		/*
1702		 * We preload only the maximum number of metaslabs specified
1703		 * by metaslab_preload_limit. If a metaslab is being forced
1704		 * to condense then we preload it too. This will ensure
1705		 * that force condensing happens in the next txg.
1706		 */
1707		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
1708			msp = msp_next;
1709			continue;
1710		}
1711
1712		/*
1713		 * We must drop the metaslab group lock here to preserve
1714		 * lock ordering with the ms_lock (when grabbing both
1715		 * the mg_lock and the ms_lock, the ms_lock must be taken
1716		 * first).  As a result, it is possible that the ordering
1717		 * of the metaslabs within the avl tree may change before
1718		 * we reacquire the lock. The metaslab cannot be removed from
1719		 * the tree while we're in syncing context so it is safe to
1720		 * drop the mg_lock here. If the metaslabs are reordered
1721		 * nothing will break -- we just may end up loading a
1722		 * less than optimal one.
1723		 */
1724		mutex_exit(&mg->mg_lock);
1725		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1726		    msp, TQ_SLEEP) != 0);
1727		mutex_enter(&mg->mg_lock);
1728		msp = msp_next;
1729	}
1730	mutex_exit(&mg->mg_lock);
1731}
1732
1733/*
1734 * Determine if the space map's on-disk footprint is past our tolerance
1735 * for inefficiency. We would like to use the following criteria to make
1736 * our decision:
1737 *
1738 * 1. The size of the space map object should not dramatically increase as a
1739 * result of writing out the free space range tree.
1740 *
1741 * 2. The on-disk space map representation should be at least
1742 * zfs_condense_pct/100 times the size of the free range tree's minimal
1743 * form (e.g. zfs_condense_pct = 110 and minimal = 1MB gives 1.1MB).
1744 *
1745 * 3. The on-disk size of the space map should actually decrease.
1746 *
1747 * Checking the first condition is tricky since we don't want to walk
1748 * the entire AVL tree calculating the estimated on-disk size. Instead we
1749 * use the size-ordered range tree in the metaslab and calculate the
1750 * size required to write out the largest segment in our free tree. If the
1751 * size required to represent that segment on disk is larger than the space
1752 * map object then we avoid condensing this map.
1753 *
1754 * To determine the second criterion we use a best-case estimate and assume
1755 * each segment can be represented on-disk as a single 64-bit entry. We refer
1756 * to this best-case estimate as the space map's minimal form.
1757 *
1758 * Unfortunately, we cannot compute the on-disk size of the space map in this
1759 * context because we cannot accurately compute the effects of compression, etc.
1760 * Instead, we apply the heuristic described in the block comment for
1761 * zfs_metaslab_condense_block_threshold - we only condense if the space used
1762 * is greater than a threshold number of blocks.
1763 */
1764static boolean_t
1765metaslab_should_condense(metaslab_t *msp)
1766{
1767	space_map_t *sm = msp->ms_sm;
1768	range_seg_t *rs;
1769	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
1770	dmu_object_info_t doi;
1771	uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
1772
1773	ASSERT(MUTEX_HELD(&msp->ms_lock));
1774	ASSERT(msp->ms_loaded);
1775
1776	/*
1777	 * Use the ms_size_tree range tree, which is ordered by size, to
1778	 * obtain the largest segment in the free tree. We always condense
1779	 * metaslabs that are empty and metaslabs for which a condense
1780	 * request has been made.
1781	 */
1782	rs = avl_last(&msp->ms_size_tree);
1783	if (rs == NULL || msp->ms_condense_wanted)
1784		return (B_TRUE);
1785
1786	/*
1787	 * Calculate the number of 64-bit entries this segment would
1788	 * require when written to disk. If this single segment would be
1789	 * larger on-disk than the entire current on-disk structure, then
1790	 * clearly condensing will increase the on-disk structure size.
1791	 */
1792	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1793	entries = size / (MIN(size, SM_RUN_MAX));
1794	segsz = entries * sizeof (uint64_t);
1795
1796	optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
1797	object_size = space_map_length(msp->ms_sm);
1798
1799	dmu_object_info_from_db(sm->sm_dbuf, &doi);
1800	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
1801
1802	return (segsz <= object_size &&
1803	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
1804	    object_size > zfs_metaslab_condense_block_threshold * record_size);
1805}
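/*
 * Illustrative sketch (not part of the original source): once the sizes
 * above have been gathered, the condense decision is pure arithmetic.  For
 * example, with zfs_condense_pct = 110, a free tree holding 1,000 segments
 * has a minimal form of 1,000 * 8 = 8,000 bytes, so the space map object
 * must be at least 8,800 bytes (and larger than both the largest segment's
 * on-disk size and the block threshold) before it is condensed.  The
 * hypothetical, compiled-out helper below restates that predicate.
 */
#if 0
static boolean_t
metaslab_condense_predicate(uint64_t segsz, uint64_t object_size,
    uint64_t optimal_size, uint64_t record_size, uint64_t condense_pct,
    uint64_t block_threshold)
{
	return (segsz <= object_size &&
	    object_size >= (optimal_size * condense_pct / 100) &&
	    object_size > block_threshold * record_size);
}
#endif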
1806
1807/*
1808 * Condense the on-disk space map representation to its minimized form.
1809 * The minimized form consists of a small number of allocations followed by
1810 * the entries of the free range tree.
1811 */
1812static void
1813metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1814{
1815	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1816	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1817	range_tree_t *condense_tree;
1818	space_map_t *sm = msp->ms_sm;
1819
1820	ASSERT(MUTEX_HELD(&msp->ms_lock));
1821	ASSERT3U(spa_sync_pass(spa), ==, 1);
1822	ASSERT(msp->ms_loaded);
1823
1824
1826	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
1827	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
1828	    msp->ms_group->mg_vd->vdev_spa->spa_name,
1829	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
1830	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
1831
1832	msp->ms_condense_wanted = B_FALSE;
1833
1834	/*
1835	 * Create a range tree that is 100% allocated. We remove segments
1836	 * that have been freed in this txg, any deferred frees that exist,
1837	 * and any allocations in the future. Removing segments should be
1838	 * a relatively inexpensive operation since we expect these trees to
1839	 * have a small number of nodes.
1840	 */
1841	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1842	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1843
1844	/*
1845	 * Remove what's been freed in this txg from the condense_tree.
1846	 * Since we're in sync_pass 1, we know that all the frees from
1847	 * this txg are in the freetree.
1848	 */
1849	range_tree_walk(freetree, range_tree_remove, condense_tree);
1850
1851	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1852		range_tree_walk(msp->ms_defertree[t],
1853		    range_tree_remove, condense_tree);
1854	}
1855
1856	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1857		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1858		    range_tree_remove, condense_tree);
1859	}
1860
1861	/*
1862	 * We're about to drop the metaslab's lock, thus allowing
1863	 * other consumers to change its content. Set the
1864	 * metaslab's ms_condensing flag to ensure that
1865	 * allocations on this metaslab do not occur while we're
1866	 * in the middle of committing it to disk. This is only critical
1867	 * for the ms_tree as all other range trees use per txg
1868	 * views of their content.
1869	 */
1870	msp->ms_condensing = B_TRUE;
1871
1872	mutex_exit(&msp->ms_lock);
1873	space_map_truncate(sm, tx);
1874	mutex_enter(&msp->ms_lock);
1875
1876	/*
1877	 * While we would ideally like to create a space_map representation
1878	 * that consists only of allocation records, doing so can be
1879	 * prohibitively expensive because the in-core free tree can be
1880	 * large, and therefore computationally expensive to subtract
1881	 * from the condense_tree. Instead we sync out two trees, a cheap
1882	 * allocation only tree followed by the in-core free tree. While not
1883	 * optimal, this is typically close to optimal, and much cheaper to
1884	 * compute.
1885	 */
1886	space_map_write(sm, condense_tree, SM_ALLOC, tx);
1887	range_tree_vacate(condense_tree, NULL, NULL);
1888	range_tree_destroy(condense_tree);
1889
1890	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1891	msp->ms_condensing = B_FALSE;
1892}
1893
1894/*
1895 * Write a metaslab to disk in the context of the specified transaction group.
1896 */
1897void
1898metaslab_sync(metaslab_t *msp, uint64_t txg)
1899{
1900	metaslab_group_t *mg = msp->ms_group;
1901	vdev_t *vd = mg->mg_vd;
1902	spa_t *spa = vd->vdev_spa;
1903	objset_t *mos = spa_meta_objset(spa);
1904	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
1905	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
1906	range_tree_t **freed_tree =
1907	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1908	dmu_tx_t *tx;
1909	uint64_t object = space_map_object(msp->ms_sm);
1910
1911	ASSERT(!vd->vdev_ishole);
1912
1913	/*
1914	 * This metaslab has just been added so there's no work to do now.
1915	 */
1916	if (*freetree == NULL) {
1917		ASSERT3P(alloctree, ==, NULL);
1918		return;
1919	}
1920
1921	ASSERT3P(alloctree, !=, NULL);
1922	ASSERT3P(*freetree, !=, NULL);
1923	ASSERT3P(*freed_tree, !=, NULL);
1924
1925	/*
1926	 * Normally, we don't want to process a metaslab if there
1927	 * are no allocations or frees to perform. However, if the metaslab
1928	 * is being forced to condense we need to let it through.
1929	 */
1930	if (range_tree_space(alloctree) == 0 &&
1931	    range_tree_space(*freetree) == 0 &&
1932	    !msp->ms_condense_wanted)
1933		return;
1934
1935	/*
1936	 * The only state that can actually be changing concurrently with
1937	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
1938	 * be modifying this txg's alloctree, freetree, freed_tree, or
1939	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
1940	 * space_map ASSERTs. We drop it whenever we call into the DMU,
1941	 * because the DMU can call down to us (e.g. via zio_free()) at
1942	 * any time.
1943	 */
1944
1945	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1946
1947	if (msp->ms_sm == NULL) {
1948		uint64_t new_object;
1949
1950		new_object = space_map_alloc(mos, tx);
1951		VERIFY3U(new_object, !=, 0);
1952
1953		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
1954		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
1955		    &msp->ms_lock));
1956		ASSERT(msp->ms_sm != NULL);
1957	}
1958
1959	mutex_enter(&msp->ms_lock);
1960
1961	/*
1962	 * Note: metaslab_condense() clears the space_map's histogram.
1963	 * Therefore we must verify and remove this histogram before
1964	 * condensing.
1965	 */
1966	metaslab_group_histogram_verify(mg);
1967	metaslab_class_histogram_verify(mg->mg_class);
1968	metaslab_group_histogram_remove(mg, msp);
1969
1970	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1971	    metaslab_should_condense(msp)) {
1972		metaslab_condense(msp, txg, tx);
1973	} else {
1974		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1975		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1976	}
1977
1978	if (msp->ms_loaded) {
1979		/*
1980		 * When the space map is loaded, we have an accurate
1981		 * histogram in the range tree. This gives us an opportunity
1982		 * to bring the space map's histogram up-to-date, so we clear
1983		 * it before updating it.
1984		 */
1985		space_map_histogram_clear(msp->ms_sm);
1986		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1987	} else {
1988		/*
1989		 * Since the space map is not loaded we simply update the
1990		 * Since the space map is not loaded, we simply update the
1991		 * existing histogram with what was freed in this txg. This
1992		 * view of the free space but it's close enough to allow
1993		 * us to make allocation decisions.
1994		 */
1995		space_map_histogram_add(msp->ms_sm, *freetree, tx);
1996	}
1997	metaslab_group_histogram_add(mg, msp);
1998	metaslab_group_histogram_verify(mg);
1999	metaslab_class_histogram_verify(mg->mg_class);
2000
2001	/*
2002	 * For sync pass 1, we avoid traversing this txg's free range tree
2003	 * and instead will just swap the pointers for freetree and
2004	 * freed_tree. We can safely do this since the freed_tree is
2005	 * guaranteed to be empty on the initial pass.
2006	 */
2007	if (spa_sync_pass(spa) == 1) {
2008		range_tree_swap(freetree, freed_tree);
2009	} else {
2010		range_tree_vacate(*freetree, range_tree_add, *freed_tree);
2011	}
2012	range_tree_vacate(alloctree, NULL, NULL);
2013
2014	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2015	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
2016
2017	mutex_exit(&msp->ms_lock);
2018
2019	if (object != space_map_object(msp->ms_sm)) {
2020		object = space_map_object(msp->ms_sm);
2021		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2022		    msp->ms_id, sizeof (uint64_t), &object, tx);
2023	}
2024	dmu_tx_commit(tx);
2025}
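/*
 * Worked example (illustrative, assuming the usual definitions of
 * TXG_SIZE == 4 and TXG_CLEAN(txg) == (txg) - 1): while syncing txg 42,
 * metaslab_sync() operates on the alloctree/freetree in slot 42 & 3 == 2,
 * while freed_tree refers to slot 41 & 3 == 1.  On sync pass 1 the two
 * freetree pointers are simply swapped, so the frees recorded during txg 42
 * become visible to metaslab_sync_done() as the freed tree without copying
 * any range segments; later passes fall back to vacating entry by entry.
 */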
2026
2027/*
2028 * Called after a transaction group has completely synced to mark
2029 * all of the metaslab's free space as usable.
2030 */
2031void
2032metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2033{
2034	metaslab_group_t *mg = msp->ms_group;
2035	vdev_t *vd = mg->mg_vd;
2036	range_tree_t **freed_tree;
2037	range_tree_t **defer_tree;
2038	int64_t alloc_delta, defer_delta;
2039
2040	ASSERT(!vd->vdev_ishole);
2041
2042	mutex_enter(&msp->ms_lock);
2043
2044	/*
2045	 * If this metaslab is just becoming available, initialize its
2046	 * alloctrees, freetrees, and defertree and add its capacity to
2047	 * the vdev.
2048	 */
2049	if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
2050		for (int t = 0; t < TXG_SIZE; t++) {
2051			ASSERT(msp->ms_alloctree[t] == NULL);
2052			ASSERT(msp->ms_freetree[t] == NULL);
2053
2054			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
2055			    &msp->ms_lock);
2056			msp->ms_freetree[t] = range_tree_create(NULL, msp,
2057			    &msp->ms_lock);
2058		}
2059
2060		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2061			ASSERT(msp->ms_defertree[t] == NULL);
2062
2063			msp->ms_defertree[t] = range_tree_create(NULL, msp,
2064			    &msp->ms_lock);
2065		}
2066
2067		vdev_space_update(vd, 0, 0, msp->ms_size);
2068	}
2069
2070	freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
2071	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
2072
2073	alloc_delta = space_map_alloc_delta(msp->ms_sm);
2074	defer_delta = range_tree_space(*freed_tree) -
2075	    range_tree_space(*defer_tree);
2076
2077	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2078
2079	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2080	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
2081
2082	/*
2083	 * If there's a metaslab_load() in progress, wait for it to complete
2084	 * so that we have a consistent view of the in-core space map.
2085	 */
2086	metaslab_load_wait(msp);
2087
2088	/*
2089	 * Move the frees from the defer_tree back to the free
2090	 * range tree (if it's loaded). Swap the freed_tree and the
2091	 * defer_tree -- this is safe to do because we've just emptied out
2092	 * the defer_tree.
2093	 */
2094	range_tree_vacate(*defer_tree,
2095	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2096	range_tree_swap(freed_tree, defer_tree);
2097
2098	space_map_update(msp->ms_sm);
2099
2100	msp->ms_deferspace += defer_delta;
2101	ASSERT3S(msp->ms_deferspace, >=, 0);
2102	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2103	if (msp->ms_deferspace != 0) {
2104		/*
2105		 * Keep syncing this metaslab until all deferred frees
2106		 * are back in circulation.
2107		 */
2108		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2109	}
2110
2111	if (msp->ms_loaded && msp->ms_access_txg < txg) {
2112		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2113			VERIFY0(range_tree_space(
2114			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
2115		}
2116
2117		if (!metaslab_debug_unload)
2118			metaslab_unload(msp);
2119	}
2120
2121	metaslab_group_sort(mg, msp, metaslab_weight(msp));
2122	mutex_exit(&msp->ms_lock);
2123}
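/*
 * Worked example (illustrative, assuming TXG_DEFER_SIZE == 2): a block
 * freed while txg 100 is open ends up in that txg's freed tree, which
 * metaslab_sync_done() swaps into defer slot 100 % 2 == 0.  The space is
 * not returned to ms_tree (and therefore cannot be reallocated) until slot
 * 0 comes around again and is vacated at the end of txg 102, giving the
 * pool a window in which the old on-disk contents remain intact.
 */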
2124
2125void
2126metaslab_sync_reassess(metaslab_group_t *mg)
2127{
2128	metaslab_group_alloc_update(mg);
2129	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2130
2131	/*
2132	 * Preload the next potential metaslabs
2133	 */
2134	metaslab_group_preload(mg);
2135}
2136
2137static uint64_t
2138metaslab_distance(metaslab_t *msp, dva_t *dva)
2139{
2140	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2141	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2142	uint64_t start = msp->ms_id;
2143
2144	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2145		return (1ULL << 63);
2146
2147	if (offset < start)
2148		return ((start - offset) << ms_shift);
2149	if (offset > start)
2150		return ((offset - start) << ms_shift);
2151	return (0);
2152}
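/*
 * Worked example (illustrative): with vdev_ms_shift == 30 (1GB metaslabs,
 * a hypothetical value), a DVA that falls in metaslab 10 of this vdev and
 * a candidate metaslab with ms_id == 14 yield a distance of
 * (14 - 10) << 30 == 4GB.  A DVA on any other vdev returns 1ULL << 63, so
 * it never constrains placement on this vdev.
 */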
2153
2154static uint64_t
2155metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
2156    uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2157{
2158	spa_t *spa = mg->mg_vd->vdev_spa;
2159	metaslab_t *msp = NULL;
2160	uint64_t offset = -1ULL;
2161	avl_tree_t *t = &mg->mg_metaslab_tree;
2162	uint64_t activation_weight;
2163	uint64_t target_distance;
2164	int i;
2165
2166	activation_weight = METASLAB_WEIGHT_PRIMARY;
2167	for (i = 0; i < d; i++) {
2168		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2169			activation_weight = METASLAB_WEIGHT_SECONDARY;
2170			break;
2171		}
2172	}
2173
2174	for (;;) {
2175		boolean_t was_active;
2176
2177		mutex_enter(&mg->mg_lock);
2178		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
2179			if (msp->ms_weight < asize) {
2180				spa_dbgmsg(spa, "%s: failed to meet weight "
2181				    "requirement: vdev %llu, txg %llu, mg %p, "
2182				    "msp %p, psize %llu, asize %llu, "
2183				    "weight %llu", spa_name(spa),
2184				    mg->mg_vd->vdev_id, txg,
2185				    mg, msp, psize, asize, msp->ms_weight);
2186				mutex_exit(&mg->mg_lock);
2187				return (-1ULL);
2188			}
2189
2190			/*
2191			 * If the selected metaslab is condensing, skip it.
2192			 */
2193			if (msp->ms_condensing)
2194				continue;
2195
2196			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2197			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2198				break;
2199
2200			target_distance = min_distance +
2201			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
2202			    min_distance >> 1);
2203
2204			for (i = 0; i < d; i++)
2205				if (metaslab_distance(msp, &dva[i]) <
2206				    target_distance)
2207					break;
2208			if (i == d)
2209				break;
2210		}
2211		mutex_exit(&mg->mg_lock);
2212		if (msp == NULL)
2213			return (-1ULL);
2214
2215		mutex_enter(&msp->ms_lock);
2216
2217		/*
2218		 * Ensure that the metaslab we have selected is still
2219		 * capable of handling our request. It's possible that
2220		 * another thread may have changed the weight while we
2221		 * were blocked on the metaslab lock.
2222		 */
2223		if (msp->ms_weight < asize || (was_active &&
2224		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
2225		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
2226			mutex_exit(&msp->ms_lock);
2227			continue;
2228		}
2229
2230		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2231		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
2232			metaslab_passivate(msp,
2233			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2234			mutex_exit(&msp->ms_lock);
2235			continue;
2236		}
2237
2238		if (metaslab_activate(msp, activation_weight) != 0) {
2239			mutex_exit(&msp->ms_lock);
2240			continue;
2241		}
2242
2243		/*
2244		 * If this metaslab is currently condensing then pick again as
2245		 * we can't manipulate this metaslab until it's committed
2246		 * to disk.
2247		 */
2248		if (msp->ms_condensing) {
2249			mutex_exit(&msp->ms_lock);
2250			continue;
2251		}
2252
2253		if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
2254			break;
2255
2256		metaslab_passivate(msp, metaslab_block_maxsize(msp));
2257		mutex_exit(&msp->ms_lock);
2258	}
2259
2260	if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2261		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2262
2263	range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
2264	msp->ms_access_txg = txg + metaslab_unload_delay;
2265
2266	mutex_exit(&msp->ms_lock);
2267
2268	return (offset);
2269}
2270
2271/*
2272 * Allocate a block for the specified i/o.
2273 */
2274static int
2275metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2276    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
2277{
2278	metaslab_group_t *mg, *rotor;
2279	vdev_t *vd;
2280	int dshift = 3;
2281	int all_zero;
2282	int zio_lock = B_FALSE;
2283	boolean_t allocatable;
2284	uint64_t offset = -1ULL;
2285	uint64_t asize;
2286	uint64_t distance;
2287
2288	ASSERT(!DVA_IS_VALID(&dva[d]));
2289
2290	/*
2291	 * For testing, make some blocks above a certain size be gang blocks.
2292	 */
2293	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
2294		return (SET_ERROR(ENOSPC));
2295
2296	/*
2297	 * Start at the rotor and loop through all mgs until we find something.
2298	 * Note that there's no locking on mc_rotor or mc_aliquot because
2299	 * nothing actually breaks if we miss a few updates -- we just won't
2300	 * allocate quite as evenly.  It all balances out over time.
2301	 *
2302	 * If we are doing ditto or log blocks, try to spread them across
2303	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
2304	 * allocated all of our ditto blocks, then try and spread them out on
2305	 * that vdev as much as possible.  If it turns out to not be possible,
2306	 * gradually lower our standards until anything becomes acceptable.
2307	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
2308	 * gives us hope of containing our fault domains to something we're
2309	 * able to reason about.  Otherwise, any two top-level vdev failures
2310	 * will guarantee the loss of data.  With consecutive allocation,
2311	 * only two adjacent top-level vdev failures will result in data loss.
2312	 *
2313	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
2314	 * ourselves on the same vdev as our gang block header.  That
2315	 * way, we can hope for locality in vdev_cache, plus it makes our
2316	 * fault domains something tractable.
2317	 */
2318	if (hintdva) {
2319		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
2320
2321		/*
2322		 * It's possible the vdev we're using as the hint no
2323		 * longer exists (i.e. removed). Consult the rotor when
2324		 * all else fails.
2325		 */
2326		if (vd != NULL) {
2327			mg = vd->vdev_mg;
2328
2329			if (flags & METASLAB_HINTBP_AVOID &&
2330			    mg->mg_next != NULL)
2331				mg = mg->mg_next;
2332		} else {
2333			mg = mc->mc_rotor;
2334		}
2335	} else if (d != 0) {
2336		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
2337		mg = vd->vdev_mg->mg_next;
2338	} else {
2339		mg = mc->mc_rotor;
2340	}
2341
2342	/*
2343	 * If the hint put us into the wrong metaslab class, or into a
2344	 * metaslab group that has been passivated, just follow the rotor.
2345	 */
2346	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
2347		mg = mc->mc_rotor;
2348
2349	rotor = mg;
2350top:
2351	all_zero = B_TRUE;
2352	do {
2353		ASSERT(mg->mg_activation_count == 1);
2354
2355		vd = mg->mg_vd;
2356
2357		/*
2358		 * Don't allocate from faulted devices.
2359		 */
2360		if (zio_lock) {
2361			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
2362			allocatable = vdev_allocatable(vd);
2363			spa_config_exit(spa, SCL_ZIO, FTAG);
2364		} else {
2365			allocatable = vdev_allocatable(vd);
2366		}
2367
2368		/*
2369		 * Determine if the selected metaslab group is eligible
2370		 * for allocations. If we're ganging or have requested
2371		 * an allocation for the smallest gang block size
2372		 * then we don't want to avoid allocating to this
2373		 * metaslab group. If we're in this condition, we should
2374		 * try to allocate from any device possible so that we
2375		 * don't inadvertently return ENOSPC and suspend the pool
2376		 * even though space is still available.
2377		 */
2378		if (allocatable && CAN_FASTGANG(flags) &&
2379		    psize > SPA_GANGBLOCKSIZE)
2380			allocatable = metaslab_group_allocatable(mg);
2381
2382		if (!allocatable)
2383			goto next;
2384
2385		/*
2386		 * Avoid writing single-copy data to a failing vdev
2387		 * unless the user instructs us that it is okay.
2388		 */
2389		if ((vd->vdev_stat.vs_write_errors > 0 ||
2390		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
2391		    d == 0 && dshift == 3 && vd->vdev_children == 0) {
2392			all_zero = B_FALSE;
2393			goto next;
2394		}
2395
2396		ASSERT(mg->mg_class == mc);
2397
2398		distance = vd->vdev_asize >> dshift;
2399		if (distance <= (1ULL << vd->vdev_ms_shift))
2400			distance = 0;
2401		else
2402			all_zero = B_FALSE;
2403
2404		asize = vdev_psize_to_asize(vd, psize);
2405		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
2406
2407		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
2408		    dva, d);
2409		if (offset != -1ULL) {
2410			/*
2411			 * If we've just selected this metaslab group,
2412			 * figure out whether the corresponding vdev is
2413			 * over- or under-used relative to the pool,
2414			 * and set an allocation bias to even it out.
2415			 */
2416			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
2417				vdev_stat_t *vs = &vd->vdev_stat;
2418				int64_t vu, cu;
2419
2420				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
2421				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
2422
2423				/*
2424				 * Calculate how much more or less we should
2425				 * try to allocate from this device during
2426				 * this iteration around the rotor.
2427				 * For example, if a device is 80% full
2428				 * and the pool is 20% full then we should
2429				 * reduce allocations by 60% on this device.
2430				 *
2431				 * mg_bias = (20 - 80) * 512K / 100 = -307K
2432				 *
2433				 * This reduces allocations by 307K for this
2434				 * iteration.
2435				 */
2436				mg->mg_bias = ((cu - vu) *
2437				    (int64_t)mg->mg_aliquot) / 100;
2438			} else if (!metaslab_bias_enabled) {
2439				mg->mg_bias = 0;
2440			}
2441
2442			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
2443			    mg->mg_aliquot + mg->mg_bias) {
2444				mc->mc_rotor = mg->mg_next;
2445				mc->mc_aliquot = 0;
2446			}
2447
2448			DVA_SET_VDEV(&dva[d], vd->vdev_id);
2449			DVA_SET_OFFSET(&dva[d], offset);
2450			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
2451			DVA_SET_ASIZE(&dva[d], asize);
2452
2453			return (0);
2454		}
2455next:
2456		mc->mc_rotor = mg->mg_next;
2457		mc->mc_aliquot = 0;
2458	} while ((mg = mg->mg_next) != rotor);
2459
2460	if (!all_zero) {
2461		dshift++;
2462		ASSERT(dshift < 64);
2463		goto top;
2464	}
2465
2466	if (!allocatable && !zio_lock) {
2467		dshift = 3;
2468		zio_lock = B_TRUE;
2469		goto top;
2470	}
2471
2472	bzero(&dva[d], sizeof (dva_t));
2473
2474	return (SET_ERROR(ENOSPC));
2475}
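/*
 * Illustrative sketch (not part of the original source): the allocation
 * bias computed above is plain integer arithmetic over the device and
 * class utilization percentages.  The hypothetical, compiled-out helper
 * below reproduces the example from the comment in metaslab_alloc_dva():
 * a device that is 80% full in a pool that is 20% full, with a 512K
 * aliquot, yields (20 - 80) * 524288 / 100 == -314572 bytes (about -307K).
 */
#if 0
static int64_t
metaslab_bias_sketch(int64_t vdev_pct_full, int64_t class_pct_full,
    int64_t aliquot)
{
	/* e.g. (20 - 80) * (512 << 10) / 100 == -314572 (~ -307K) */
	return ((class_pct_full - vdev_pct_full) * aliquot / 100);
}
#endif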
2476
2477/*
2478 * Free the block represented by DVA in the context of the specified
2479 * transaction group.
2480 */
2481static void
2482metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
2483{
2484	uint64_t vdev = DVA_GET_VDEV(dva);
2485	uint64_t offset = DVA_GET_OFFSET(dva);
2486	uint64_t size = DVA_GET_ASIZE(dva);
2487	vdev_t *vd;
2488	metaslab_t *msp;
2489
2490	ASSERT(DVA_IS_VALID(dva));
2491
2492	if (txg > spa_freeze_txg(spa))
2493		return;
2494
2495	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2496	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
2497		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
2498		    (u_longlong_t)vdev, (u_longlong_t)offset);
2499		ASSERT(0);
2500		return;
2501	}
2502
2503	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2504
2505	if (DVA_GET_GANG(dva))
2506		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2507
2508	mutex_enter(&msp->ms_lock);
2509
2510	if (now) {
2511		range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
2512		    offset, size);
2513
2514		VERIFY(!msp->ms_condensing);
2515		VERIFY3U(offset, >=, msp->ms_start);
2516		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
2517		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
2518		    msp->ms_size);
2519		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2520		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2521		range_tree_add(msp->ms_tree, offset, size);
2522	} else {
2523		if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
2524			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2525		range_tree_add(msp->ms_freetree[txg & TXG_MASK],
2526		    offset, size);
2527	}
2528
2529	mutex_exit(&msp->ms_lock);
2530}
2531
2532/*
2533 * Intent log support: upon opening the pool after a crash, notify the SPA
2534 * of blocks that the intent log has allocated for immediate write, but
2535 * which are still considered free by the SPA because the last transaction
2536 * group didn't commit yet.
2537 */
2538static int
2539metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
2540{
2541	uint64_t vdev = DVA_GET_VDEV(dva);
2542	uint64_t offset = DVA_GET_OFFSET(dva);
2543	uint64_t size = DVA_GET_ASIZE(dva);
2544	vdev_t *vd;
2545	metaslab_t *msp;
2546	int error = 0;
2547
2548	ASSERT(DVA_IS_VALID(dva));
2549
2550	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2551	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
2552		return (SET_ERROR(ENXIO));
2553
2554	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2555
2556	if (DVA_GET_GANG(dva))
2557		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2558
2559	mutex_enter(&msp->ms_lock);
2560
2561	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
2562		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
2563
2564	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
2565		error = SET_ERROR(ENOENT);
2566
2567	if (error || txg == 0) {	/* txg == 0 indicates dry run */
2568		mutex_exit(&msp->ms_lock);
2569		return (error);
2570	}
2571
2572	VERIFY(!msp->ms_condensing);
2573	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2574	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2575	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
2576	range_tree_remove(msp->ms_tree, offset, size);
2577
2578	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
2579		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2580			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2581		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2582	}
2583
2584	mutex_exit(&msp->ms_lock);
2585
2586	return (0);
2587}
2588
2589int
2590metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2591    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
2592{
2593	dva_t *dva = bp->blk_dva;
2594	dva_t *hintdva = hintbp->blk_dva;
2595	int error = 0;
2596
2597	ASSERT(bp->blk_birth == 0);
2598	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2599
2600	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2601
2602	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
2603		spa_config_exit(spa, SCL_ALLOC, FTAG);
2604		return (SET_ERROR(ENOSPC));
2605	}
2606
2607	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
2608	ASSERT(BP_GET_NDVAS(bp) == 0);
2609	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2610
2611	for (int d = 0; d < ndvas; d++) {
2612		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2613		    txg, flags);
2614		if (error != 0) {
2615			for (d--; d >= 0; d--) {
2616				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2617				bzero(&dva[d], sizeof (dva_t));
2618			}
2619			spa_config_exit(spa, SCL_ALLOC, FTAG);
2620			return (error);
2621		}
2622	}
2623	ASSERT(error == 0);
2624	ASSERT(BP_GET_NDVAS(bp) == ndvas);
2625
2626	spa_config_exit(spa, SCL_ALLOC, FTAG);
2627
2628	BP_SET_BIRTH(bp, txg, txg);
2629
2630	return (0);
2631}
2632
2633void
2634metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
2635{
2636	const dva_t *dva = bp->blk_dva;
2637	int ndvas = BP_GET_NDVAS(bp);
2638
2639	ASSERT(!BP_IS_HOLE(bp));
2640	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
2641
2642	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
2643
2644	for (int d = 0; d < ndvas; d++)
2645		metaslab_free_dva(spa, &dva[d], txg, now);
2646
2647	spa_config_exit(spa, SCL_FREE, FTAG);
2648}
2649
2650int
2651metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
2652{
2653	const dva_t *dva = bp->blk_dva;
2654	int ndvas = BP_GET_NDVAS(bp);
2655	int error = 0;
2656
2657	ASSERT(!BP_IS_HOLE(bp));
2658
2659	if (txg != 0) {
2660		/*
2661		 * First do a dry run to make sure all DVAs are claimable,
2662		 * so we don't have to unwind from partial failures below.
2663		 */
2664		if ((error = metaslab_claim(spa, bp, 0)) != 0)
2665			return (error);
2666	}
2667
2668	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2669
2670	for (int d = 0; d < ndvas; d++)
2671		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
2672			break;
2673
2674	spa_config_exit(spa, SCL_ALLOC, FTAG);
2675
2676	ASSERT(error == 0 || txg == 0);
2677
2678	return (error);
2679}
2680
2681void
2682metaslab_check_free(spa_t *spa, const blkptr_t *bp)
2683{
2684	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
2685		return;
2686
2687	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2688	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
2689		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
2690		vdev_t *vd = vdev_lookup_top(spa, vdev);
2691		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
2692		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2693		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2694
2695		if (msp->ms_loaded)
2696			range_tree_verify(msp->ms_tree, offset, size);
2697
2698		for (int j = 0; j < TXG_SIZE; j++)
2699			range_tree_verify(msp->ms_freetree[j], offset, size);
2700		for (int j = 0; j < TXG_DEFER_SIZE; j++)
2701			range_tree_verify(msp->ms_defertree[j], offset, size);
2702	}
2703	spa_config_exit(spa, SCL_VDEV, FTAG);
2704}
2705