metaslab.c revision 331395
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/dmu.h>
30#include <sys/dmu_tx.h>
31#include <sys/space_map.h>
32#include <sys/metaslab_impl.h>
33#include <sys/vdev_impl.h>
34#include <sys/zio.h>
35#include <sys/spa_impl.h>
36#include <sys/zfeature.h>
37
38SYSCTL_DECL(_vfs_zfs);
39SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
40
41#define	GANG_ALLOCATION(flags) \
42	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
43
44uint64_t metaslab_aliquot = 512ULL << 10;
45uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
46SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
47    &metaslab_gang_bang, 0,
48    "Force gang block allocation for blocks larger than or equal to this value");
49
50/*
51 * The in-core space map representation is more compact than its on-disk form.
52 * The zfs_condense_pct determines how much more compact the in-core
53 * space map representation must be before we compact it on-disk.
54 * Values should be greater than or equal to 100.
55 */
56int zfs_condense_pct = 200;
57SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
58    &zfs_condense_pct, 0,
59    "Condense on-disk spacemap when it is more than this many percent"
60    " of the in-memory counterpart");
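/*
 * For example, with the default zfs_condense_pct of 200, an on-disk space
 * map is only condensed once it is at least twice the size of its optimal
 * in-core representation.
 */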
61
62/*
63 * Condensing a metaslab is not guaranteed to actually reduce the amount of
64 * space used on disk. In particular, a space map uses data in increments of
65 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
66 * same number of blocks after condensing. Since the goal of condensing is to
67 * reduce the number of IOPs required to read the space map, we only want to
68 * condense when we can be sure we will reduce the number of blocks used by the
69 * space map. Unfortunately, we cannot precisely compute whether or not this is
70 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
71 * we apply the following heuristic: do not condense a spacemap unless the
72 * uncondensed size consumes more than zfs_metaslab_condense_block_threshold
73 * blocks.
74 */
75int zfs_metaslab_condense_block_threshold = 4;
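/*
 * For example, with the default threshold of 4 and assuming a 4K space map
 * block size, a space map occupying 16K or less on disk is never condensed,
 * because condensing it cannot meaningfully reduce the number of blocks read.
 */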
76
77/*
78 * The zfs_mg_noalloc_threshold defines which metaslab groups should
79 * be eligible for allocation. The value is defined as a percentage of
80 * free space. Metaslab groups that have more free space than
81 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
82 * a metaslab group's free space is less than or equal to the
83 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
84 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
85 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
86 * groups are allowed to accept allocations. Gang blocks are always
87 * eligible to allocate on any metaslab group. The default value of 0 means
88 * no metaslab group will be excluded based on this criterion.
89 */
90int zfs_mg_noalloc_threshold = 0;
91SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
92    &zfs_mg_noalloc_threshold, 0,
93    "Percentage of metaslab group size that should be free"
94    " to make it eligible for allocation");
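/*
 * For example, if zfs_mg_noalloc_threshold were set to 30, a metaslab group
 * with only 25% free space would be skipped for (non-gang) allocations as
 * long as at least one other group in the pool is still above 30% free.
 */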
95
96/*
97 * Metaslab groups are considered eligible for allocations if their
98 * fragmentation metric (measured as a percentage) is less than or equal to
99 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
100 * then it will be skipped unless all metaslab groups within the metaslab
101 * class have also crossed this threshold.
102 */
103int zfs_mg_fragmentation_threshold = 85;
104SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
105    &zfs_mg_fragmentation_threshold, 0,
106    "Maximum fragmentation percentage at which a metaslab group is still "
107    "considered eligible for allocations, unless all metaslab groups within "
108    "the metaslab class have also crossed this threshold");
109
110/*
111 * Allow metaslabs to keep their active state as long as their fragmentation
112 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
113 * active metaslab that exceeds this threshold will no longer keep its active
114 * status allowing better metaslabs to be selected.
115 */
116int zfs_metaslab_fragmentation_threshold = 70;
117SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
118    &zfs_metaslab_fragmentation_threshold, 0,
119    "Maximum fragmentation percentage at which a metaslab keeps its active state");
120
121/*
122 * When set will load all metaslabs when pool is first opened.
123 */
124int metaslab_debug_load = 0;
125SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
126    &metaslab_debug_load, 0,
127    "Load all metaslabs when pool is first opened");
128
129/*
130 * When set will prevent metaslabs from being unloaded.
131 */
132int metaslab_debug_unload = 0;
133SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
134    &metaslab_debug_unload, 0,
135    "Prevent metaslabs from being unloaded");
136
137/*
138 * Minimum size which forces the dynamic allocator to change
139 * its allocation strategy.  Once the space map cannot satisfy
140 * an allocation of this size then it switches to using a more
141 * aggressive strategy (i.e. search by size rather than offset).
142 */
143uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
144SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
145    &metaslab_df_alloc_threshold, 0,
146    "Minimum size which forces the dynamic allocator to change its allocation strategy");
147
148/*
149 * The minimum free space, in percent, which must be available
150 * in a space map to continue allocations in a first-fit fashion.
151 * Once the space map's free space drops below this level we dynamically
152 * switch to using best-fit allocations.
153 */
154int metaslab_df_free_pct = 4;
155SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
156    &metaslab_df_free_pct, 0,
157    "The minimum free space, in percent, which must be available in a "
158    "space map to continue allocations in a first-fit fashion");
159
160/*
161 * A metaslab is considered "free" if it contains a contiguous
162 * segment which is greater than metaslab_min_alloc_size.
163 */
164uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
165SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
166    &metaslab_min_alloc_size, 0,
167    "A metaslab is considered \"free\" if it contains a contiguous "
168    "segment which is greater than vfs.zfs.metaslab.min_alloc_size");
169
170/*
171 * Percentage of all cpus that can be used by the metaslab taskq.
172 */
173int metaslab_load_pct = 50;
174SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
175    &metaslab_load_pct, 0,
176    "Percentage of cpus that can be used by the metaslab taskq");
177
178/*
179 * Determines how many txgs a metaslab may remain loaded without having any
180 * allocations from it. As long as a metaslab continues to be used we will
181 * keep it loaded.
182 */
183int metaslab_unload_delay = TXG_SIZE * 2;
184SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
185    &metaslab_unload_delay, 0,
186    "Number of TXGs that an unused metaslab can be kept in memory");
187
188/*
189 * Max number of metaslabs per group to preload.
190 */
191int metaslab_preload_limit = SPA_DVAS_PER_BP;
192SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
193    &metaslab_preload_limit, 0,
194    "Max number of metaslabs per group to preload");
195
196/*
197 * Enable/disable preloading of metaslab.
198 */
199boolean_t metaslab_preload_enabled = B_TRUE;
200SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
201    &metaslab_preload_enabled, 0,
202    "Enable/disable preloading of metaslabs");
203
204/*
205 * Enable/disable fragmentation weighting on metaslabs.
206 */
207boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
208SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
209    &metaslab_fragmentation_factor_enabled, 0,
210    "Enable fragmentation weighting on metaslabs");
211
212/*
213 * Enable/disable lba weighting (i.e. outer tracks are given preference).
214 */
215boolean_t metaslab_lba_weighting_enabled = B_TRUE;
216SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
217    &metaslab_lba_weighting_enabled, 0,
218    "Enable LBA weighting (i.e. outer tracks are given preference)");
219
220/*
221 * Enable/disable metaslab group biasing.
222 */
223boolean_t metaslab_bias_enabled = B_TRUE;
224SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
225    &metaslab_bias_enabled, 0,
226    "Enable metaslab group biasing");
227
228/*
229 * Enable/disable segment-based metaslab selection.
230 */
231boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
232
233/*
234 * When using segment-based metaslab selection, we will continue
235 * allocating from the active metaslab until we have exhausted
236 * zfs_metaslab_switch_threshold of its buckets.
237 */
238int zfs_metaslab_switch_threshold = 2;
239
240/*
241 * Internal switch to enable/disable the metaslab allocation tracing
242 * facility.
243 */
244boolean_t metaslab_trace_enabled = B_TRUE;
245
246/*
247 * Maximum entries that the metaslab allocation tracing facility will keep
248 * in a given list when running in non-debug mode. We limit the number
249 * of entries in non-debug mode to prevent us from using up too much memory.
250 * The limit should be sufficiently large that we don't expect any allocation
251 * to every exceed this value. In debug mode, the system will panic if this
252 * limit is ever reached allowing for further investigation.
253 */
254uint64_t metaslab_trace_max_entries = 5000;
255
256static uint64_t metaslab_weight(metaslab_t *);
257static void metaslab_set_fragmentation(metaslab_t *);
258
259kmem_cache_t *metaslab_alloc_trace_cache;
260
261/*
262 * ==========================================================================
263 * Metaslab classes
264 * ==========================================================================
265 */
266metaslab_class_t *
267metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
268{
269	metaslab_class_t *mc;
270
271	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
272
273	mc->mc_spa = spa;
274	mc->mc_rotor = NULL;
275	mc->mc_ops = ops;
276	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
277	refcount_create_tracked(&mc->mc_alloc_slots);
278
279	return (mc);
280}
281
282void
283metaslab_class_destroy(metaslab_class_t *mc)
284{
285	ASSERT(mc->mc_rotor == NULL);
286	ASSERT(mc->mc_alloc == 0);
287	ASSERT(mc->mc_deferred == 0);
288	ASSERT(mc->mc_space == 0);
289	ASSERT(mc->mc_dspace == 0);
290
291	refcount_destroy(&mc->mc_alloc_slots);
292	mutex_destroy(&mc->mc_lock);
293	kmem_free(mc, sizeof (metaslab_class_t));
294}
295
296int
297metaslab_class_validate(metaslab_class_t *mc)
298{
299	metaslab_group_t *mg;
300	vdev_t *vd;
301
302	/*
303	 * Must hold one of the spa_config locks.
304	 */
305	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
306	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
307
308	if ((mg = mc->mc_rotor) == NULL)
309		return (0);
310
311	do {
312		vd = mg->mg_vd;
313		ASSERT(vd->vdev_mg != NULL);
314		ASSERT3P(vd->vdev_top, ==, vd);
315		ASSERT3P(mg->mg_class, ==, mc);
316		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
317	} while ((mg = mg->mg_next) != mc->mc_rotor);
318
319	return (0);
320}
321
322void
323metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
324    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
325{
326	atomic_add_64(&mc->mc_alloc, alloc_delta);
327	atomic_add_64(&mc->mc_deferred, defer_delta);
328	atomic_add_64(&mc->mc_space, space_delta);
329	atomic_add_64(&mc->mc_dspace, dspace_delta);
330}
331
332void
333metaslab_class_minblocksize_update(metaslab_class_t *mc)
334{
335	metaslab_group_t *mg;
336	vdev_t *vd;
337	uint64_t minashift = UINT64_MAX;
338
339	if ((mg = mc->mc_rotor) == NULL) {
340		mc->mc_minblocksize = SPA_MINBLOCKSIZE;
341		return;
342	}
343
344	do {
345		vd = mg->mg_vd;
346		if (vd->vdev_ashift < minashift)
347			minashift = vd->vdev_ashift;
348	} while ((mg = mg->mg_next) != mc->mc_rotor);
349
350	mc->mc_minblocksize = 1ULL << minashift;
351}
352
353uint64_t
354metaslab_class_get_alloc(metaslab_class_t *mc)
355{
356	return (mc->mc_alloc);
357}
358
359uint64_t
360metaslab_class_get_deferred(metaslab_class_t *mc)
361{
362	return (mc->mc_deferred);
363}
364
365uint64_t
366metaslab_class_get_space(metaslab_class_t *mc)
367{
368	return (mc->mc_space);
369}
370
371uint64_t
372metaslab_class_get_dspace(metaslab_class_t *mc)
373{
374	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
375}
376
377uint64_t
378metaslab_class_get_minblocksize(metaslab_class_t *mc)
379{
380	return (mc->mc_minblocksize);
381}
382
383void
384metaslab_class_histogram_verify(metaslab_class_t *mc)
385{
386	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
387	uint64_t *mc_hist;
388	int i;
389
390	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
391		return;
392
393	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
394	    KM_SLEEP);
395
396	for (int c = 0; c < rvd->vdev_children; c++) {
397		vdev_t *tvd = rvd->vdev_child[c];
398		metaslab_group_t *mg = tvd->vdev_mg;
399
400		/*
401		 * Skip any holes, uninitialized top-levels, or
402		 * vdevs that are not in this metaslab class.
403		 */
404		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
405		    mg->mg_class != mc) {
406			continue;
407		}
408
409		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
410			mc_hist[i] += mg->mg_histogram[i];
411	}
412
413	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
414		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
415
416	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
417}
418
419/*
420 * Calculate the metaslab class's fragmentation metric. The metric
421 * is weighted based on the space contribution of each metaslab group.
422 * The return value will be a number between 0 and 100 (inclusive), or
423 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
424 * zfs_frag_table for more information about the metric.
425 */
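/*
 * For example, a class containing a 1TB metaslab group at 20% fragmentation
 * and a 3TB group at 60% fragmentation reports a weighted class-wide
 * fragmentation of (1T * 20 + 3T * 60) / 4T = 50.
 */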
426uint64_t
427metaslab_class_fragmentation(metaslab_class_t *mc)
428{
429	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
430	uint64_t fragmentation = 0;
431
432	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
433
434	for (int c = 0; c < rvd->vdev_children; c++) {
435		vdev_t *tvd = rvd->vdev_child[c];
436		metaslab_group_t *mg = tvd->vdev_mg;
437
438		/*
439		 * Skip any holes, uninitialized top-levels, or
440		 * vdevs that are not in this metaslab class.
441		 */
442		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
443		    mg->mg_class != mc) {
444			continue;
445		}
446
447		/*
448		 * If a metaslab group does not contain a fragmentation
449		 * metric then just bail out.
450		 */
451		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
452			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
453			return (ZFS_FRAG_INVALID);
454		}
455
456		/*
457		 * Determine how much this metaslab_group is contributing
458		 * to the overall pool fragmentation metric.
459		 */
460		fragmentation += mg->mg_fragmentation *
461		    metaslab_group_get_space(mg);
462	}
463	fragmentation /= metaslab_class_get_space(mc);
464
465	ASSERT3U(fragmentation, <=, 100);
466	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
467	return (fragmentation);
468}
469
470/*
471 * Calculate the amount of expandable space that is available in
472 * this metaslab class. If a device is expanded then its expandable
473 * space will be the amount of allocatable space that is currently not
474 * part of this metaslab class.
475 */
476uint64_t
477metaslab_class_expandable_space(metaslab_class_t *mc)
478{
479	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
480	uint64_t space = 0;
481
482	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
483	for (int c = 0; c < rvd->vdev_children; c++) {
484		uint64_t tspace;
485		vdev_t *tvd = rvd->vdev_child[c];
486		metaslab_group_t *mg = tvd->vdev_mg;
487
488		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
489		    mg->mg_class != mc) {
490			continue;
491		}
492
493		/*
494		 * Calculate if we have enough space to add additional
495		 * metaslabs. We report the expandable space in terms
496		 * of the metaslab size since that's the unit of expansion.
497		 * Adjust by efi system partition size.
498		 */
499		tspace = tvd->vdev_max_asize - tvd->vdev_asize;
500		if (tspace > mc->mc_spa->spa_bootsize) {
501			tspace -= mc->mc_spa->spa_bootsize;
502		}
503		space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
504	}
505	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
506	return (space);
507}
508
509static int
510metaslab_compare(const void *x1, const void *x2)
511{
512	const metaslab_t *m1 = x1;
513	const metaslab_t *m2 = x2;
514
515	if (m1->ms_weight < m2->ms_weight)
516		return (1);
517	if (m1->ms_weight > m2->ms_weight)
518		return (-1);
519
520	/*
521	 * If the weights are identical, use the offset to force uniqueness.
522	 */
523	if (m1->ms_start < m2->ms_start)
524		return (-1);
525	if (m1->ms_start > m2->ms_start)
526		return (1);
527
528	ASSERT3P(m1, ==, m2);
529
530	return (0);
531}
532
533/*
534 * Verify that the space accounting on disk matches the in-core range_trees.
535 */
536void
537metaslab_verify_space(metaslab_t *msp, uint64_t txg)
538{
539	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
540	uint64_t allocated = 0;
541	uint64_t sm_free_space, msp_free_space;
542
543	ASSERT(MUTEX_HELD(&msp->ms_lock));
544
545	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
546		return;
547
548	/*
549	 * We can only verify the metaslab space when we're called
550	 * from syncing context with a loaded metaslab that has an allocated
551	 * space map. Calling this in non-syncing context does not
552	 * provide a consistent view of the metaslab since we're performing
553	 * allocations in the future.
554	 */
555	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
556	    !msp->ms_loaded)
557		return;
558
559	sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
560	    space_map_alloc_delta(msp->ms_sm);
561
562	/*
563	 * Account for future allocations since we would have already
564	 * deducted that space from the ms_freetree.
565	 */
566	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
567		allocated +=
568		    range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
569	}
570
571	msp_free_space = range_tree_space(msp->ms_tree) + allocated +
572	    msp->ms_deferspace + range_tree_space(msp->ms_freedtree);
573
574	VERIFY3U(sm_free_space, ==, msp_free_space);
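	/* The on-disk accounting and the in-core range trees must agree. */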
575}
576
577/*
578 * ==========================================================================
579 * Metaslab groups
580 * ==========================================================================
581 */
582/*
583 * Update the allocatable flag and the metaslab group's capacity.
584 * The allocatable flag is set to true if the capacity is below
585 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
586 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
587 * transitions from allocatable to non-allocatable or vice versa then the
588 * metaslab group's class is updated to reflect the transition.
589 */
590static void
591metaslab_group_alloc_update(metaslab_group_t *mg)
592{
593	vdev_t *vd = mg->mg_vd;
594	metaslab_class_t *mc = mg->mg_class;
595	vdev_stat_t *vs = &vd->vdev_stat;
596	boolean_t was_allocatable;
597	boolean_t was_initialized;
598
599	ASSERT(vd == vd->vdev_top);
600
601	mutex_enter(&mg->mg_lock);
602	was_allocatable = mg->mg_allocatable;
603	was_initialized = mg->mg_initialized;
604
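	/*
	 * Express free capacity as a percentage of the group's total space;
	 * the "+ 1" below avoids a division by zero on an empty vdev.
	 */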
605	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
606	    (vs->vs_space + 1);
607
608	mutex_enter(&mc->mc_lock);
609
610	/*
611	 * If the metaslab group was just added then it won't
612	 * have any space until we finish syncing out this txg.
613	 * At that point we will consider it initialized and available
614	 * for allocations.  We also don't consider non-activated
615	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
616	 * to be initialized, because they can't be used for allocation.
617	 */
618	mg->mg_initialized = metaslab_group_initialized(mg);
619	if (!was_initialized && mg->mg_initialized) {
620		mc->mc_groups++;
621	} else if (was_initialized && !mg->mg_initialized) {
622		ASSERT3U(mc->mc_groups, >, 0);
623		mc->mc_groups--;
624	}
625	if (mg->mg_initialized)
626		mg->mg_no_free_space = B_FALSE;
627
628	/*
629	 * A metaslab group is considered allocatable if it has plenty
630	 * of free space or is not heavily fragmented. We only take
631	 * fragmentation into account if the metaslab group has a valid
632	 * fragmentation metric (i.e. a value between 0 and 100).
633	 */
634	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
635	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
636	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
637	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
638
639	/*
640	 * The mc_alloc_groups maintains a count of the number of
641	 * groups in this metaslab class that are still above the
642	 * zfs_mg_noalloc_threshold. This is used by the allocating
643	 * threads to determine if they should avoid allocations to
644	 * a given group. The allocator will avoid allocations to a group
645	 * if that group has reached or is below the zfs_mg_noalloc_threshold
646	 * and there are still other groups that are above the threshold.
647	 * When a group transitions from allocatable to non-allocatable or
648	 * vice versa we update the metaslab class to reflect that change.
649	 * When the mc_alloc_groups value drops to 0 that means that all
650	 * groups have reached the zfs_mg_noalloc_threshold making all groups
651	 * eligible for allocations. This effectively means that all devices
652	 * are balanced again.
653	 */
654	if (was_allocatable && !mg->mg_allocatable)
655		mc->mc_alloc_groups--;
656	else if (!was_allocatable && mg->mg_allocatable)
657		mc->mc_alloc_groups++;
658	mutex_exit(&mc->mc_lock);
659
660	mutex_exit(&mg->mg_lock);
661}
662
663metaslab_group_t *
664metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
665{
666	metaslab_group_t *mg;
667
668	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
669	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
670	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
671	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
672	mg->mg_vd = vd;
673	mg->mg_class = mc;
674	mg->mg_activation_count = 0;
675	mg->mg_initialized = B_FALSE;
676	mg->mg_no_free_space = B_TRUE;
677	refcount_create_tracked(&mg->mg_alloc_queue_depth);
678
679	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
680	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
681
682	return (mg);
683}
684
685void
686metaslab_group_destroy(metaslab_group_t *mg)
687{
688	ASSERT(mg->mg_prev == NULL);
689	ASSERT(mg->mg_next == NULL);
690	/*
691	 * We may have gone below zero with the activation count
692	 * either because we never activated in the first place or
693	 * because we're done, and possibly removing the vdev.
694	 */
695	ASSERT(mg->mg_activation_count <= 0);
696
697	taskq_destroy(mg->mg_taskq);
698	avl_destroy(&mg->mg_metaslab_tree);
699	mutex_destroy(&mg->mg_lock);
700	refcount_destroy(&mg->mg_alloc_queue_depth);
701	kmem_free(mg, sizeof (metaslab_group_t));
702}
703
704void
705metaslab_group_activate(metaslab_group_t *mg)
706{
707	metaslab_class_t *mc = mg->mg_class;
708	metaslab_group_t *mgprev, *mgnext;
709
710	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
711
712	ASSERT(mc->mc_rotor != mg);
713	ASSERT(mg->mg_prev == NULL);
714	ASSERT(mg->mg_next == NULL);
715	ASSERT(mg->mg_activation_count <= 0);
716
717	if (++mg->mg_activation_count <= 0)
718		return;
719
720	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
721	metaslab_group_alloc_update(mg);
722
723	if ((mgprev = mc->mc_rotor) == NULL) {
724		mg->mg_prev = mg;
725		mg->mg_next = mg;
726	} else {
727		mgnext = mgprev->mg_next;
728		mg->mg_prev = mgprev;
729		mg->mg_next = mgnext;
730		mgprev->mg_next = mg;
731		mgnext->mg_prev = mg;
732	}
733	mc->mc_rotor = mg;
734	metaslab_class_minblocksize_update(mc);
735}
736
737void
738metaslab_group_passivate(metaslab_group_t *mg)
739{
740	metaslab_class_t *mc = mg->mg_class;
741	metaslab_group_t *mgprev, *mgnext;
742
743	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
744
745	if (--mg->mg_activation_count != 0) {
746		ASSERT(mc->mc_rotor != mg);
747		ASSERT(mg->mg_prev == NULL);
748		ASSERT(mg->mg_next == NULL);
749		ASSERT(mg->mg_activation_count < 0);
750		return;
751	}
752
753	taskq_wait(mg->mg_taskq);
754	metaslab_group_alloc_update(mg);
755
756	mgprev = mg->mg_prev;
757	mgnext = mg->mg_next;
758
759	if (mg == mgnext) {
760		mc->mc_rotor = NULL;
761	} else {
762		mc->mc_rotor = mgnext;
763		mgprev->mg_next = mgnext;
764		mgnext->mg_prev = mgprev;
765	}
766
767	mg->mg_prev = NULL;
768	mg->mg_next = NULL;
769	metaslab_class_minblocksize_update(mc);
770}
771
772boolean_t
773metaslab_group_initialized(metaslab_group_t *mg)
774{
775	vdev_t *vd = mg->mg_vd;
776	vdev_stat_t *vs = &vd->vdev_stat;
777
778	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
779}
780
781uint64_t
782metaslab_group_get_space(metaslab_group_t *mg)
783{
784	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
785}
786
787void
788metaslab_group_histogram_verify(metaslab_group_t *mg)
789{
790	uint64_t *mg_hist;
791	vdev_t *vd = mg->mg_vd;
792	uint64_t ashift = vd->vdev_ashift;
793	int i;
794
795	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
796		return;
797
798	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
799	    KM_SLEEP);
800
801	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
802	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
803
804	for (int m = 0; m < vd->vdev_ms_count; m++) {
805		metaslab_t *msp = vd->vdev_ms[m];
806
807		if (msp->ms_sm == NULL)
808			continue;
809
810		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
811			mg_hist[i + ashift] +=
812			    msp->ms_sm->sm_phys->smp_histogram[i];
813	}
814
815	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
816		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
817
818	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
819}
820
821static void
822metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
823{
824	metaslab_class_t *mc = mg->mg_class;
825	uint64_t ashift = mg->mg_vd->vdev_ashift;
826
827	ASSERT(MUTEX_HELD(&msp->ms_lock));
828	if (msp->ms_sm == NULL)
829		return;
830
831	mutex_enter(&mg->mg_lock);
832	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
833		mg->mg_histogram[i + ashift] +=
834		    msp->ms_sm->sm_phys->smp_histogram[i];
835		mc->mc_histogram[i + ashift] +=
836		    msp->ms_sm->sm_phys->smp_histogram[i];
837	}
838	mutex_exit(&mg->mg_lock);
839}
840
841void
842metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
843{
844	metaslab_class_t *mc = mg->mg_class;
845	uint64_t ashift = mg->mg_vd->vdev_ashift;
846
847	ASSERT(MUTEX_HELD(&msp->ms_lock));
848	if (msp->ms_sm == NULL)
849		return;
850
851	mutex_enter(&mg->mg_lock);
852	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
853		ASSERT3U(mg->mg_histogram[i + ashift], >=,
854		    msp->ms_sm->sm_phys->smp_histogram[i]);
855		ASSERT3U(mc->mc_histogram[i + ashift], >=,
856		    msp->ms_sm->sm_phys->smp_histogram[i]);
857
858		mg->mg_histogram[i + ashift] -=
859		    msp->ms_sm->sm_phys->smp_histogram[i];
860		mc->mc_histogram[i + ashift] -=
861		    msp->ms_sm->sm_phys->smp_histogram[i];
862	}
863	mutex_exit(&mg->mg_lock);
864}
865
866static void
867metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
868{
869	ASSERT(msp->ms_group == NULL);
870	mutex_enter(&mg->mg_lock);
871	msp->ms_group = mg;
872	msp->ms_weight = 0;
873	avl_add(&mg->mg_metaslab_tree, msp);
874	mutex_exit(&mg->mg_lock);
875
876	mutex_enter(&msp->ms_lock);
877	metaslab_group_histogram_add(mg, msp);
878	mutex_exit(&msp->ms_lock);
879}
880
881static void
882metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
883{
884	mutex_enter(&msp->ms_lock);
885	metaslab_group_histogram_remove(mg, msp);
886	mutex_exit(&msp->ms_lock);
887
888	mutex_enter(&mg->mg_lock);
889	ASSERT(msp->ms_group == mg);
890	avl_remove(&mg->mg_metaslab_tree, msp);
891	msp->ms_group = NULL;
892	mutex_exit(&mg->mg_lock);
893}
894
895static void
896metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
897{
898	/*
899	 * Although in principle the weight can be any value, in
900	 * practice we do not use values in the range [1, 511].
901	 */
902	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
903	ASSERT(MUTEX_HELD(&msp->ms_lock));
904
905	mutex_enter(&mg->mg_lock);
906	ASSERT(msp->ms_group == mg);
907	avl_remove(&mg->mg_metaslab_tree, msp);
908	msp->ms_weight = weight;
909	avl_add(&mg->mg_metaslab_tree, msp);
910	mutex_exit(&mg->mg_lock);
911}
912
913/*
914 * Calculate the fragmentation for a given metaslab group. We can use
915 * a simple average here since all metaslabs within the group must have
916 * the same size. The return value will be a value between 0 and 100
917 * (inclusive), or ZFS_FRAG_INVALID if fewer than half of the metaslabs in this
918 * group have a fragmentation metric.
919 */
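/*
 * For example, a vdev with 200 metaslabs needs more than 100 of them to
 * have a valid metric; if 150 are valid and their metrics sum to 6000,
 * the group's fragmentation is 6000 / 150 = 40.
 */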
920uint64_t
921metaslab_group_fragmentation(metaslab_group_t *mg)
922{
923	vdev_t *vd = mg->mg_vd;
924	uint64_t fragmentation = 0;
925	uint64_t valid_ms = 0;
926
927	for (int m = 0; m < vd->vdev_ms_count; m++) {
928		metaslab_t *msp = vd->vdev_ms[m];
929
930		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
931			continue;
932
933		valid_ms++;
934		fragmentation += msp->ms_fragmentation;
935	}
936
937	if (valid_ms <= vd->vdev_ms_count / 2)
938		return (ZFS_FRAG_INVALID);
939
940	fragmentation /= valid_ms;
941	ASSERT3U(fragmentation, <=, 100);
942	return (fragmentation);
943}
944
945/*
946 * Determine if a given metaslab group should skip allocations. A metaslab
947 * group should avoid allocations if its free capacity is less than the
948 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
949 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
950 * that can still handle allocations. If the allocation throttle is enabled
951 * then we skip allocations to devices that have reached their maximum
952 * allocation queue depth unless the selected metaslab group is the only
953 * eligible group remaining.
954 */
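/*
 * For example, with the throttle enabled, a group whose allocation queue
 * depth has reached mg_max_alloc_queue_depth is skipped only if some later
 * group in the rotor is below its own limit and still has free space;
 * otherwise we allocate here rather than fail the allocation outright.
 */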
955static boolean_t
956metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
957    uint64_t psize)
958{
959	spa_t *spa = mg->mg_vd->vdev_spa;
960	metaslab_class_t *mc = mg->mg_class;
961
962	/*
963	 * We can only consider skipping this metaslab group if it's
964	 * in the normal metaslab class and there are other metaslab
965	 * groups to select from. Otherwise, we always consider it eligible
966	 * for allocations.
967	 */
968	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
969		return (B_TRUE);
970
971	/*
972	 * If the metaslab group's mg_allocatable flag is set (see comments
973	 * in metaslab_group_alloc_update() for more information) and
974	 * the allocation throttle is disabled then allow allocations to this
975	 * device. However, if the allocation throttle is enabled then
976	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
977	 * to determine if we should allow allocations to this metaslab group.
978	 * If all metaslab groups are no longer considered allocatable
979	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
980	 * gang block size then we allow allocations on this metaslab group
981	 * regardless of the mg_allocatable or throttle settings.
982	 */
983	if (mg->mg_allocatable) {
984		metaslab_group_t *mgp;
985		int64_t qdepth;
986		uint64_t qmax = mg->mg_max_alloc_queue_depth;
987
988		if (!mc->mc_alloc_throttle_enabled)
989			return (B_TRUE);
990
991		/*
992		 * If this metaslab group does not have any free space, then
993		 * there is no point in looking further.
994		 */
995		if (mg->mg_no_free_space)
996			return (B_FALSE);
997
998		qdepth = refcount_count(&mg->mg_alloc_queue_depth);
999
1000		/*
1001		 * If this metaslab group is below its qmax or it's
1002		 * the only allocatable metaslab group, then attempt
1003		 * to allocate from it.
1004		 */
1005		if (qdepth < qmax || mc->mc_alloc_groups == 1)
1006			return (B_TRUE);
1007		ASSERT3U(mc->mc_alloc_groups, >, 1);
1008
1009		/*
1010		 * Since this metaslab group is at or over its qmax, we
1011		 * need to determine if there are metaslab groups after this
1012		 * one that might be able to handle this allocation. This is
1013		 * racy since we can't hold the locks for all metaslab
1014		 * groups at the same time when we make this check.
1015		 */
1016		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
1017			qmax = mgp->mg_max_alloc_queue_depth;
1018
1019			qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
1020
1021			/*
1022			 * If there is another metaslab group that
1023			 * might be able to handle the allocation, then
1024			 * we return false so that we skip this group.
1025			 */
1026			if (qdepth < qmax && !mgp->mg_no_free_space)
1027				return (B_FALSE);
1028		}
1029
1030		/*
1031		 * We didn't find another group to handle the allocation
1032		 * so we can't skip this metaslab group even though
1033		 * we are at or over our qmax.
1034		 */
1035		return (B_TRUE);
1036
1037	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
1038		return (B_TRUE);
1039	}
1040	return (B_FALSE);
1041}
1042
1043/*
1044 * ==========================================================================
1045 * Range tree callbacks
1046 * ==========================================================================
1047 */
1048
1049/*
1050 * Comparison function for the private size-ordered tree. Tree is sorted
1051 * by size, larger sizes at the end of the tree.
1052 */
1053static int
1054metaslab_rangesize_compare(const void *x1, const void *x2)
1055{
1056	const range_seg_t *r1 = x1;
1057	const range_seg_t *r2 = x2;
1058	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1059	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1060
1061	if (rs_size1 < rs_size2)
1062		return (-1);
1063	if (rs_size1 > rs_size2)
1064		return (1);
1065
1066	if (r1->rs_start < r2->rs_start)
1067		return (-1);
1068
1069	if (r1->rs_start > r2->rs_start)
1070		return (1);
1071
1072	return (0);
1073}
1074
1075/*
1076 * Create any block allocator specific components. The current allocators
1077 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
1078 */
1079static void
1080metaslab_rt_create(range_tree_t *rt, void *arg)
1081{
1082	metaslab_t *msp = arg;
1083
1084	ASSERT3P(rt->rt_arg, ==, msp);
1085	ASSERT(msp->ms_tree == NULL);
1086
1087	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
1088	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1089}
1090
1091/*
1092 * Destroy the block allocator specific components.
1093 */
1094static void
1095metaslab_rt_destroy(range_tree_t *rt, void *arg)
1096{
1097	metaslab_t *msp = arg;
1098
1099	ASSERT3P(rt->rt_arg, ==, msp);
1100	ASSERT3P(msp->ms_tree, ==, rt);
1101	ASSERT0(avl_numnodes(&msp->ms_size_tree));
1102
1103	avl_destroy(&msp->ms_size_tree);
1104}
1105
1106static void
1107metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
1108{
1109	metaslab_t *msp = arg;
1110
1111	ASSERT3P(rt->rt_arg, ==, msp);
1112	ASSERT3P(msp->ms_tree, ==, rt);
1113	VERIFY(!msp->ms_condensing);
1114	avl_add(&msp->ms_size_tree, rs);
1115}
1116
1117static void
1118metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
1119{
1120	metaslab_t *msp = arg;
1121
1122	ASSERT3P(rt->rt_arg, ==, msp);
1123	ASSERT3P(msp->ms_tree, ==, rt);
1124	VERIFY(!msp->ms_condensing);
1125	avl_remove(&msp->ms_size_tree, rs);
1126}
1127
1128static void
1129metaslab_rt_vacate(range_tree_t *rt, void *arg)
1130{
1131	metaslab_t *msp = arg;
1132
1133	ASSERT3P(rt->rt_arg, ==, msp);
1134	ASSERT3P(msp->ms_tree, ==, rt);
1135
1136	/*
1137	 * Normally one would walk the tree freeing nodes along the way.
1138	 * Since the nodes are shared with the range trees we can avoid
1139	 * walking all nodes and just reinitialize the avl tree. The nodes
1140	 * will be freed by the range tree, so we don't want to free them here.
1141	 */
1142	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
1143	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1144}
1145
1146static range_tree_ops_t metaslab_rt_ops = {
1147	metaslab_rt_create,
1148	metaslab_rt_destroy,
1149	metaslab_rt_add,
1150	metaslab_rt_remove,
1151	metaslab_rt_vacate
1152};
1153
1154/*
1155 * ==========================================================================
1156 * Common allocator routines
1157 * ==========================================================================
1158 */
1159
1160/*
1161 * Return the maximum contiguous segment within the metaslab.
1162 */
1163uint64_t
1164metaslab_block_maxsize(metaslab_t *msp)
1165{
1166	avl_tree_t *t = &msp->ms_size_tree;
1167	range_seg_t *rs;
1168
1169	if (t == NULL || (rs = avl_last(t)) == NULL)
1170		return (0ULL);
1171
1172	return (rs->rs_end - rs->rs_start);
1173}
1174
1175static range_seg_t *
1176metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
1177{
1178	range_seg_t *rs, rsearch;
1179	avl_index_t where;
1180
1181	rsearch.rs_start = start;
1182	rsearch.rs_end = start + size;
1183
1184	rs = avl_find(t, &rsearch, &where);
1185	if (rs == NULL) {
1186		rs = avl_nearest(t, where, AVL_AFTER);
1187	}
1188
1189	return (rs);
1190}
1191
1192/*
1193 * This is a helper function that can be used by the allocator to find
1194 * a suitable block to allocate. This will search the specified AVL
1195 * tree looking for a block that matches the specified criteria.
1196 */
1197static uint64_t
1198metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1199    uint64_t align)
1200{
1201	range_seg_t *rs = metaslab_block_find(t, *cursor, size);
1202
1203	while (rs != NULL) {
1204		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1205
1206		if (offset + size <= rs->rs_end) {
1207			*cursor = offset + size;
1208			return (offset);
1209		}
1210		rs = AVL_NEXT(t, rs);
1211	}
1212
1213	/*
1214	 * If we know we've searched the whole map (*cursor == 0), give up.
1215	 * Otherwise, reset the cursor to the beginning and try again.
1216	 */
1217	if (*cursor == 0)
1218		return (-1ULL);
1219
1220	*cursor = 0;
1221	return (metaslab_block_picker(t, cursor, size, align));
1222}
1223
1224/*
1225 * ==========================================================================
1226 * The first-fit block allocator
1227 * ==========================================================================
1228 */
1229static uint64_t
1230metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1231{
1232	/*
1233	 * Find the largest power of 2 block size that evenly divides the
1234	 * requested size. This is used to try to allocate blocks with similar
1235	 * alignment from the same area of the metaslab (i.e. same cursor
1236	 * bucket) but it does not guarantee that other allocations sizes
1237	 * bucket), but it does not guarantee that allocations of other
1238	 * sizes will not be placed in the same region.
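	/*
	 * For example, a 24K request yields an 8K alignment (the largest
	 * power of 2 that divides 24K), so the search starts at the 8K
	 * cursor bucket.
	 */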
1239	uint64_t align = size & -size;
1240	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1241	avl_tree_t *t = &msp->ms_tree->rt_root;
1242
1243	return (metaslab_block_picker(t, cursor, size, align));
1244}
1245
1246static metaslab_ops_t metaslab_ff_ops = {
1247	metaslab_ff_alloc
1248};
1249
1250/*
1251 * ==========================================================================
1252 * Dynamic block allocator -
1253 * Uses the first fit allocation scheme until space gets low and then
1254 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1255 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
1256 * ==========================================================================
1257 */
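/*
 * For example, with the default metaslab_df_alloc_threshold (128K) and
 * metaslab_df_free_pct (4), first-fit is abandoned as soon as the largest
 * free segment drops below 128K or less than 4% of the metaslab is free,
 * at which point allocations come from the size-sorted (best-fit) tree.
 */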
1258static uint64_t
1259metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1260{
1261	/*
1262	 * Find the largest power of 2 block size that evenly divides the
1263	 * requested size. This is used to try to allocate blocks with similar
1264	 * alignment from the same area of the metaslab (i.e. same cursor
1265	 * bucket), but it does not guarantee that allocations of other
1266	 * sizes will not be placed in the same region.
1267	 */
1268	uint64_t align = size & -size;
1269	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1270	range_tree_t *rt = msp->ms_tree;
1271	avl_tree_t *t = &rt->rt_root;
1272	uint64_t max_size = metaslab_block_maxsize(msp);
1273	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1274
1275	ASSERT(MUTEX_HELD(&msp->ms_lock));
1276	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1277
1278	if (max_size < size)
1279		return (-1ULL);
1280
1281	/*
1282	 * If we're running low on space switch to using the size
1283	 * sorted AVL tree (best-fit).
1284	 */
1285	if (max_size < metaslab_df_alloc_threshold ||
1286	    free_pct < metaslab_df_free_pct) {
1287		t = &msp->ms_size_tree;
1288		*cursor = 0;
1289	}
1290
1291	return (metaslab_block_picker(t, cursor, size, 1ULL));
1292}
1293
1294static metaslab_ops_t metaslab_df_ops = {
1295	metaslab_df_alloc
1296};
1297
1298/*
1299 * ==========================================================================
1300 * Cursor fit block allocator -
1301 * Select the largest region in the metaslab, set the cursor to the beginning
1302 * of the range and the cursor_end to the end of the range. As allocations
1303 * are made advance the cursor. Continue allocating from the cursor until
1304 * the range is exhausted and then find a new range.
1305 * ==========================================================================
1306 */
1307static uint64_t
1308metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1309{
1310	range_tree_t *rt = msp->ms_tree;
1311	avl_tree_t *t = &msp->ms_size_tree;
1312	uint64_t *cursor = &msp->ms_lbas[0];
1313	uint64_t *cursor_end = &msp->ms_lbas[1];
1314	uint64_t offset = 0;
1315
1316	ASSERT(MUTEX_HELD(&msp->ms_lock));
1317	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1318
1319	ASSERT3U(*cursor_end, >=, *cursor);
1320
1321	if ((*cursor + size) > *cursor_end) {
1322		range_seg_t *rs;
1323
1324		rs = avl_last(&msp->ms_size_tree);
1325		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1326			return (-1ULL);
1327
1328		*cursor = rs->rs_start;
1329		*cursor_end = rs->rs_end;
1330	}
1331
1332	offset = *cursor;
1333	*cursor += size;
1334
1335	return (offset);
1336}
1337
1338static metaslab_ops_t metaslab_cf_ops = {
1339	metaslab_cf_alloc
1340};
1341
1342/*
1343 * ==========================================================================
1344 * New dynamic fit allocator -
1345 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1346 * contiguous blocks. If no region is found then just use the largest segment
1347 * that remains.
1348 * ==========================================================================
1349 */
1350
1351/*
1352 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1353 * to request from the allocator.
1354 */
1355uint64_t metaslab_ndf_clump_shift = 4;
1356
1357static uint64_t
1358metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1359{
1360	avl_tree_t *t = &msp->ms_tree->rt_root;
1361	avl_index_t where;
1362	range_seg_t *rs, rsearch;
1363	uint64_t hbit = highbit64(size);
1364	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1365	uint64_t max_size = metaslab_block_maxsize(msp);
1366
1367	ASSERT(MUTEX_HELD(&msp->ms_lock));
1368	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1369
1370	if (max_size < size)
1371		return (-1ULL);
1372
1373	rsearch.rs_start = *cursor;
1374	rsearch.rs_end = *cursor + size;
1375
1376	rs = avl_find(t, &rsearch, &where);
1377	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
1378		t = &msp->ms_size_tree;
1379
1380		rsearch.rs_start = 0;
1381		rsearch.rs_end = MIN(max_size,
1382		    1ULL << (hbit + metaslab_ndf_clump_shift));
1383		rs = avl_find(t, &rsearch, &where);
1384		if (rs == NULL)
1385			rs = avl_nearest(t, where, AVL_AFTER);
1386		ASSERT(rs != NULL);
1387	}
1388
1389	if ((rs->rs_end - rs->rs_start) >= size) {
1390		*cursor = rs->rs_start + size;
1391		return (rs->rs_start);
1392	}
1393	return (-1ULL);
1394}
1395
1396static metaslab_ops_t metaslab_ndf_ops = {
1397	metaslab_ndf_alloc
1398};
1399
1400metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1401
1402/*
1403 * ==========================================================================
1404 * Metaslabs
1405 * ==========================================================================
1406 */
1407
1408/*
1409 * Wait for any in-progress metaslab loads to complete.
1410 */
1411void
1412metaslab_load_wait(metaslab_t *msp)
1413{
1414	ASSERT(MUTEX_HELD(&msp->ms_lock));
1415
1416	while (msp->ms_loading) {
1417		ASSERT(!msp->ms_loaded);
1418		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1419	}
1420}
1421
1422int
1423metaslab_load(metaslab_t *msp)
1424{
1425	int error = 0;
1426	boolean_t success = B_FALSE;
1427
1428	ASSERT(MUTEX_HELD(&msp->ms_lock));
1429	ASSERT(!msp->ms_loaded);
1430	ASSERT(!msp->ms_loading);
1431
1432	msp->ms_loading = B_TRUE;
1433
1434	/*
1435	 * If the space map has not been allocated yet, then treat
1436	 * all the space in the metaslab as free and add it to the
1437	 * ms_tree.
1438	 */
1439	if (msp->ms_sm != NULL)
1440		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
1441	else
1442		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
1443
1444	success = (error == 0);
1445	msp->ms_loading = B_FALSE;
1446
1447	if (success) {
1448		ASSERT3P(msp->ms_group, !=, NULL);
1449		msp->ms_loaded = B_TRUE;
1450
1451		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1452			range_tree_walk(msp->ms_defertree[t],
1453			    range_tree_remove, msp->ms_tree);
1454		}
1455		msp->ms_max_size = metaslab_block_maxsize(msp);
1456	}
1457	cv_broadcast(&msp->ms_load_cv);
1458	return (error);
1459}
1460
1461void
1462metaslab_unload(metaslab_t *msp)
1463{
1464	ASSERT(MUTEX_HELD(&msp->ms_lock));
1465	range_tree_vacate(msp->ms_tree, NULL, NULL);
1466	msp->ms_loaded = B_FALSE;
1467	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1468	msp->ms_max_size = 0;
1469}
1470
1471int
1472metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1473    metaslab_t **msp)
1474{
1475	vdev_t *vd = mg->mg_vd;
1476	objset_t *mos = vd->vdev_spa->spa_meta_objset;
1477	metaslab_t *ms;
1478	int error;
1479
1480	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1481	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1482	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1483	ms->ms_id = id;
1484	ms->ms_start = id << vd->vdev_ms_shift;
1485	ms->ms_size = 1ULL << vd->vdev_ms_shift;
1486
1487	/*
1488	 * We only open space map objects that already exist. All others
1489	 * will be opened when we finally allocate an object for it.
1490	 */
1491	if (object != 0) {
1492		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1493		    ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
1494
1495		if (error != 0) {
1496			kmem_free(ms, sizeof (metaslab_t));
1497			return (error);
1498		}
1499
1500		ASSERT(ms->ms_sm != NULL);
1501	}
1502
1503	/*
1504	 * We create the main range tree here, but we don't create the
1505	 * other range trees until metaslab_sync_done().  This serves
1506	 * two purposes: it allows metaslab_sync_done() to detect the
1507	 * addition of new space; and for debugging, it ensures that we'd
1508	 * data fault on any attempt to use this metaslab before it's ready.
1509	 */
1510	ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
1511	metaslab_group_add(mg, ms);
1512
1513	metaslab_set_fragmentation(ms);
1514
1515	/*
1516	 * If we're opening an existing pool (txg == 0) or creating
1517	 * a new one (txg == TXG_INITIAL), all space is available now.
1518	 * If we're adding space to an existing pool, the new space
1519	 * does not become available until after this txg has synced.
1520	 * The metaslab's weight will also be initialized when we sync
1521	 * out this txg. This ensures that we don't attempt to allocate
1522	 * from it before we have initialized it completely.
1523	 */
1524	if (txg <= TXG_INITIAL)
1525		metaslab_sync_done(ms, 0);
1526
1527	/*
1528	 * If metaslab_debug_load is set and we're initializing a metaslab
1529	 * that has an allocated space map object then load the its space
1530	 * map so that can verify frees.
1531	 */
1532	if (metaslab_debug_load && ms->ms_sm != NULL) {
1533		mutex_enter(&ms->ms_lock);
1534		VERIFY0(metaslab_load(ms));
1535		mutex_exit(&ms->ms_lock);
1536	}
1537
1538	if (txg != 0) {
1539		vdev_dirty(vd, 0, NULL, txg);
1540		vdev_dirty(vd, VDD_METASLAB, ms, txg);
1541	}
1542
1543	*msp = ms;
1544
1545	return (0);
1546}
1547
1548void
1549metaslab_fini(metaslab_t *msp)
1550{
1551	metaslab_group_t *mg = msp->ms_group;
1552
1553	metaslab_group_remove(mg, msp);
1554
1555	mutex_enter(&msp->ms_lock);
1556	VERIFY(msp->ms_group == NULL);
1557	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1558	    0, -msp->ms_size);
1559	space_map_close(msp->ms_sm);
1560
1561	metaslab_unload(msp);
1562	range_tree_destroy(msp->ms_tree);
1563	range_tree_destroy(msp->ms_freeingtree);
1564	range_tree_destroy(msp->ms_freedtree);
1565
1566	for (int t = 0; t < TXG_SIZE; t++) {
1567		range_tree_destroy(msp->ms_alloctree[t]);
1568	}
1569
1570	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1571		range_tree_destroy(msp->ms_defertree[t]);
1572	}
1573
1574	ASSERT0(msp->ms_deferspace);
1575
1576	mutex_exit(&msp->ms_lock);
1577	cv_destroy(&msp->ms_load_cv);
1578	mutex_destroy(&msp->ms_lock);
1579
1580	kmem_free(msp, sizeof (metaslab_t));
1581}
1582
1583#define	FRAGMENTATION_TABLE_SIZE	17
1584
1585/*
1586 * This table defines a segment size based fragmentation metric that will
1587 * allow each metaslab to derive its own fragmentation value. This is done
1588 * by calculating the space in each bucket of the spacemap histogram and
1589 * multiplying that by the fragmentation metric in this table. Doing
1590 * this for all buckets and dividing it by the total amount of free
1591 * space in this metaslab (i.e. the total free space in all buckets) gives
1592 * us the fragmentation metric. This means that a high fragmentation metric
1593 * equates to most of the free space being comprised of small segments.
1594 * Conversely, if the metric is low, then most of the free space is in
1595 * large segments. A 10% change in fragmentation equates to approximately
1596 * double the number of segments.
1597 *
1598 * This table defines 0% fragmented space using 16MB segments. Testing has
1599 * shown that segments that are greater than or equal to 16MB do not suffer
1600 * from drastic performance problems. Using this value, we derive the rest
1601 * of the table. Since the fragmentation value is never stored on disk, it
1602 * is possible to change these calculations in the future.
1603 */
1604int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1605	100,	/* 512B	*/
1606	100,	/* 1K	*/
1607	98,	/* 2K	*/
1608	95,	/* 4K	*/
1609	90,	/* 8K	*/
1610	80,	/* 16K	*/
1611	70,	/* 32K	*/
1612	60,	/* 64K	*/
1613	50,	/* 128K	*/
1614	40,	/* 256K	*/
1615	30,	/* 512K	*/
1616	20,	/* 1M	*/
1617	15,	/* 2M	*/
1618	10,	/* 4M	*/
1619	5,	/* 8M	*/
1620	0	/* 16M	*/
1621};
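/*
 * For example, a metaslab whose free space is 10MB of 64K segments
 * (factor 60) plus 10MB of 4M segments (factor 10) has a fragmentation of
 * (10M * 60 + 10M * 10) / 20M = 35.
 */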
1622
1623/*
1624 * Calculate the metaslab's fragmentation metric and store it in
1625 * ms_fragmentation. A value of ZFS_FRAG_INVALID means that the metaslab
1626 * has not been upgraded and does not support this metric. Otherwise,
1627 * the value should be in the range [0, 100].
1628 */
1629static void
1630metaslab_set_fragmentation(metaslab_t *msp)
1631{
1632	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1633	uint64_t fragmentation = 0;
1634	uint64_t total = 0;
1635	boolean_t feature_enabled = spa_feature_is_enabled(spa,
1636	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
1637
1638	if (!feature_enabled) {
1639		msp->ms_fragmentation = ZFS_FRAG_INVALID;
1640		return;
1641	}
1642
1643	/*
1644	 * A null space map means that the entire metaslab is free
1645	 * and thus is not fragmented.
1646	 */
1647	if (msp->ms_sm == NULL) {
1648		msp->ms_fragmentation = 0;
1649		return;
1650	}
1651
1652	/*
1653	 * If this metaslab's space map has not been upgraded, flag it
1654	 * so that we upgrade next time we encounter it.
1655	 */
1656	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1657		uint64_t txg = spa_syncing_txg(spa);
1658		vdev_t *vd = msp->ms_group->mg_vd;
1659
1660		/*
1661		 * If we've reached the final dirty txg, then we must
1662		 * be shutting down the pool. We don't want to dirty
1663		 * any data past this point so skip setting the condense
1664		 * flag. We can retry this action the next time the pool
1665		 * is imported.
1666		 */
1667		if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
1668			msp->ms_condense_wanted = B_TRUE;
1669			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1670			spa_dbgmsg(spa, "txg %llu, requesting force condense: "
1671			    "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
1672			    vd->vdev_id);
1673		}
1674		msp->ms_fragmentation = ZFS_FRAG_INVALID;
1675		return;
1676	}
1677
1678	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1679		uint64_t space = 0;
1680		uint8_t shift = msp->ms_sm->sm_shift;
1681
1682		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1683		    FRAGMENTATION_TABLE_SIZE - 1);
1684
1685		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1686			continue;
1687
1688		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1689		total += space;
1690
1691		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1692		fragmentation += space * zfs_frag_table[idx];
1693	}
1694
1695	if (total > 0)
1696		fragmentation /= total;
1697	ASSERT3U(fragmentation, <=, 100);
1698
1699	msp->ms_fragmentation = fragmentation;
1700}
1701
1702/*
1703 * Compute a weight -- a selection preference value -- for the given metaslab.
1704 * This is based on the amount of free space, the level of fragmentation,
1705 * the LBA range, and whether the metaslab is loaded.
1706 */
1707static uint64_t
1708metaslab_space_weight(metaslab_t *msp)
1709{
1710	metaslab_group_t *mg = msp->ms_group;
1711	vdev_t *vd = mg->mg_vd;
1712	uint64_t weight, space;
1713
1714	ASSERT(MUTEX_HELD(&msp->ms_lock));
1715	ASSERT(!vd->vdev_removing);
1716
1717	/*
1718	 * The baseline weight is the metaslab's free space.
1719	 */
1720	space = msp->ms_size - space_map_allocated(msp->ms_sm);
1721
1722	if (metaslab_fragmentation_factor_enabled &&
1723	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1724		/*
1725		 * Use the fragmentation information to inversely scale
1726		 * down the baseline weight. We need to ensure that we
1727		 * don't exclude this metaslab completely when it's 100%
1728		 * fragmented. To avoid this we reduce the fragmented value
1729		 * by 1.
1730		 */
1731		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1732
1733		/*
1734		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1735		 * this metaslab again. The fragmentation metric may have
1736		 * decreased the space to something smaller than
1737		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1738		 * so that we can consume any remaining space.
1739		 */
1740		if (space > 0 && space < SPA_MINBLOCKSIZE)
1741			space = SPA_MINBLOCKSIZE;
1742	}
1743	weight = space;
1744
1745	/*
1746	 * Modern disks have uniform bit density and constant angular velocity.
1747	 * Therefore, the outer recording zones are faster (higher bandwidth)
1748	 * than the inner zones by the ratio of outer to inner track diameter,
1749	 * which is typically around 2:1.  We account for this by assigning
1750	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1751	 * In effect, this means that we'll select the metaslab with the most
1752	 * free bandwidth rather than simply the one with the most free space.
1753	 */
1754	if (metaslab_lba_weighting_enabled) {
1755		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1756		ASSERT(weight >= space && weight <= 2 * space);
1757	}
1758
1759	/*
1760	 * If this metaslab is one we're actively using, adjust its
1761	 * weight to make it preferable to any inactive metaslab so
1762	 * we'll polish it off. If the fragmentation on this metaslab
1763	 * has exceeded our threshold, then don't mark it active.
1764	 */
1765	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1766	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1767		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1768	}
1769
1770	WEIGHT_SET_SPACEBASED(weight);
1771	return (weight);
1772}
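
/*
 * A quick sketch of the LBA weighting above: with vdev_ms_count == 200,
 * metaslab 0 gets 2 * space - (0 * space) / 200 = 2.0 * space,
 * metaslab 100 gets 2 * space - (100 * space) / 200 = 1.5 * space, and
 * metaslab 199 gets 2 * space - (199 * space) / 200 ~= 1.005 * space,
 * approximating the ~2:1 outer-to-inner bandwidth ratio noted above.
 */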
1773
1774/*
1775 * Return the weight of the specified metaslab, according to the segment-based
1776 * weighting algorithm. The metaslab must be loaded. This function can
1777 * be called within a sync pass since it relies only on the metaslab's
1778 * range tree which is always accurate when the metaslab is loaded.
1779 */
1780static uint64_t
1781metaslab_weight_from_range_tree(metaslab_t *msp)
1782{
1783	uint64_t weight = 0;
1784	uint32_t segments = 0;
1785
1786	ASSERT(msp->ms_loaded);
1787
1788	for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
1789	    i--) {
1790		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
1791		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1792
1793		segments <<= 1;
1794		segments += msp->ms_tree->rt_histogram[i];
1795
1796		/*
1797		 * The range tree provides more precision than the space map
1798		 * and must be downgraded so that all values fit within the
1799		 * space map's histogram. This allows us to compare loaded
1800		 * vs. unloaded metaslabs to determine which metaslab is
1801		 * considered "best".
1802		 */
1803		if (i > max_idx)
1804			continue;
1805
1806		if (segments != 0) {
1807			WEIGHT_SET_COUNT(weight, segments);
1808			WEIGHT_SET_INDEX(weight, i);
1809			WEIGHT_SET_ACTIVE(weight, 0);
1810			break;
1811		}
1812	}
1813	return (weight);
1814}
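
/*
 * Sketch of the downgrade above (assuming SPACE_MAP_HISTOGRAM_SIZE of 32
 * and ashift of 12, so max_idx = 43): a single free segment counted in
 * range tree bucket 45 is doubled once per step down and ends up encoded
 * in the weight as 4 segments at index 43, matching the coarser
 * resolution of the on-disk space map histogram.
 */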
1815
1816/*
1817 * Calculate the weight based on the on-disk histogram. This should only
1818 * be called after a sync pass has completely finished since the on-disk
1819 * information is updated in metaslab_sync().
1820 */
1821static uint64_t
1822metaslab_weight_from_spacemap(metaslab_t *msp)
1823{
1824	uint64_t weight = 0;
1825
1826	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
1827		if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
1828			WEIGHT_SET_COUNT(weight,
1829			    msp->ms_sm->sm_phys->smp_histogram[i]);
1830			WEIGHT_SET_INDEX(weight, i +
1831			    msp->ms_sm->sm_shift);
1832			WEIGHT_SET_ACTIVE(weight, 0);
1833			break;
1834		}
1835	}
1836	return (weight);
1837}
1838
1839/*
1840 * Compute a segment-based weight for the specified metaslab. The weight
1841 * is determined by the highest bucket in the histogram. The information
1842 * for the highest bucket is encoded into the weight value.
1843 */
1844static uint64_t
1845metaslab_segment_weight(metaslab_t *msp)
1846{
1847	metaslab_group_t *mg = msp->ms_group;
1848	uint64_t weight = 0;
1849	uint8_t shift = mg->mg_vd->vdev_ashift;
1850
1851	ASSERT(MUTEX_HELD(&msp->ms_lock));
1852
1853	/*
1854	 * The metaslab is completely free.
1855	 */
1856	if (space_map_allocated(msp->ms_sm) == 0) {
1857		int idx = highbit64(msp->ms_size) - 1;
1858		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1859
1860		if (idx < max_idx) {
1861			WEIGHT_SET_COUNT(weight, 1ULL);
1862			WEIGHT_SET_INDEX(weight, idx);
1863		} else {
1864			WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
1865			WEIGHT_SET_INDEX(weight, max_idx);
1866		}
1867		WEIGHT_SET_ACTIVE(weight, 0);
1868		ASSERT(!WEIGHT_IS_SPACEBASED(weight));
1869
1870		return (weight);
1871	}
1872
1873	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1874
1875	/*
1876	 * If the metaslab is fully allocated then just make the weight 0.
1877	 */
1878	if (space_map_allocated(msp->ms_sm) == msp->ms_size)
1879		return (0);
1880	/*
1881	 * If the metaslab is already loaded, then use the range tree to
1882	 * determine the weight. Otherwise, we rely on the space map information
1883	 * to generate the weight.
1884	 */
1885	if (msp->ms_loaded) {
1886		weight = metaslab_weight_from_range_tree(msp);
1887	} else {
1888		weight = metaslab_weight_from_spacemap(msp);
1889	}
1890
1891	/*
1892	 * If the metaslab was active the last time we calculated its weight
1893	 * then keep it active. We want to consume the entire region that
1894	 * is associated with this weight.
1895	 */
1896	if (msp->ms_activation_weight != 0 && weight != 0)
1897		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
1898	return (weight);
1899}
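
/*
 * Example of the free-metaslab case above (assuming
 * SPACE_MAP_HISTOGRAM_SIZE of 32): for an 8GB metaslab on an ashift=12
 * vdev, idx = highbit64(8GB) - 1 = 33 and max_idx = 32 + 12 - 1 = 43.
 * Since idx < max_idx, the weight encodes a count of 1 at index 33,
 * i.e. one free segment in the range [2^33, 2^34).
 */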
1900
1901/*
1902 * Determine if we should attempt to allocate from this metaslab. If the
1903 * metaslab has a maximum size then we can quickly determine if the desired
1904 * allocation size can be satisfied. Otherwise, if we're using segment-based
1905 * weighting then we can determine the maximum allocation that this metaslab
1906 * can accommodate based on the index encoded in the weight. If we're using
1907 * space-based weights then rely on the entire weight (excluding the weight
1908 * type bit).
1909 */
1910boolean_t
1911metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
1912{
1913	boolean_t should_allocate;
1914
1915	if (msp->ms_max_size != 0)
1916		return (msp->ms_max_size >= asize);
1917
1918	if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
1919		/*
1920		 * The metaslab segment weight indicates segments in the
1921		 * range [2^i, 2^(i+1)), where i is the index in the weight.
1922		 * Since the asize might be in the middle of the range, we
1923		 * should attempt the allocation if asize < 2^(i+1).
1924		 */
1925		should_allocate = (asize <
1926		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
1927	} else {
1928		should_allocate = (asize <=
1929		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
1930	}
1931	return (should_allocate);
1932}
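
/*
 * For example (a sketch of the segment-based branch above): when
 * ms_max_size is 0 and the weight's index is 15, the metaslab advertises
 * free segments in the range [2^15, 2^16), so a request with asize < 64K
 * is attempted and anything larger is skipped.
 */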
1933
1934static uint64_t
1935metaslab_weight(metaslab_t *msp)
1936{
1937	vdev_t *vd = msp->ms_group->mg_vd;
1938	spa_t *spa = vd->vdev_spa;
1939	uint64_t weight;
1940
1941	ASSERT(MUTEX_HELD(&msp->ms_lock));
1942
1943	/*
1944	 * This vdev is in the process of being removed so there is nothing
1945	 * for us to do here.
1946	 */
1947	if (vd->vdev_removing) {
1948		ASSERT0(space_map_allocated(msp->ms_sm));
1949		ASSERT0(vd->vdev_ms_shift);
1950		return (0);
1951	}
1952
1953	metaslab_set_fragmentation(msp);
1954
1955	/*
1956	 * Update the maximum size if the metaslab is loaded. This will
1957	 * ensure that we get an accurate maximum size if newly freed space
1958	 * has been added back into the free tree.
1959	 */
1960	if (msp->ms_loaded)
1961		msp->ms_max_size = metaslab_block_maxsize(msp);
1962
1963	/*
1964	 * Segment-based weighting requires space map histogram support.
1965	 */
1966	if (zfs_metaslab_segment_weight_enabled &&
1967	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
1968	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
1969	    sizeof (space_map_phys_t))) {
1970		weight = metaslab_segment_weight(msp);
1971	} else {
1972		weight = metaslab_space_weight(msp);
1973	}
1974	return (weight);
1975}
1976
1977static int
1978metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1979{
1980	ASSERT(MUTEX_HELD(&msp->ms_lock));
1981
1982	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1983		metaslab_load_wait(msp);
1984		if (!msp->ms_loaded) {
1985			int error = metaslab_load(msp);
1986			if (error) {
1987				metaslab_group_sort(msp->ms_group, msp, 0);
1988				return (error);
1989			}
1990		}
1991
1992		msp->ms_activation_weight = msp->ms_weight;
1993		metaslab_group_sort(msp->ms_group, msp,
1994		    msp->ms_weight | activation_weight);
1995	}
1996	ASSERT(msp->ms_loaded);
1997	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1998
1999	return (0);
2000}
2001
2002static void
2003metaslab_passivate(metaslab_t *msp, uint64_t weight)
2004{
2005	uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
2006
2007	/*
2008	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
2009	 * this metaslab again.  In that case, it had better be empty,
2010	 * or we would be leaving space on the table.
2011	 */
2012	ASSERT(size >= SPA_MINBLOCKSIZE ||
2013	    range_tree_space(msp->ms_tree) == 0);
2014	ASSERT0(weight & METASLAB_ACTIVE_MASK);
2015
2016	msp->ms_activation_weight = 0;
2017	metaslab_group_sort(msp->ms_group, msp, weight);
2018	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
2019}
2020
2021/*
2022 * Segment-based metaslabs are activated once and remain active until
2023 * we either fail an allocation attempt (similar to space-based metaslabs)
2024 * or have exhausted the free space in zfs_metaslab_switch_threshold
2025 * buckets since the metaslab was activated. This function checks to see
2026 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
2027 * metaslab and passivates it proactively. This allows us to select a
2028 * metaslab with a larger contiguous region, if any remain within this
2029 * metaslab group. If we're in sync pass > 1, then we continue using this
2030 * metaslab so that we don't dirty more blocks and cause more sync passes.
2031 */
2032void
2033metaslab_segment_may_passivate(metaslab_t *msp)
2034{
2035	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2036
2037	if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
2038		return;
2039
2040	/*
2041	 * Since we are in the middle of a sync pass, the most accurate
2042	 * information that is accessible to us is the in-core range tree
2043	 * histogram; calculate the new weight based on that information.
2044	 */
2045	uint64_t weight = metaslab_weight_from_range_tree(msp);
2046	int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
2047	int current_idx = WEIGHT_GET_INDEX(weight);
2048
2049	if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
2050		metaslab_passivate(msp, weight);
2051}
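
/*
 * For example (illustrative values): if the metaslab was activated with
 * a weight index of 20 (segments in the 1MB-2MB range) and
 * zfs_metaslab_switch_threshold is 2, it is proactively passivated once
 * the largest remaining bucket drops to index 18 (256KB-512KB) or below.
 */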
2052
2053static void
2054metaslab_preload(void *arg)
2055{
2056	metaslab_t *msp = arg;
2057	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2058
2059	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
2060
2061	mutex_enter(&msp->ms_lock);
2062	metaslab_load_wait(msp);
2063	if (!msp->ms_loaded)
2064		(void) metaslab_load(msp);
2065	msp->ms_selected_txg = spa_syncing_txg(spa);
2066	mutex_exit(&msp->ms_lock);
2067}
2068
2069static void
2070metaslab_group_preload(metaslab_group_t *mg)
2071{
2072	spa_t *spa = mg->mg_vd->vdev_spa;
2073	metaslab_t *msp;
2074	avl_tree_t *t = &mg->mg_metaslab_tree;
2075	int m = 0;
2076
2077	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
2078		taskq_wait(mg->mg_taskq);
2079		return;
2080	}
2081
2082	mutex_enter(&mg->mg_lock);
2083	/*
2084	 * Load the next potential metaslabs
2085	 */
2086	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
2087		/*
2088		 * We preload only the maximum number of metaslabs specified
2089		 * by metaslab_preload_limit. If a metaslab is being forced
2090		 * to condense then we preload it too. This will ensure
2091		 * that force condensing happens in the next txg.
2092		 */
2093		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
2094			continue;
2095		}
2096
2097		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
2098		    msp, TQ_SLEEP) != 0);
2099	}
2100	mutex_exit(&mg->mg_lock);
2101}
2102
2103/*
2104 * Determine if the space map's on-disk footprint is past our tolerance
2105 * for inefficiency. We would like to use the following criteria to make
2106 * our decision:
2107 *
2108 * 1. The size of the space map object should not dramatically increase as a
2109 * result of writing out the free space range tree.
2110 *
2111 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
2112 * times the size of the free space range tree representation
2113 * (e.g. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
2114 *
2115 * 3. The on-disk size of the space map should actually decrease.
2116 *
2117 * Checking the first condition is tricky since we don't want to walk
2118 * the entire AVL tree calculating the estimated on-disk size. Instead we
2119 * use the size-ordered range tree in the metaslab and calculate the
2120 * size required to write out the largest segment in our free tree. If the
2121 * size required to represent that segment on disk is larger than the space
2122 * map object then we avoid condensing this map.
2123 *
2124 * To determine the second criterion we use a best-case estimate and assume
2125 * each segment can be represented on-disk as a single 64-bit entry. We refer
2126 * to this best-case estimate as the space map's minimal form.
2127 *
2128 * Unfortunately, we cannot compute the on-disk size of the space map in this
2129 * context because we cannot accurately compute the effects of compression, etc.
2130 * Instead, we apply the heuristic described in the block comment for
2131 * zfs_metaslab_condense_block_threshold - we only condense if the space used
2132 * is greater than a threshold number of blocks.
2133 */
2134static boolean_t
2135metaslab_should_condense(metaslab_t *msp)
2136{
2137	space_map_t *sm = msp->ms_sm;
2138	range_seg_t *rs;
2139	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
2140	dmu_object_info_t doi;
2141	uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
2142
2143	ASSERT(MUTEX_HELD(&msp->ms_lock));
2144	ASSERT(msp->ms_loaded);
2145
2146	/*
2147	 * Use the ms_size_tree range tree, which is ordered by size, to
2148	 * obtain the largest segment in the free tree. We always condense
2149	 * metaslabs that are empty and metaslabs for which a condense
2150	 * request has been made.
2151	 */
2152	rs = avl_last(&msp->ms_size_tree);
2153	if (rs == NULL || msp->ms_condense_wanted)
2154		return (B_TRUE);
2155
2156	/*
2157	 * Calculate the number of 64-bit entries this segment would
2158	 * require when written to disk. If this single segment would be
2159	 * larger on-disk than the entire current on-disk structure, then
2160	 * clearly condensing will increase the on-disk structure size.
2161	 */
2162	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
2163	entries = size / (MIN(size, SM_RUN_MAX));
2164	segsz = entries * sizeof (uint64_t);
2165
2166	optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
2167	object_size = space_map_length(msp->ms_sm);
2168
2169	dmu_object_info_from_db(sm->sm_dbuf, &doi);
2170	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
2171
2172	return (segsz <= object_size &&
2173	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
2174	    object_size > zfs_metaslab_condense_block_threshold * record_size);
2175}
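
/*
 * A worked example of the checks above (illustrative settings:
 * zfs_condense_pct = 200, zfs_metaslab_condense_block_threshold = 4):
 * on an ashift=12 vdev with a 4K space map block size, record_size is 4K
 * and the block threshold is 16K. With 1000 segments in the free tree,
 * optimal_size = 1000 * 8 bytes = 8000 bytes, so the on-disk object must
 * be at least 16000 bytes. A 32K space map that also satisfies
 * segsz <= object_size would therefore be condensed.
 */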
2176
2177/*
2178 * Condense the on-disk space map representation to its minimized form.
2179 * The minimized form consists of a small number of allocations followed by
2180 * the entries of the free range tree.
2181 */
2182static void
2183metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
2184{
2185	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2186	range_tree_t *condense_tree;
2187	space_map_t *sm = msp->ms_sm;
2188
2189	ASSERT(MUTEX_HELD(&msp->ms_lock));
2190	ASSERT3U(spa_sync_pass(spa), ==, 1);
2191	ASSERT(msp->ms_loaded);
2192
2193
2194	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
2195	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
2196	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
2197	    msp->ms_group->mg_vd->vdev_spa->spa_name,
2198	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
2199	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
2200
2201	msp->ms_condense_wanted = B_FALSE;
2202
2203	/*
2204	 * Create a range tree that is 100% allocated. We remove segments
2205	 * that have been freed in this txg, any deferred frees that exist,
2206	 * and any allocation in the future. Removing segments should be
2207	 * a relatively inexpensive operation since we expect these trees to
2208	 * have a small number of nodes.
2209	 */
2210	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
2211	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
2212
2213	/*
2214	 * Remove what's been freed in this txg from the condense_tree.
2215	 * Since we're in sync_pass 1, we know that all the frees from
2216	 * this txg are in the freeingtree.
2217	 */
2218	range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);
2219
2220	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2221		range_tree_walk(msp->ms_defertree[t],
2222		    range_tree_remove, condense_tree);
2223	}
2224
2225	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2226		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
2227		    range_tree_remove, condense_tree);
2228	}
2229
2230	/*
2231	 * We're about to drop the metaslab's lock thus allowing
2232	 * other consumers to change its contents. Set the
2233	 * metaslab's ms_condensing flag to ensure that
2234	 * allocations on this metaslab do not occur while we're
2235	 * in the middle of committing it to disk. This is only critical
2236	 * for the ms_tree as all other range trees use per txg
2237	 * views of their content.
2238	 */
2239	msp->ms_condensing = B_TRUE;
2240
2241	mutex_exit(&msp->ms_lock);
2242	space_map_truncate(sm, tx);
2243	mutex_enter(&msp->ms_lock);
2244
2245	/*
2246	 * While we would ideally like to create a space map representation
2247	 * that consists only of allocation records, doing so can be
2248	 * prohibitively expensive because the in-core free tree can be
2249	 * large, and therefore computationally expensive to subtract
2250	 * from the condense_tree. Instead we sync out two trees, a cheap
2251	 * allocation only tree followed by the in-core free tree. While not
2252	 * optimal, this is typically close to optimal, and much cheaper to
2253	 * compute.
2254	 */
2255	space_map_write(sm, condense_tree, SM_ALLOC, tx);
2256	range_tree_vacate(condense_tree, NULL, NULL);
2257	range_tree_destroy(condense_tree);
2258
2259	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
2260	msp->ms_condensing = B_FALSE;
2261}
2262
2263/*
2264 * Write a metaslab to disk in the context of the specified transaction group.
2265 */
2266void
2267metaslab_sync(metaslab_t *msp, uint64_t txg)
2268{
2269	metaslab_group_t *mg = msp->ms_group;
2270	vdev_t *vd = mg->mg_vd;
2271	spa_t *spa = vd->vdev_spa;
2272	objset_t *mos = spa_meta_objset(spa);
2273	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
2274	dmu_tx_t *tx;
2275	uint64_t object = space_map_object(msp->ms_sm);
2276
2277	ASSERT(!vd->vdev_ishole);
2278
2279	/*
2280	 * This metaslab has just been added so there's no work to do now.
2281	 */
2282	if (msp->ms_freeingtree == NULL) {
2283		ASSERT3P(alloctree, ==, NULL);
2284		return;
2285	}
2286
2287	ASSERT3P(alloctree, !=, NULL);
2288	ASSERT3P(msp->ms_freeingtree, !=, NULL);
2289	ASSERT3P(msp->ms_freedtree, !=, NULL);
2290
2291	/*
2292	 * Normally, we don't want to process a metaslab if there
2293	 * are no allocations or frees to perform. However, if the metaslab
2294	 * is being forced to condense and it's loaded, we need to let it
2295	 * through.
2296	 */
2297	if (range_tree_space(alloctree) == 0 &&
2298	    range_tree_space(msp->ms_freeingtree) == 0 &&
2299	    !(msp->ms_loaded && msp->ms_condense_wanted))
2300		return;
2301
2302
2303	VERIFY(txg <= spa_final_dirty_txg(spa));
2304
2305	/*
2306	 * The only state that can actually be changing concurrently with
2307	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
2308	 * be modifying this txg's alloctree, freeingtree, freedtree, or
2309	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
2310	 * space map ASSERTs. We drop it whenever we call into the DMU,
2311	 * because the DMU can call down to us (e.g. via zio_free()) at
2312	 * any time.
2313	 */
2314
2315	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2316
2317	if (msp->ms_sm == NULL) {
2318		uint64_t new_object;
2319
2320		new_object = space_map_alloc(mos, tx);
2321		VERIFY3U(new_object, !=, 0);
2322
2323		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2324		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
2325		    &msp->ms_lock));
2326		ASSERT(msp->ms_sm != NULL);
2327	}
2328
2329	mutex_enter(&msp->ms_lock);
2330
2331	/*
2332	 * Note: metaslab_condense() clears the space map's histogram.
2333	 * Therefore we must verify and remove this histogram before
2334	 * condensing.
2335	 */
2336	metaslab_group_histogram_verify(mg);
2337	metaslab_class_histogram_verify(mg->mg_class);
2338	metaslab_group_histogram_remove(mg, msp);
2339
2340	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
2341	    metaslab_should_condense(msp)) {
2342		metaslab_condense(msp, txg, tx);
2343	} else {
2344		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
2345		space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
2346	}
2347
2348	if (msp->ms_loaded) {
2349		/*
2350		 * When the space map is loaded, we have an accurate
2351		 * histogram in the range tree. This gives us an opportunity
2352		 * to bring the space map's histogram up-to-date so we clear
2353		 * it first before updating it.
2354		 */
2355		space_map_histogram_clear(msp->ms_sm);
2356		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
2357
2358		/*
2359		 * Since we've cleared the histogram we need to add back
2360		 * any free space that has already been processed, plus
2361		 * any deferred space. This allows the on-disk histogram
2362		 * to accurately reflect all free space even if some space
2363		 * is not yet available for allocation (i.e. deferred).
2364		 */
2365		space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);
2366
2367		/*
2368		 * Add back any deferred free space that has not been
2369		 * added back into the in-core free tree yet. This will
2370		 * ensure that we don't end up with a space map histogram
2371		 * that is completely empty unless the metaslab is fully
2372		 * allocated.
2373		 */
2374		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2375			space_map_histogram_add(msp->ms_sm,
2376			    msp->ms_defertree[t], tx);
2377		}
2378	}
2379
2380	/*
2381	 * Always add the free space from this sync pass to the space
2382	 * map histogram. We want to make sure that the on-disk histogram
2383	 * accounts for all free space. If the space map is not loaded,
2384	 * then we will lose some accuracy but will correct it the next
2385	 * time we load the space map.
2386	 */
2387	space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);
2388
2389	metaslab_group_histogram_add(mg, msp);
2390	metaslab_group_histogram_verify(mg);
2391	metaslab_class_histogram_verify(mg->mg_class);
2392
2393	/*
2394	 * For sync pass 1, we avoid traversing this txg's free range tree
2395	 * and instead will just swap the pointers for freeingtree and
2396	 * freedtree. We can safely do this since the freedtree is
2397	 * guaranteed to be empty on the initial pass.
2398	 */
2399	if (spa_sync_pass(spa) == 1) {
2400		range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
2401	} else {
2402		range_tree_vacate(msp->ms_freeingtree,
2403		    range_tree_add, msp->ms_freedtree);
2404	}
2405	range_tree_vacate(alloctree, NULL, NULL);
2406
2407	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2408	ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
2409	ASSERT0(range_tree_space(msp->ms_freeingtree));
2410
2411	mutex_exit(&msp->ms_lock);
2412
2413	if (object != space_map_object(msp->ms_sm)) {
2414		object = space_map_object(msp->ms_sm);
2415		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2416		    msp->ms_id, sizeof (uint64_t), &object, tx);
2417	}
2418	dmu_tx_commit(tx);
2419}
2420
2421/*
2422 * Called after a transaction group has completely synced to mark
2423 * all of the metaslab's free space as usable.
2424 */
2425void
2426metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2427{
2428	metaslab_group_t *mg = msp->ms_group;
2429	vdev_t *vd = mg->mg_vd;
2430	spa_t *spa = vd->vdev_spa;
2431	range_tree_t **defer_tree;
2432	int64_t alloc_delta, defer_delta;
2433	boolean_t defer_allowed = B_TRUE;
2434
2435	ASSERT(!vd->vdev_ishole);
2436
2437	mutex_enter(&msp->ms_lock);
2438
2439	/*
2440	 * If this metaslab is just becoming available, initialize its
2441	 * range trees and add its capacity to the vdev.
2442	 */
2443	if (msp->ms_freedtree == NULL) {
2444		for (int t = 0; t < TXG_SIZE; t++) {
2445			ASSERT(msp->ms_alloctree[t] == NULL);
2446
2447			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
2448			    &msp->ms_lock);
2449		}
2450
2451		ASSERT3P(msp->ms_freeingtree, ==, NULL);
2452		msp->ms_freeingtree = range_tree_create(NULL, msp,
2453		    &msp->ms_lock);
2454
2455		ASSERT3P(msp->ms_freedtree, ==, NULL);
2456		msp->ms_freedtree = range_tree_create(NULL, msp,
2457		    &msp->ms_lock);
2458
2459		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2460			ASSERT(msp->ms_defertree[t] == NULL);
2461
2462			msp->ms_defertree[t] = range_tree_create(NULL, msp,
2463			    &msp->ms_lock);
2464		}
2465
2466		vdev_space_update(vd, 0, 0, msp->ms_size);
2467	}
2468
2469	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
2470
2471	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
2472	    metaslab_class_get_alloc(spa_normal_class(spa));
2473	if (free_space <= spa_get_slop_space(spa)) {
2474		defer_allowed = B_FALSE;
2475	}
2476
2477	defer_delta = 0;
2478	alloc_delta = space_map_alloc_delta(msp->ms_sm);
2479	if (defer_allowed) {
2480		defer_delta = range_tree_space(msp->ms_freedtree) -
2481		    range_tree_space(*defer_tree);
2482	} else {
2483		defer_delta -= range_tree_space(*defer_tree);
2484	}
2485
2486	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2487
2488	/*
2489	 * If there's a metaslab_load() in progress, wait for it to complete
2490	 * so that we have a consistent view of the in-core space map.
2491	 */
2492	metaslab_load_wait(msp);
2493
2494	/*
2495	 * Move the frees from the defer_tree back to the free
2496	 * range tree (if it's loaded). Swap the freed_tree and the
2497	 * defer_tree -- this is safe to do because we've just emptied out
2498	 * the defer_tree.
2499	 */
2500	range_tree_vacate(*defer_tree,
2501	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2502	if (defer_allowed) {
2503		range_tree_swap(&msp->ms_freedtree, defer_tree);
2504	} else {
2505		range_tree_vacate(msp->ms_freedtree,
2506		    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2507	}
2508
2509	space_map_update(msp->ms_sm);
2510
2511	msp->ms_deferspace += defer_delta;
2512	ASSERT3S(msp->ms_deferspace, >=, 0);
2513	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2514	if (msp->ms_deferspace != 0) {
2515		/*
2516		 * Keep syncing this metaslab until all deferred frees
2517		 * are back in circulation.
2518		 */
2519		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2520	}
2521
2522	/*
2523	 * Calculate the new weights before unloading any metaslabs.
2524	 * This will give us the most accurate weighting.
2525	 */
2526	metaslab_group_sort(mg, msp, metaslab_weight(msp));
2527
2528	/*
2529	 * If the metaslab is loaded and we've not tried to load or allocate
2530	 * from it in 'metaslab_unload_delay' txgs, then unload it.
2531	 */
2532	if (msp->ms_loaded &&
2533	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
2534		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2535			VERIFY0(range_tree_space(
2536			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
2537		}
2538
2539		if (!metaslab_debug_unload)
2540			metaslab_unload(msp);
2541	}
2542
2543	mutex_exit(&msp->ms_lock);
2544}
2545
2546void
2547metaslab_sync_reassess(metaslab_group_t *mg)
2548{
2549	metaslab_group_alloc_update(mg);
2550	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2551
2552	/*
2553	 * Preload the next potential metaslabs
2554	 */
2555	metaslab_group_preload(mg);
2556}
2557
2558static uint64_t
2559metaslab_distance(metaslab_t *msp, dva_t *dva)
2560{
2561	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2562	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2563	uint64_t start = msp->ms_id;
2564
2565	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2566		return (1ULL << 63);
2567
2568	if (offset < start)
2569		return ((start - offset) << ms_shift);
2570	if (offset > start)
2571		return ((offset - start) << ms_shift);
2572	return (0);
2573}
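
/*
 * Example of the distance calculation above: if the existing DVA lands
 * in metaslab 10 of the same vdev and msp->ms_id is 13, the distance is
 * (13 - 10) << ms_shift, i.e. three metaslab-sizes worth of offset.
 * A DVA on a different vdev always reports the maximal distance, 1 << 63.
 */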
2574
2575/*
2576 * ==========================================================================
2577 * Metaslab allocation tracing facility
2578 * ==========================================================================
2579 */
2580kstat_t *metaslab_trace_ksp;
2581kstat_named_t metaslab_trace_over_limit;
2582
2583void
2584metaslab_alloc_trace_init(void)
2585{
2586	ASSERT(metaslab_alloc_trace_cache == NULL);
2587	metaslab_alloc_trace_cache = kmem_cache_create(
2588	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
2589	    0, NULL, NULL, NULL, NULL, NULL, 0);
2590	metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
2591	    "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
2592	if (metaslab_trace_ksp != NULL) {
2593		metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
2594		kstat_named_init(&metaslab_trace_over_limit,
2595		    "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
2596		kstat_install(metaslab_trace_ksp);
2597	}
2598}
2599
2600void
2601metaslab_alloc_trace_fini(void)
2602{
2603	if (metaslab_trace_ksp != NULL) {
2604		kstat_delete(metaslab_trace_ksp);
2605		metaslab_trace_ksp = NULL;
2606	}
2607	kmem_cache_destroy(metaslab_alloc_trace_cache);
2608	metaslab_alloc_trace_cache = NULL;
2609}
2610
2611/*
2612 * Add an allocation trace element to the allocation tracing list.
2613 */
2614static void
2615metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2616    metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
2617{
2618	if (!metaslab_trace_enabled)
2619		return;
2620
2621	/*
2622	 * When the tracing list reaches its maximum we remove
2623	 * the second element in the list before adding a new one.
2624	 * By removing the second element we preserve the original
2625	 * entry as a clue to what allocation steps have already been
2626	 * performed.
2627	 */
2628	if (zal->zal_size == metaslab_trace_max_entries) {
2629		metaslab_alloc_trace_t *mat_next;
2630#ifdef DEBUG
2631		panic("too many entries in allocation list");
2632#endif
2633		atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
2634		zal->zal_size--;
2635		mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
2636		list_remove(&zal->zal_list, mat_next);
2637		kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
2638	}
2639
2640	metaslab_alloc_trace_t *mat =
2641	    kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
2642	list_link_init(&mat->mat_list_node);
2643	mat->mat_mg = mg;
2644	mat->mat_msp = msp;
2645	mat->mat_size = psize;
2646	mat->mat_dva_id = dva_id;
2647	mat->mat_offset = offset;
2648	mat->mat_weight = 0;
2649
2650	if (msp != NULL)
2651		mat->mat_weight = msp->ms_weight;
2652
2653	/*
2654	 * The list is part of the zio so locking is not required. Only
2655	 * a single thread will perform allocations for a given zio.
2656	 */
2657	list_insert_tail(&zal->zal_list, mat);
2658	zal->zal_size++;
2659
2660	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
2661}
2662
2663void
2664metaslab_trace_init(zio_alloc_list_t *zal)
2665{
2666	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
2667	    offsetof(metaslab_alloc_trace_t, mat_list_node));
2668	zal->zal_size = 0;
2669}
2670
2671void
2672metaslab_trace_fini(zio_alloc_list_t *zal)
2673{
2674	metaslab_alloc_trace_t *mat;
2675
2676	while ((mat = list_remove_head(&zal->zal_list)) != NULL)
2677		kmem_cache_free(metaslab_alloc_trace_cache, mat);
2678	list_destroy(&zal->zal_list);
2679	zal->zal_size = 0;
2680}
2681
2682/*
2683 * ==========================================================================
2684 * Metaslab block operations
2685 * ==========================================================================
2686 */
2687
2688static void
2689metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2690{
2691	if (!(flags & METASLAB_ASYNC_ALLOC) ||
2692	    flags & METASLAB_DONT_THROTTLE)
2693		return;
2694
2695	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2696	if (!mg->mg_class->mc_alloc_throttle_enabled)
2697		return;
2698
2699	(void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2700}
2701
2702void
2703metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2704{
2705	if (!(flags & METASLAB_ASYNC_ALLOC) ||
2706	    flags & METASLAB_DONT_THROTTLE)
2707		return;
2708
2709	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2710	if (!mg->mg_class->mc_alloc_throttle_enabled)
2711		return;
2712
2713	(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2714}
2715
2716void
2717metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2718{
2719#ifdef ZFS_DEBUG
2720	const dva_t *dva = bp->blk_dva;
2721	int ndvas = BP_GET_NDVAS(bp);
2722
2723	for (int d = 0; d < ndvas; d++) {
2724		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2725		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2726		VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2727	}
2728#endif
2729}
2730
2731static uint64_t
2732metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
2733{
2734	uint64_t start;
2735	range_tree_t *rt = msp->ms_tree;
2736	metaslab_class_t *mc = msp->ms_group->mg_class;
2737
2738	VERIFY(!msp->ms_condensing);
2739
2740	start = mc->mc_ops->msop_alloc(msp, size);
2741	if (start != -1ULL) {
2742		metaslab_group_t *mg = msp->ms_group;
2743		vdev_t *vd = mg->mg_vd;
2744
2745		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
2746		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2747		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
2748		range_tree_remove(rt, start, size);
2749
2750		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2751			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2752
2753		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
2754
2755		/* Track the last successful allocation */
2756		msp->ms_alloc_txg = txg;
2757		metaslab_verify_space(msp, txg);
2758	}
2759
2760	/*
2761	 * Now that we've attempted the allocation we need to update the
2762	 * metaslab's maximum block size since it may have changed.
2763	 */
2764	msp->ms_max_size = metaslab_block_maxsize(msp);
2765	return (start);
2766}
2767
2768static uint64_t
2769metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
2770    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2771{
2772	metaslab_t *msp = NULL;
2773	uint64_t offset = -1ULL;
2774	uint64_t activation_weight;
2775	uint64_t target_distance;
2776	int i;
2777
2778	activation_weight = METASLAB_WEIGHT_PRIMARY;
2779	for (i = 0; i < d; i++) {
2780		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2781			activation_weight = METASLAB_WEIGHT_SECONDARY;
2782			break;
2783		}
2784	}
2785
2786	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
2787	search->ms_weight = UINT64_MAX;
2788	search->ms_start = 0;
2789	for (;;) {
2790		boolean_t was_active;
2791		avl_tree_t *t = &mg->mg_metaslab_tree;
2792		avl_index_t idx;
2793
2794		mutex_enter(&mg->mg_lock);
2795
2796		/*
2797		 * Find the metaslab with the highest weight that is less
2798		 * than what we've already tried.  In the common case, this
2799		 * means that we will examine each metaslab at most once.
2800		 * Note that concurrent callers could reorder metaslabs
2801		 * by activation/passivation once we have dropped the mg_lock.
2802		 * If a metaslab is activated by another thread, and we fail
2803		 * to allocate from the metaslab we have selected, we may
2804		 * not try the newly-activated metaslab, and instead activate
2805		 * another metaslab.  This is not optimal, but generally
2806		 * does not cause any problems (a possible exception being
2807		 * if every metaslab is completely full except for the
2808		 * newly-activated metaslab which we fail to examine).
2809		 */
2810		msp = avl_find(t, search, &idx);
2811		if (msp == NULL)
2812			msp = avl_nearest(t, idx, AVL_AFTER);
2813		for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
2814
2815			if (!metaslab_should_allocate(msp, asize)) {
2816				metaslab_trace_add(zal, mg, msp, asize, d,
2817				    TRACE_TOO_SMALL);
2818				continue;
2819			}
2820
2821			/*
2822			 * If the selected metaslab is condensing, skip it.
2823			 */
2824			if (msp->ms_condensing)
2825				continue;
2826
2827			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2828			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2829				break;
2830
2831			target_distance = min_distance +
2832			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
2833			    min_distance >> 1);
2834
2835			for (i = 0; i < d; i++) {
2836				if (metaslab_distance(msp, &dva[i]) <
2837				    target_distance)
2838					break;
2839			}
2840			if (i == d)
2841				break;
2842		}
2843		mutex_exit(&mg->mg_lock);
2844		if (msp == NULL) {
2845			kmem_free(search, sizeof (*search));
2846			return (-1ULL);
2847		}
2848		search->ms_weight = msp->ms_weight;
2849		search->ms_start = msp->ms_start + 1;
2850
2851		mutex_enter(&msp->ms_lock);
2852
2853		/*
2854		 * Ensure that the metaslab we have selected is still
2855		 * capable of handling our request. It's possible that
2856		 * another thread may have changed the weight while we
2857		 * were blocked on the metaslab lock. We check the
2858		 * active status first to see if we need to reselect
2859		 * a new metaslab.
2860		 */
2861		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
2862			mutex_exit(&msp->ms_lock);
2863			continue;
2864		}
2865
2866		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2867		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
2868			metaslab_passivate(msp,
2869			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2870			mutex_exit(&msp->ms_lock);
2871			continue;
2872		}
2873
2874		if (metaslab_activate(msp, activation_weight) != 0) {
2875			mutex_exit(&msp->ms_lock);
2876			continue;
2877		}
2878		msp->ms_selected_txg = txg;
2879
2880		/*
2881		 * Now that we have the lock, recheck to see if we should
2882		 * continue to use this metaslab for this allocation. The
2883		 * metaslab is now loaded so metaslab_should_allocate() can
2884		 * accurately determine if the allocation attempt should
2885		 * proceed.
2886		 */
2887		if (!metaslab_should_allocate(msp, asize)) {
2888			/* Passivate this metaslab and select a new one. */
2889			metaslab_trace_add(zal, mg, msp, asize, d,
2890			    TRACE_TOO_SMALL);
2891			goto next;
2892		}
2893
2894		/*
2895		 * If this metaslab is currently condensing then pick again as
2896		 * we can't manipulate this metaslab until it's committed
2897		 * to disk.
2898		 */
2899		if (msp->ms_condensing) {
2900			metaslab_trace_add(zal, mg, msp, asize, d,
2901			    TRACE_CONDENSING);
2902			mutex_exit(&msp->ms_lock);
2903			continue;
2904		}
2905
2906		offset = metaslab_block_alloc(msp, asize, txg);
2907		metaslab_trace_add(zal, mg, msp, asize, d, offset);
2908
2909		if (offset != -1ULL) {
2910			/* Proactively passivate the metaslab, if needed */
2911			metaslab_segment_may_passivate(msp);
2912			break;
2913		}
2914next:
2915		ASSERT(msp->ms_loaded);
2916
2917		/*
2918		 * We were unable to allocate from this metaslab so determine
2919		 * a new weight for this metaslab. Now that we have loaded
2920		 * the metaslab we can provide a better hint to the metaslab
2921		 * selector.
2922		 *
2923		 * For space-based metaslabs, we use the maximum block size.
2924		 * This information is only available when the metaslab
2925		 * is loaded and is more accurate than the generic free
2926		 * space weight that was calculated by metaslab_weight().
2927		 * This information allows us to quickly compare the maximum
2928		 * available allocation in the metaslab to the allocation
2929		 * size being requested.
2930		 *
2931		 * For segment-based metaslabs, determine the new weight
2932		 * based on the highest bucket in the range tree. We
2933		 * explicitly use the loaded segment weight (i.e. the range
2934		 * tree histogram) since it contains the space that is
2935		 * currently available for allocation and is accurate
2936		 * even within a sync pass.
2937		 */
2938		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
2939			uint64_t weight = metaslab_block_maxsize(msp);
2940			WEIGHT_SET_SPACEBASED(weight);
2941			metaslab_passivate(msp, weight);
2942		} else {
2943			metaslab_passivate(msp,
2944			    metaslab_weight_from_range_tree(msp));
2945		}
2946
2947		/*
2948		 * We have just failed an allocation attempt, check
2949		 * that metaslab_should_allocate() agrees. Otherwise,
2950		 * we may end up in an infinite loop retrying the same
2951		 * metaslab.
2952		 */
2953		ASSERT(!metaslab_should_allocate(msp, asize));
2954		mutex_exit(&msp->ms_lock);
2955	}
2956	mutex_exit(&msp->ms_lock);
2957	kmem_free(search, sizeof (*search));
2958	return (offset);
2959}
2960
2961static uint64_t
2962metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
2963    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2964{
2965	uint64_t offset;
2966	ASSERT(mg->mg_initialized);
2967
2968	offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
2969	    min_distance, dva, d);
2970
2971	mutex_enter(&mg->mg_lock);
2972	if (offset == -1ULL) {
2973		mg->mg_failed_allocations++;
2974		metaslab_trace_add(zal, mg, NULL, asize, d,
2975		    TRACE_GROUP_FAILURE);
2976		if (asize == SPA_GANGBLOCKSIZE) {
2977			/*
2978			 * This metaslab group was unable to allocate
2979			 * the minimum gang block size so it must be out of
2980			 * space. We must notify the allocation throttle
2981			 * to start skipping allocation attempts to this
2982			 * metaslab group until more space becomes available.
2983			 * Note: this failure cannot be caused by the
2984			 * allocation throttle since the allocation throttle
2985			 * is only responsible for skipping devices and
2986			 * not failing block allocations.
2987			 */
2988			mg->mg_no_free_space = B_TRUE;
2989		}
2990	}
2991	mg->mg_allocations++;
2992	mutex_exit(&mg->mg_lock);
2993	return (offset);
2994}
2995
2996/*
2997 * If we have to write a ditto block (i.e. more than one DVA for a given BP)
2998 * on the same vdev as an existing DVA of this BP, then try to allocate it
2999 * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
3000 * existing DVAs.
3001 */
3002int ditto_same_vdev_distance_shift = 3;
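
/*
 * For example (illustrative sizes): with the shift of 3 above, a 1TB
 * top-level vdev yields min_distance = 1TB >> 3 = 128GB, and
 * metaslab_group_alloc_normal() raises the target to 192GB
 * (min_distance + min_distance / 2) when the candidate metaslab is
 * completely empty. When try_hard is set, metaslab_alloc_dva() drops
 * the distance requirement to 0 entirely.
 */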
3003
3004/*
3005 * Allocate a block for the specified i/o.
3006 */
3007static int
3008metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
3009    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
3010    zio_alloc_list_t *zal)
3011{
3012	metaslab_group_t *mg, *rotor;
3013	vdev_t *vd;
3014	boolean_t try_hard = B_FALSE;
3015
3016	ASSERT(!DVA_IS_VALID(&dva[d]));
3017
3018	/*
3019	 * For testing, make some blocks above a certain size be gang blocks.
3020	 */
3021	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
3022		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
3023		return (SET_ERROR(ENOSPC));
3024	}
3025
3026	/*
3027	 * Start at the rotor and loop through all mgs until we find something.
3028	 * Note that there's no locking on mc_rotor or mc_aliquot because
3029	 * nothing actually breaks if we miss a few updates -- we just won't
3030	 * allocate quite as evenly.  It all balances out over time.
3031	 *
3032	 * If we are doing ditto or log blocks, try to spread them across
3033	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
3034	 * allocated all of our ditto blocks, then try and spread them out on
3035	 * that vdev as much as possible.  If it turns out to not be possible,
3036	 * gradually lower our standards until anything becomes acceptable.
3037	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
3038	 * gives us hope of containing our fault domains to something we're
3039	 * able to reason about.  Otherwise, any two top-level vdev failures
3040	 * will guarantee the loss of data.  With consecutive allocation,
3041	 * only two adjacent top-level vdev failures will result in data loss.
3042	 *
3043	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
3044	 * ourselves on the same vdev as our gang block header.  That
3045	 * way, we can hope for locality in vdev_cache, plus it makes our
3046	 * fault domains something tractable.
3047	 */
3048	if (hintdva) {
3049		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
3050
3051		/*
3052		 * It's possible the vdev we're using as the hint no
3053		 * longer exists (i.e. removed). Consult the rotor when
3054		 * all else fails.
3055		 */
3056		if (vd != NULL) {
3057			mg = vd->vdev_mg;
3058
3059			if (flags & METASLAB_HINTBP_AVOID &&
3060			    mg->mg_next != NULL)
3061				mg = mg->mg_next;
3062		} else {
3063			mg = mc->mc_rotor;
3064		}
3065	} else if (d != 0) {
3066		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
3067		mg = vd->vdev_mg->mg_next;
3068	} else {
3069		mg = mc->mc_rotor;
3070	}
3071
3072	/*
3073	 * If the hint put us into the wrong metaslab class, or into a
3074	 * metaslab group that has been passivated, just follow the rotor.
3075	 */
3076	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
3077		mg = mc->mc_rotor;
3078
3079	rotor = mg;
3080top:
3081	do {
3082		boolean_t allocatable;
3083
3084		ASSERT(mg->mg_activation_count == 1);
3085		vd = mg->mg_vd;
3086
3087		/*
3088		 * Don't allocate from faulted devices.
3089		 */
3090		if (try_hard) {
3091			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
3092			allocatable = vdev_allocatable(vd);
3093			spa_config_exit(spa, SCL_ZIO, FTAG);
3094		} else {
3095			allocatable = vdev_allocatable(vd);
3096		}
3097
3098		/*
3099		 * Determine if the selected metaslab group is eligible
3100		 * for allocations. If we're ganging then don't allow
3101		 * this metaslab group to skip allocations since that would
3102		 * inadvertently return ENOSPC and suspend the pool
3103		 * even though space is still available.
3104		 */
3105		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3106			allocatable = metaslab_group_allocatable(mg, rotor,
3107			    psize);
3108		}
3109
3110		if (!allocatable) {
3111			metaslab_trace_add(zal, mg, NULL, psize, d,
3112			    TRACE_NOT_ALLOCATABLE);
3113			goto next;
3114		}
3115
3116		ASSERT(mg->mg_initialized);
3117
3118		/*
3119		 * Avoid writing single-copy data to a failing,
3120		 * non-redundant vdev, unless we've already tried all
3121		 * other vdevs.
3122		 */
3123		if ((vd->vdev_stat.vs_write_errors > 0 ||
3124		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
3125		    d == 0 && !try_hard && vd->vdev_children == 0) {
3126			metaslab_trace_add(zal, mg, NULL, psize, d,
3127			    TRACE_VDEV_ERROR);
3128			goto next;
3129		}
3130
3131		ASSERT(mg->mg_class == mc);
3132
3133		/*
3134		 * If we don't need to try hard, then require that the
3135		 * block be 1/8th of the device away from any other DVAs
3136		 * in this BP.  If we are trying hard, allow any offset
3137		 * to be used (distance=0).
3138		 */
3139		uint64_t distance = 0;
3140		if (!try_hard) {
3141			distance = vd->vdev_asize >>
3142			    ditto_same_vdev_distance_shift;
3143			if (distance <= (1ULL << vd->vdev_ms_shift))
3144				distance = 0;
3145		}
3146
3147		uint64_t asize = vdev_psize_to_asize(vd, psize);
3148		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3149
3150		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3151		    distance, dva, d);
3152
3153		if (offset != -1ULL) {
3154			/*
3155			 * If we've just selected this metaslab group,
3156			 * figure out whether the corresponding vdev is
3157			 * over- or under-used relative to the pool,
3158			 * and set an allocation bias to even it out.
3159			 */
3160			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
3161				vdev_stat_t *vs = &vd->vdev_stat;
3162				int64_t vu, cu;
3163
3164				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
3165				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
3166
3167				/*
3168				 * Calculate how much more or less we should
3169				 * try to allocate from this device during
3170				 * this iteration around the rotor.
3171				 * For example, if a device is 80% full
3172				 * and the pool is 20% full then we should
3173				 * reduce allocations by 60% on this device.
3174				 *
3175				 * mg_bias = (20 - 80) * 512K / 100 = -307K
3176				 *
3177				 * This reduces allocations by 307K for this
3178				 * iteration.
3179				 */
3180				mg->mg_bias = ((cu - vu) *
3181				    (int64_t)mg->mg_aliquot) / 100;
3182			} else if (!metaslab_bias_enabled) {
3183				mg->mg_bias = 0;
3184			}
3185
3186			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
3187			    mg->mg_aliquot + mg->mg_bias) {
3188				mc->mc_rotor = mg->mg_next;
3189				mc->mc_aliquot = 0;
3190			}
3191
3192			DVA_SET_VDEV(&dva[d], vd->vdev_id);
3193			DVA_SET_OFFSET(&dva[d], offset);
3194			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
3195			DVA_SET_ASIZE(&dva[d], asize);
3196
3197			return (0);
3198		}
3199next:
3200		mc->mc_rotor = mg->mg_next;
3201		mc->mc_aliquot = 0;
3202	} while ((mg = mg->mg_next) != rotor);
3203
3204	/*
3205	 * If we haven't tried hard, do so now.
3206	 */
3207	if (!try_hard) {
3208		try_hard = B_TRUE;
3209		goto top;
3210	}
3211
3212	bzero(&dva[d], sizeof (dva_t));
3213
3214	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
3215	return (SET_ERROR(ENOSPC));
3216}
3217
3218/*
3219 * Free the block represented by DVA in the context of the specified
3220 * transaction group.
3221 */
3222static void
3223metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
3224{
3225	uint64_t vdev = DVA_GET_VDEV(dva);
3226	uint64_t offset = DVA_GET_OFFSET(dva);
3227	uint64_t size = DVA_GET_ASIZE(dva);
3228	vdev_t *vd;
3229	metaslab_t *msp;
3230
3231	ASSERT(DVA_IS_VALID(dva));
3232
3233	if (txg > spa_freeze_txg(spa))
3234		return;
3235
3236	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3237	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
3238		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
3239		    (u_longlong_t)vdev, (u_longlong_t)offset);
3240		ASSERT(0);
3241		return;
3242	}
3243
3244	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3245
3246	if (DVA_GET_GANG(dva))
3247		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3248
3249	mutex_enter(&msp->ms_lock);
3250
3251	if (now) {
3252		range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
3253		    offset, size);
3254
3255		VERIFY(!msp->ms_condensing);
3256		VERIFY3U(offset, >=, msp->ms_start);
3257		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
3258		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
3259		    msp->ms_size);
3260		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3261		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3262		range_tree_add(msp->ms_tree, offset, size);
3263		msp->ms_max_size = metaslab_block_maxsize(msp);
3264	} else {
3265		VERIFY3U(txg, ==, spa->spa_syncing_txg);
3266		if (range_tree_space(msp->ms_freeingtree) == 0)
3267			vdev_dirty(vd, VDD_METASLAB, msp, txg);
3268		range_tree_add(msp->ms_freeingtree, offset, size);
3269	}
3270
3271	mutex_exit(&msp->ms_lock);
3272}
3273
3274/*
3275 * Intent log support: upon opening the pool after a crash, notify the SPA
3276 * of blocks that the intent log has allocated for immediate write, but
3277 * which are still considered free by the SPA because the last transaction
3278 * group didn't commit yet.
3279 */
3280static int
3281metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3282{
3283	uint64_t vdev = DVA_GET_VDEV(dva);
3284	uint64_t offset = DVA_GET_OFFSET(dva);
3285	uint64_t size = DVA_GET_ASIZE(dva);
3286	vdev_t *vd;
3287	metaslab_t *msp;
3288	int error = 0;
3289
3290	ASSERT(DVA_IS_VALID(dva));
3291
3292	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3293	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
3294		return (SET_ERROR(ENXIO));
3295
3296	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3297
3298	if (DVA_GET_GANG(dva))
3299		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3300
3301	mutex_enter(&msp->ms_lock);
3302
3303	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3304		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
3305
3306	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
3307		error = SET_ERROR(ENOENT);
3308
3309	if (error || txg == 0) {	/* txg == 0 indicates dry run */
3310		mutex_exit(&msp->ms_lock);
3311		return (error);
3312	}
3313
3314	VERIFY(!msp->ms_condensing);
3315	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3316	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3317	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
3318	range_tree_remove(msp->ms_tree, offset, size);
3319
3320	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
3321		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
3322			vdev_dirty(vd, VDD_METASLAB, msp, txg);
3323		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
3324	}
3325
3326	mutex_exit(&msp->ms_lock);
3327
3328	return (0);
3329}
3330
3331/*
3332 * Reserve some allocation slots. The reservation system must be called
3333 * before we call into the allocator. If there aren't any available slots
3334 * then the I/O will be throttled until an I/O completes and its slots are
3335 * freed up. The function returns true if it was successful in placing
3336 * the reservation.
3337 */
3338boolean_t
3339metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
3340    int flags)
3341{
3342	uint64_t available_slots = 0;
3343	boolean_t slot_reserved = B_FALSE;
3344
3345	ASSERT(mc->mc_alloc_throttle_enabled);
3346	mutex_enter(&mc->mc_lock);
3347
3348	uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
3349	if (reserved_slots < mc->mc_alloc_max_slots)
3350		available_slots = mc->mc_alloc_max_slots - reserved_slots;
3351
3352	if (slots <= available_slots || GANG_ALLOCATION(flags)) {
3353		/*
3354		 * We reserve the slots individually so that we can unreserve
3355		 * them individually when an I/O completes.
3356		 */
3357		for (int d = 0; d < slots; d++) {
3358			reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
3359		}
3360		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3361		slot_reserved = B_TRUE;
3362	}
3363
3364	mutex_exit(&mc->mc_lock);
3365	return (slot_reserved);
3366}
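
/*
 * For example (illustrative numbers): with mc_alloc_max_slots = 64 and
 * 60 slots currently reserved, available_slots is 4, so a zio asking for
 * 3 slots is admitted (and flagged ZIO_FLAG_IO_ALLOCATING) while one
 * asking for 5 is throttled -- unless it is a gang allocation, which is
 * always admitted.
 */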
3367
3368void
3369metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
3370{
3371	ASSERT(mc->mc_alloc_throttle_enabled);
3372	mutex_enter(&mc->mc_lock);
3373	for (int d = 0; d < slots; d++) {
3374		(void) refcount_remove(&mc->mc_alloc_slots, zio);
3375	}
3376	mutex_exit(&mc->mc_lock);
3377}
3378
3379int
3380metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
3381    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
3382    zio_alloc_list_t *zal, zio_t *zio)
3383{
3384	dva_t *dva = bp->blk_dva;
3385	dva_t *hintdva = hintbp->blk_dva;
3386	int error = 0;
3387
3388	ASSERT(bp->blk_birth == 0);
3389	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
3390
3391	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3392
3393	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
3394		spa_config_exit(spa, SCL_ALLOC, FTAG);
3395		return (SET_ERROR(ENOSPC));
3396	}
3397
3398	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
3399	ASSERT(BP_GET_NDVAS(bp) == 0);
3400	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
3401	ASSERT3P(zal, !=, NULL);
3402
3403	for (int d = 0; d < ndvas; d++) {
3404		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
3405		    txg, flags, zal);
3406		if (error != 0) {
3407			for (d--; d >= 0; d--) {
3408				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
3409				metaslab_group_alloc_decrement(spa,
3410				    DVA_GET_VDEV(&dva[d]), zio, flags);
3411				bzero(&dva[d], sizeof (dva_t));
3412			}
3413			spa_config_exit(spa, SCL_ALLOC, FTAG);
3414			return (error);
3415		} else {
3416			/*
3417			 * Update the metaslab group's queue depth
3418			 * based on the newly allocated dva.
3419			 */
3420			metaslab_group_alloc_increment(spa,
3421			    DVA_GET_VDEV(&dva[d]), zio, flags);
3422		}
3423
3424	}
3425	ASSERT(error == 0);
3426	ASSERT(BP_GET_NDVAS(bp) == ndvas);
3427
3428	spa_config_exit(spa, SCL_ALLOC, FTAG);
3429
3430	BP_SET_BIRTH(bp, txg, txg);
3431
3432	return (0);
3433}
3434
3435void
3436metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
3437{
3438	const dva_t *dva = bp->blk_dva;
3439	int ndvas = BP_GET_NDVAS(bp);
3440
3441	ASSERT(!BP_IS_HOLE(bp));
3442	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
3443
3444	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
3445
3446	for (int d = 0; d < ndvas; d++)
3447		metaslab_free_dva(spa, &dva[d], txg, now);
3448
3449	spa_config_exit(spa, SCL_FREE, FTAG);
3450}
3451
3452int
3453metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
3454{
3455	const dva_t *dva = bp->blk_dva;
3456	int ndvas = BP_GET_NDVAS(bp);
3457	int error = 0;
3458
3459	ASSERT(!BP_IS_HOLE(bp));
3460
3461	if (txg != 0) {
3462		/*
3463		 * First do a dry run to make sure all DVAs are claimable,
3464		 * so we don't have to unwind from partial failures below.
3465		 */
3466		if ((error = metaslab_claim(spa, bp, 0)) != 0)
3467			return (error);
3468	}
3469
3470	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3471
3472	for (int d = 0; d < ndvas; d++)
3473		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
3474			break;
3475
3476	spa_config_exit(spa, SCL_ALLOC, FTAG);
3477
3478	ASSERT(error == 0 || txg == 0);
3479
3480	return (error);
3481}
3482
3483void
3484metaslab_check_free(spa_t *spa, const blkptr_t *bp)
3485{
3486	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
3487		return;
3488
3489	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3490	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
3491		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
3492		vdev_t *vd = vdev_lookup_top(spa, vdev);
3493		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
3494		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
3495		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3496
3497		if (msp->ms_loaded)
3498			range_tree_verify(msp->ms_tree, offset, size);
3499
3500		range_tree_verify(msp->ms_freeingtree, offset, size);
3501		range_tree_verify(msp->ms_freedtree, offset, size);
3502		for (int j = 0; j < TXG_DEFER_SIZE; j++)
3503			range_tree_verify(msp->ms_defertree[j], offset, size);
3504	}
3505	spa_config_exit(spa, SCL_VDEV, FTAG);
3506}
3507