metaslab.c revision 307279
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/dmu.h>
30#include <sys/dmu_tx.h>
31#include <sys/space_map.h>
32#include <sys/metaslab_impl.h>
33#include <sys/vdev_impl.h>
34#include <sys/zio.h>
35#include <sys/spa_impl.h>
36#include <sys/zfeature.h>
37
38SYSCTL_DECL(_vfs_zfs);
39SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
40
41#define	GANG_ALLOCATION(flags) \
42	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
43
44#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
45#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
46#define	METASLAB_ACTIVE_MASK		\
47	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
48
49uint64_t metaslab_aliquot = 512ULL << 10;
50uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
51TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang);
52SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
53    &metaslab_gang_bang, 0,
54    "Force gang block allocation for blocks larger than or equal to this value");
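/*
 * Illustrative usage (editorial example, not part of the original source):
 * because the tunable above is exported with CTLFLAG_RWTUN, gang allocation
 * can be forced for testing at runtime by lowering the threshold, e.g.
 * "sysctl vfs.zfs.metaslab.gang_bang=65536", which makes allocations of
 * 64K and larger candidates for forced ganging.
 */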
55
56/*
57 * The in-core space map representation is more compact than its on-disk form.
58 * The zfs_condense_pct determines how much more compact the in-core
59 * space_map representation must be before we compact it on-disk.
60 * Values should be greater than or equal to 100.
61 */
62int zfs_condense_pct = 200;
63TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct);
64SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
65    &zfs_condense_pct, 0,
66    "Condense on-disk spacemap when it is more than this many percent"
67    " of its in-memory counterpart");
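/*
 * Worked example (illustrative, not from the original source): with the
 * default zfs_condense_pct of 200, a space map whose on-disk size has grown
 * to 2.5x its minimal in-core representation (250%) crosses the threshold
 * and becomes a candidate for condensing, while one at 150% is left alone
 * (subject also to zfs_metaslab_condense_block_threshold below).
 */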
68
69/*
70 * Condensing a metaslab is not guaranteed to actually reduce the amount of
71 * space used on disk. In particular, a space map uses data in increments of
72 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
73 * same number of blocks after condensing. Since the goal of condensing is to
74 * reduce the number of IOPs required to read the space map, we only want to
75 * condense when we can be sure we will reduce the number of blocks used by the
76 * space map. Unfortunately, we cannot precisely compute whether or not this is
77 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
78 * we apply the following heuristic: do not condense a spacemap unless the
79 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
80 * blocks.
81 */
82int zfs_metaslab_condense_block_threshold = 4;
83
84/*
85 * The zfs_mg_noalloc_threshold defines which metaslab groups should
86 * be eligible for allocation. The value is defined as a percentage of
87 * free space. Metaslab groups that have more free space than
88 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
89 * a metaslab group's free space is less than or equal to the
90 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
91 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
92 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
93 * groups are allowed to accept allocations. Gang blocks are always
94 * eligible to allocate on any metaslab group. The default value of 0 means
95 * no metaslab group will be excluded based on this criterion.
96 */
97int zfs_mg_noalloc_threshold = 0;
98TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold);
99SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
100    &zfs_mg_noalloc_threshold, 0,
101    "Percentage of metaslab group size that should be free"
102    " to make it eligible for allocation");
103
104/*
105 * Metaslab groups are considered eligible for allocations if their
106 * fragmentation metric (measured as a percentage) is less than or equal to
107 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
108 * then it will be skipped unless all metaslab groups within the metaslab
109 * class have also crossed this threshold.
110 */
111int zfs_mg_fragmentation_threshold = 85;
112TUNABLE_INT("vfs.zfs.mg_fragmentation_threshold", &zfs_mg_fragmentation_threshold);
113SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
114    &zfs_mg_fragmentation_threshold, 0,
115    "Maximum fragmentation, as a percentage, for a metaslab group to be "
116    "eligible for allocations unless all metaslab groups within the metaslab class "
117    "have also crossed this threshold");
118
119/*
120 * Allow metaslabs to keep their active state as long as their fragmentation
121 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
122 * active metaslab that exceeds this threshold will no longer keep its active
123 * status allowing better metaslabs to be selected.
124 */
125int zfs_metaslab_fragmentation_threshold = 70;
126TUNABLE_INT("vfs.zfs.metaslab.fragmentation_threshold",
127    &zfs_metaslab_fragmentation_threshold);
128SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
129    &zfs_metaslab_fragmentation_threshold, 0,
130    "Maximum fragmentation percentage at which a metaslab keeps its active state");
131
132/*
133 * When set will load all metaslabs when pool is first opened.
134 */
135int metaslab_debug_load = 0;
136TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load);
137SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
138    &metaslab_debug_load, 0,
139    "Load all metaslabs when pool is first opened");
140
141/*
142 * When set will prevent metaslabs from being unloaded.
143 */
144int metaslab_debug_unload = 0;
145TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload);
146SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
147    &metaslab_debug_unload, 0,
148    "Prevent metaslabs from being unloaded");
149
150/*
151 * Minimum size which forces the dynamic allocator to change
152 * its allocation strategy.  Once the space map cannot satisfy
153 * an allocation of this size then it switches to using a more
154 * aggressive strategy (i.e. search by size rather than offset).
155 */
156uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
157TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold",
158    &metaslab_df_alloc_threshold);
159SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
160    &metaslab_df_alloc_threshold, 0,
161    "Minimum size which forces the dynamic allocator to change its allocation strategy");
162
163/*
164 * The minimum free space, in percent, which must be available
165 * in a space map to continue allocations in a first-fit fashion.
166 * Once the space_map's free space drops below this level we dynamically
167 * switch to using best-fit allocations.
168 */
169int metaslab_df_free_pct = 4;
170TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct);
171SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
172    &metaslab_df_free_pct, 0,
173    "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion");
174
175/*
176 * A metaslab is considered "free" if it contains a contiguous
177 * segment which is greater than metaslab_min_alloc_size.
178 */
179uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
180TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size",
181    &metaslab_min_alloc_size);
182SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
183    &metaslab_min_alloc_size, 0,
184    "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size");
185
186/*
187 * Percentage of all cpus that can be used by the metaslab taskq.
188 */
189int metaslab_load_pct = 50;
190TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct);
191SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
192    &metaslab_load_pct, 0,
193    "Percentage of cpus that can be used by the metaslab taskq");
194
195/*
196 * Determines how many txgs a metaslab may remain loaded without having any
197 * allocations from it. As long as a metaslab continues to be used we will
198 * keep it loaded.
199 */
200int metaslab_unload_delay = TXG_SIZE * 2;
201TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay);
202SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
203    &metaslab_unload_delay, 0,
204    "Number of TXGs that an unused metaslab can be kept in memory");
205
206/*
207 * Max number of metaslabs per group to preload.
208 */
209int metaslab_preload_limit = SPA_DVAS_PER_BP;
210TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit);
211SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
212    &metaslab_preload_limit, 0,
213    "Max number of metaslabs per group to preload");
214
215/*
216 * Enable/disable preloading of metaslabs.
217 */
218boolean_t metaslab_preload_enabled = B_TRUE;
219TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled);
220SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
221    &metaslab_preload_enabled, 0,
222    "Enable/disable preloading of metaslabs");
223
224/*
225 * Enable/disable fragmentation weighting on metaslabs.
226 */
227boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
228TUNABLE_INT("vfs.zfs.metaslab_fragmentation_factor_enabled",
229    &metaslab_fragmentation_factor_enabled);
230SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
231    &metaslab_fragmentation_factor_enabled, 0,
232    "Enable fragmentation weighting on metaslabs");
233
234/*
235 * Enable/disable lba weighting (i.e. outer tracks are given preference).
236 */
237boolean_t metaslab_lba_weighting_enabled = B_TRUE;
238TUNABLE_INT("vfs.zfs.metaslab.lba_weighting_enabled",
239    &metaslab_lba_weighting_enabled);
240SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
241    &metaslab_lba_weighting_enabled, 0,
242    "Enable LBA weighting (i.e. outer tracks are given preference)");
243
244/*
245 * Enable/disable metaslab group biasing.
246 */
247boolean_t metaslab_bias_enabled = B_TRUE;
248TUNABLE_INT("vfs.zfs.metaslab.bias_enabled",
249    &metaslab_bias_enabled);
250SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
251    &metaslab_bias_enabled, 0,
252    "Enable metaslab group biasing");
253
254static uint64_t metaslab_fragmentation(metaslab_t *);
255
256/*
257 * ==========================================================================
258 * Metaslab classes
259 * ==========================================================================
260 */
261metaslab_class_t *
262metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
263{
264	metaslab_class_t *mc;
265
266	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
267
268	mc->mc_spa = spa;
269	mc->mc_rotor = NULL;
270	mc->mc_ops = ops;
271	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
272	refcount_create_tracked(&mc->mc_alloc_slots);
273
274	return (mc);
275}
276
277void
278metaslab_class_destroy(metaslab_class_t *mc)
279{
280	ASSERT(mc->mc_rotor == NULL);
281	ASSERT(mc->mc_alloc == 0);
282	ASSERT(mc->mc_deferred == 0);
283	ASSERT(mc->mc_space == 0);
284	ASSERT(mc->mc_dspace == 0);
285
286	refcount_destroy(&mc->mc_alloc_slots);
287	mutex_destroy(&mc->mc_lock);
288	kmem_free(mc, sizeof (metaslab_class_t));
289}
290
291int
292metaslab_class_validate(metaslab_class_t *mc)
293{
294	metaslab_group_t *mg;
295	vdev_t *vd;
296
297	/*
298	 * Must hold one of the spa_config locks.
299	 */
300	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
301	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
302
303	if ((mg = mc->mc_rotor) == NULL)
304		return (0);
305
306	do {
307		vd = mg->mg_vd;
308		ASSERT(vd->vdev_mg != NULL);
309		ASSERT3P(vd->vdev_top, ==, vd);
310		ASSERT3P(mg->mg_class, ==, mc);
311		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
312	} while ((mg = mg->mg_next) != mc->mc_rotor);
313
314	return (0);
315}
316
317void
318metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
319    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
320{
321	atomic_add_64(&mc->mc_alloc, alloc_delta);
322	atomic_add_64(&mc->mc_deferred, defer_delta);
323	atomic_add_64(&mc->mc_space, space_delta);
324	atomic_add_64(&mc->mc_dspace, dspace_delta);
325}
326
327void
328metaslab_class_minblocksize_update(metaslab_class_t *mc)
329{
330	metaslab_group_t *mg;
331	vdev_t *vd;
332	uint64_t minashift = UINT64_MAX;
333
334	if ((mg = mc->mc_rotor) == NULL) {
335		mc->mc_minblocksize = SPA_MINBLOCKSIZE;
336		return;
337	}
338
339	do {
340		vd = mg->mg_vd;
341		if (vd->vdev_ashift < minashift)
342			minashift = vd->vdev_ashift;
343	} while ((mg = mg->mg_next) != mc->mc_rotor);
344
345	mc->mc_minblocksize = 1ULL << minashift;
346}
347
348uint64_t
349metaslab_class_get_alloc(metaslab_class_t *mc)
350{
351	return (mc->mc_alloc);
352}
353
354uint64_t
355metaslab_class_get_deferred(metaslab_class_t *mc)
356{
357	return (mc->mc_deferred);
358}
359
360uint64_t
361metaslab_class_get_space(metaslab_class_t *mc)
362{
363	return (mc->mc_space);
364}
365
366uint64_t
367metaslab_class_get_dspace(metaslab_class_t *mc)
368{
369	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
370}
371
372uint64_t
373metaslab_class_get_minblocksize(metaslab_class_t *mc)
374{
375	return (mc->mc_minblocksize);
376}
377
378void
379metaslab_class_histogram_verify(metaslab_class_t *mc)
380{
381	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
382	uint64_t *mc_hist;
383	int i;
384
385	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
386		return;
387
388	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
389	    KM_SLEEP);
390
391	for (int c = 0; c < rvd->vdev_children; c++) {
392		vdev_t *tvd = rvd->vdev_child[c];
393		metaslab_group_t *mg = tvd->vdev_mg;
394
395		/*
396		 * Skip any holes, uninitialized top-levels, or
397		 * vdevs that are not in this metaslab class.
398		 */
399		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
400		    mg->mg_class != mc) {
401			continue;
402		}
403
404		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
405			mc_hist[i] += mg->mg_histogram[i];
406	}
407
408	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
409		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
410
411	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
412}
413
414/*
415 * Calculate the metaslab class's fragmentation metric. The metric
416 * is weighted based on the space contribution of each metaslab group.
417 * The return value will be a number between 0 and 100 (inclusive), or
418 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
419 * zfs_frag_table for more information about the metric.
420 */
421uint64_t
422metaslab_class_fragmentation(metaslab_class_t *mc)
423{
424	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
425	uint64_t fragmentation = 0;
426
427	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
428
429	for (int c = 0; c < rvd->vdev_children; c++) {
430		vdev_t *tvd = rvd->vdev_child[c];
431		metaslab_group_t *mg = tvd->vdev_mg;
432
433		/*
434		 * Skip any holes, uninitialized top-levels, or
435		 * vdevs that are not in this metaslab class.
436		 */
437		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
438		    mg->mg_class != mc) {
439			continue;
440		}
441
442		/*
443		 * If a metaslab group does not contain a fragmentation
444		 * metric then just bail out.
445		 */
446		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
447			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
448			return (ZFS_FRAG_INVALID);
449		}
450
451		/*
452		 * Determine how much this metaslab_group is contributing
453		 * to the overall pool fragmentation metric.
454		 */
455		fragmentation += mg->mg_fragmentation *
456		    metaslab_group_get_space(mg);
457	}
458	fragmentation /= metaslab_class_get_space(mc);
459
460	ASSERT3U(fragmentation, <=, 100);
461	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
462	return (fragmentation);
463}
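/*
 * Worked example (illustrative): a class with one group contributing 300G
 * of space at 20% fragmentation and another contributing 100G at 60%
 * reports (300 * 20 + 100 * 60) / 400 = 30% class-wide fragmentation.
 */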
464
465/*
466 * Calculate the amount of expandable space that is available in
467 * this metaslab class. If a device is expanded then its expandable
468 * space will be the amount of allocatable space that is currently not
469 * part of this metaslab class.
470 */
471uint64_t
472metaslab_class_expandable_space(metaslab_class_t *mc)
473{
474	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
475	uint64_t space = 0;
476
477	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
478	for (int c = 0; c < rvd->vdev_children; c++) {
479		vdev_t *tvd = rvd->vdev_child[c];
480		metaslab_group_t *mg = tvd->vdev_mg;
481
482		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
483		    mg->mg_class != mc) {
484			continue;
485		}
486
487		/*
488		 * Calculate if we have enough space to add additional
489		 * metaslabs. We report the expandable space in terms
490		 * of the metaslab size since that's the unit of expansion.
491		 */
492		space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
493		    1ULL << tvd->vdev_ms_shift);
494	}
495	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
496	return (space);
497}
498
499/*
500 * ==========================================================================
501 * Metaslab groups
502 * ==========================================================================
503 */
504static int
505metaslab_compare(const void *x1, const void *x2)
506{
507	const metaslab_t *m1 = x1;
508	const metaslab_t *m2 = x2;
509
510	if (m1->ms_weight < m2->ms_weight)
511		return (1);
512	if (m1->ms_weight > m2->ms_weight)
513		return (-1);
514
515	/*
516	 * If the weights are identical, use the offset to force uniqueness.
517	 */
518	if (m1->ms_start < m2->ms_start)
519		return (-1);
520	if (m1->ms_start > m2->ms_start)
521		return (1);
522
523	ASSERT3P(m1, ==, m2);
524
525	return (0);
526}
527
528/*
529 * Update the allocatable flag and the metaslab group's capacity.
530 * The allocatable flag is set to true if the free capacity is greater than
531 * the zfs_mg_noalloc_threshold and the fragmentation metric is either invalid
532 * or no greater than zfs_mg_fragmentation_threshold. If a metaslab group
533 * transitions from allocatable to non-allocatable or vice versa then the
534 * metaslab group's class is updated to reflect the transition.
535 */
536static void
537metaslab_group_alloc_update(metaslab_group_t *mg)
538{
539	vdev_t *vd = mg->mg_vd;
540	metaslab_class_t *mc = mg->mg_class;
541	vdev_stat_t *vs = &vd->vdev_stat;
542	boolean_t was_allocatable;
543	boolean_t was_initialized;
544
545	ASSERT(vd == vd->vdev_top);
546
547	mutex_enter(&mg->mg_lock);
548	was_allocatable = mg->mg_allocatable;
549	was_initialized = mg->mg_initialized;
550
551	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
552	    (vs->vs_space + 1);
553
554	mutex_enter(&mc->mc_lock);
555
556	/*
557	 * If the metaslab group was just added then it won't
558	 * have any space until we finish syncing out this txg.
559	 * At that point we will consider it initialized and available
560	 * for allocations.  We also don't consider non-activated
561	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
562	 * to be initialized, because they can't be used for allocation.
563	 */
564	mg->mg_initialized = metaslab_group_initialized(mg);
565	if (!was_initialized && mg->mg_initialized) {
566		mc->mc_groups++;
567	} else if (was_initialized && !mg->mg_initialized) {
568		ASSERT3U(mc->mc_groups, >, 0);
569		mc->mc_groups--;
570	}
571	if (mg->mg_initialized)
572		mg->mg_no_free_space = B_FALSE;
573
574	/*
575	 * A metaslab group is considered allocatable if it has plenty
576	 * of free space or is not heavily fragmented. We only take
577	 * fragmentation into account if the metaslab group has a valid
578	 * fragmentation metric (i.e. a value between 0 and 100).
579	 */
580	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
581	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
582	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
583	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
584
585	/*
586	 * The mc_alloc_groups maintains a count of the number of
587	 * groups in this metaslab class that are still above the
588	 * zfs_mg_noalloc_threshold. This is used by the allocating
589	 * threads to determine if they should avoid allocations to
590	 * a given group. The allocator will avoid allocations to a group
591	 * if that group has reached or is below the zfs_mg_noalloc_threshold
592	 * and there are still other groups that are above the threshold.
593	 * When a group transitions from allocatable to non-allocatable or
594	 * vice versa we update the metaslab class to reflect that change.
595	 * When the mc_alloc_groups value drops to 0 that means that all
596	 * groups have reached the zfs_mg_noalloc_threshold making all groups
597	 * eligible for allocations. This effectively means that all devices
598	 * are balanced again.
599	 */
600	if (was_allocatable && !mg->mg_allocatable)
601		mc->mc_alloc_groups--;
602	else if (!was_allocatable && mg->mg_allocatable)
603		mc->mc_alloc_groups++;
604	mutex_exit(&mc->mc_lock);
605
606	mutex_exit(&mg->mg_lock);
607}
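/*
 * Worked example (illustrative): mg_free_capacity above is an integer
 * percentage of free space; the "+ 1" in the divisor only guards against
 * division by zero, so a vdev that is 80% allocated reports roughly 20.
 * With the default zfs_mg_noalloc_threshold of 0 such a group stays
 * allocatable as long as its fragmentation is within
 * zfs_mg_fragmentation_threshold.
 */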
608
609metaslab_group_t *
610metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
611{
612	metaslab_group_t *mg;
613
614	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
615	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
616	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
617	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
618	mg->mg_vd = vd;
619	mg->mg_class = mc;
620	mg->mg_activation_count = 0;
621	mg->mg_initialized = B_FALSE;
622	mg->mg_no_free_space = B_TRUE;
623	refcount_create_tracked(&mg->mg_alloc_queue_depth);
624
625	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
626	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
627
628	return (mg);
629}
630
631void
632metaslab_group_destroy(metaslab_group_t *mg)
633{
634	ASSERT(mg->mg_prev == NULL);
635	ASSERT(mg->mg_next == NULL);
636	/*
637	 * We may have gone below zero with the activation count
638	 * either because we never activated in the first place or
639	 * because we're done, and possibly removing the vdev.
640	 */
641	ASSERT(mg->mg_activation_count <= 0);
642
643	taskq_destroy(mg->mg_taskq);
644	avl_destroy(&mg->mg_metaslab_tree);
645	mutex_destroy(&mg->mg_lock);
646	refcount_destroy(&mg->mg_alloc_queue_depth);
647	kmem_free(mg, sizeof (metaslab_group_t));
648}
649
650void
651metaslab_group_activate(metaslab_group_t *mg)
652{
653	metaslab_class_t *mc = mg->mg_class;
654	metaslab_group_t *mgprev, *mgnext;
655
656	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
657
658	ASSERT(mc->mc_rotor != mg);
659	ASSERT(mg->mg_prev == NULL);
660	ASSERT(mg->mg_next == NULL);
661	ASSERT(mg->mg_activation_count <= 0);
662
663	if (++mg->mg_activation_count <= 0)
664		return;
665
666	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
667	metaslab_group_alloc_update(mg);
668
669	if ((mgprev = mc->mc_rotor) == NULL) {
670		mg->mg_prev = mg;
671		mg->mg_next = mg;
672	} else {
673		mgnext = mgprev->mg_next;
674		mg->mg_prev = mgprev;
675		mg->mg_next = mgnext;
676		mgprev->mg_next = mg;
677		mgnext->mg_prev = mg;
678	}
679	mc->mc_rotor = mg;
680	metaslab_class_minblocksize_update(mc);
681}
682
683void
684metaslab_group_passivate(metaslab_group_t *mg)
685{
686	metaslab_class_t *mc = mg->mg_class;
687	metaslab_group_t *mgprev, *mgnext;
688
689	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
690
691	if (--mg->mg_activation_count != 0) {
692		ASSERT(mc->mc_rotor != mg);
693		ASSERT(mg->mg_prev == NULL);
694		ASSERT(mg->mg_next == NULL);
695		ASSERT(mg->mg_activation_count < 0);
696		return;
697	}
698
699	taskq_wait(mg->mg_taskq);
700	metaslab_group_alloc_update(mg);
701
702	mgprev = mg->mg_prev;
703	mgnext = mg->mg_next;
704
705	if (mg == mgnext) {
706		mc->mc_rotor = NULL;
707	} else {
708		mc->mc_rotor = mgnext;
709		mgprev->mg_next = mgnext;
710		mgnext->mg_prev = mgprev;
711	}
712
713	mg->mg_prev = NULL;
714	mg->mg_next = NULL;
715	metaslab_class_minblocksize_update(mc);
716}
717
718boolean_t
719metaslab_group_initialized(metaslab_group_t *mg)
720{
721	vdev_t *vd = mg->mg_vd;
722	vdev_stat_t *vs = &vd->vdev_stat;
723
724	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
725}
726
727uint64_t
728metaslab_group_get_space(metaslab_group_t *mg)
729{
730	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
731}
732
733void
734metaslab_group_histogram_verify(metaslab_group_t *mg)
735{
736	uint64_t *mg_hist;
737	vdev_t *vd = mg->mg_vd;
738	uint64_t ashift = vd->vdev_ashift;
739	int i;
740
741	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
742		return;
743
744	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
745	    KM_SLEEP);
746
747	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
748	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
749
750	for (int m = 0; m < vd->vdev_ms_count; m++) {
751		metaslab_t *msp = vd->vdev_ms[m];
752
753		if (msp->ms_sm == NULL)
754			continue;
755
756		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
757			mg_hist[i + ashift] +=
758			    msp->ms_sm->sm_phys->smp_histogram[i];
759	}
760
761	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
762		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
763
764	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
765}
766
767static void
768metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
769{
770	metaslab_class_t *mc = mg->mg_class;
771	uint64_t ashift = mg->mg_vd->vdev_ashift;
772
773	ASSERT(MUTEX_HELD(&msp->ms_lock));
774	if (msp->ms_sm == NULL)
775		return;
776
777	mutex_enter(&mg->mg_lock);
778	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
779		mg->mg_histogram[i + ashift] +=
780		    msp->ms_sm->sm_phys->smp_histogram[i];
781		mc->mc_histogram[i + ashift] +=
782		    msp->ms_sm->sm_phys->smp_histogram[i];
783	}
784	mutex_exit(&mg->mg_lock);
785}
786
787void
788metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
789{
790	metaslab_class_t *mc = mg->mg_class;
791	uint64_t ashift = mg->mg_vd->vdev_ashift;
792
793	ASSERT(MUTEX_HELD(&msp->ms_lock));
794	if (msp->ms_sm == NULL)
795		return;
796
797	mutex_enter(&mg->mg_lock);
798	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
799		ASSERT3U(mg->mg_histogram[i + ashift], >=,
800		    msp->ms_sm->sm_phys->smp_histogram[i]);
801		ASSERT3U(mc->mc_histogram[i + ashift], >=,
802		    msp->ms_sm->sm_phys->smp_histogram[i]);
803
804		mg->mg_histogram[i + ashift] -=
805		    msp->ms_sm->sm_phys->smp_histogram[i];
806		mc->mc_histogram[i + ashift] -=
807		    msp->ms_sm->sm_phys->smp_histogram[i];
808	}
809	mutex_exit(&mg->mg_lock);
810}
811
812static void
813metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
814{
815	ASSERT(msp->ms_group == NULL);
816	mutex_enter(&mg->mg_lock);
817	msp->ms_group = mg;
818	msp->ms_weight = 0;
819	avl_add(&mg->mg_metaslab_tree, msp);
820	mutex_exit(&mg->mg_lock);
821
822	mutex_enter(&msp->ms_lock);
823	metaslab_group_histogram_add(mg, msp);
824	mutex_exit(&msp->ms_lock);
825}
826
827static void
828metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
829{
830	mutex_enter(&msp->ms_lock);
831	metaslab_group_histogram_remove(mg, msp);
832	mutex_exit(&msp->ms_lock);
833
834	mutex_enter(&mg->mg_lock);
835	ASSERT(msp->ms_group == mg);
836	avl_remove(&mg->mg_metaslab_tree, msp);
837	msp->ms_group = NULL;
838	mutex_exit(&mg->mg_lock);
839}
840
841static void
842metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
843{
844	/*
845	 * Although in principle the weight can be any value, in
846	 * practice we do not use values in the range [1, 511].
847	 */
848	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
849	ASSERT(MUTEX_HELD(&msp->ms_lock));
850
851	mutex_enter(&mg->mg_lock);
852	ASSERT(msp->ms_group == mg);
853	avl_remove(&mg->mg_metaslab_tree, msp);
854	msp->ms_weight = weight;
855	avl_add(&mg->mg_metaslab_tree, msp);
856	mutex_exit(&mg->mg_lock);
857}
858
859/*
860 * Calculate the fragmentation for a given metaslab group. We can use
861 * a simple average here since all metaslabs within the group must have
862 * the same size. The return value will be a value between 0 and 100
863 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
864 * group have a fragmentation metric.
865 */
866uint64_t
867metaslab_group_fragmentation(metaslab_group_t *mg)
868{
869	vdev_t *vd = mg->mg_vd;
870	uint64_t fragmentation = 0;
871	uint64_t valid_ms = 0;
872
873	for (int m = 0; m < vd->vdev_ms_count; m++) {
874		metaslab_t *msp = vd->vdev_ms[m];
875
876		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
877			continue;
878
879		valid_ms++;
880		fragmentation += msp->ms_fragmentation;
881	}
882
883	if (valid_ms <= vd->vdev_ms_count / 2)
884		return (ZFS_FRAG_INVALID);
885
886	fragmentation /= valid_ms;
887	ASSERT3U(fragmentation, <=, 100);
888	return (fragmentation);
889}
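/*
 * Worked example (illustrative): a group of four equally sized metaslabs
 * with fragmentation values 10, 20, 30 and 40 averages to 25%. If only two
 * of the four had a valid metric, the group would report ZFS_FRAG_INVALID
 * because valid_ms (2) is not greater than half the metaslab count.
 */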
890
891/*
892 * Determine if a given metaslab group should skip allocations. A metaslab
893 * group should avoid allocations if its free capacity is less than the
894 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
895 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
896 * that can still handle allocations. If the allocation throttle is enabled
897 * then we skip allocations to devices that have reached their maximum
898 * allocation queue depth unless the selected metaslab group is the only
899 * eligible group remaining.
900 */
901static boolean_t
902metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
903    uint64_t psize)
904{
905	spa_t *spa = mg->mg_vd->vdev_spa;
906	metaslab_class_t *mc = mg->mg_class;
907
908	/*
909	 * We can only consider skipping this metaslab group if it's
910	 * in the normal metaslab class and there are other metaslab
911	 * groups to select from. Otherwise, we always consider it eligible
912	 * for allocations.
913	 */
914	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
915		return (B_TRUE);
916
917	/*
918	 * If the metaslab group's mg_allocatable flag is set (see comments
919	 * in metaslab_group_alloc_update() for more information) and
920	 * the allocation throttle is disabled then allow allocations to this
921	 * device. However, if the allocation throttle is enabled then
922	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
923	 * to determine if we should allow allocations to this metaslab group.
924	 * If all metaslab groups are no longer considered allocatable
925	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
926	 * gang block size then we allow allocations on this metaslab group
927	 * regardless of the mg_allocatable or throttle settings.
928	 */
929	if (mg->mg_allocatable) {
930		metaslab_group_t *mgp;
931		int64_t qdepth;
932		uint64_t qmax = mg->mg_max_alloc_queue_depth;
933
934		if (!mc->mc_alloc_throttle_enabled)
935			return (B_TRUE);
936
937		/*
938		 * If this metaslab group does not have any free space, then
939		 * there is no point in looking further.
940		 */
941		if (mg->mg_no_free_space)
942			return (B_FALSE);
943
944		qdepth = refcount_count(&mg->mg_alloc_queue_depth);
945
946		/*
947		 * If this metaslab group is below its qmax or it's
948		 * the only allocatable metaslab group, then attempt
949		 * to allocate from it.
950		 */
951		if (qdepth < qmax || mc->mc_alloc_groups == 1)
952			return (B_TRUE);
953		ASSERT3U(mc->mc_alloc_groups, >, 1);
954
955		/*
956		 * Since this metaslab group is at or over its qmax, we
957		 * need to determine if there are metaslab groups after this
958		 * one that might be able to handle this allocation. This is
959		 * racy since we can't hold the locks for all metaslab
960		 * groups at the same time when we make this check.
961		 */
962		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
963			qmax = mgp->mg_max_alloc_queue_depth;
964
965			qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
966
967			/*
968			 * If there is another metaslab group that
969			 * might be able to handle the allocation, then
970			 * we return false so that we skip this group.
971			 */
972			if (qdepth < qmax && !mgp->mg_no_free_space)
973				return (B_FALSE);
974		}
975
976		/*
977		 * We didn't find another group to handle the allocation
978		 * so we can't skip this metaslab group even though
979		 * we are at or over our qmax.
980		 */
981		return (B_TRUE);
982
983	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
984		return (B_TRUE);
985	}
986	return (B_FALSE);
987}
988
989/*
990 * ==========================================================================
991 * Range tree callbacks
992 * ==========================================================================
993 */
994
995/*
996 * Comparison function for the private size-ordered tree. Tree is sorted
997 * by size, larger sizes at the end of the tree.
998 */
999static int
1000metaslab_rangesize_compare(const void *x1, const void *x2)
1001{
1002	const range_seg_t *r1 = x1;
1003	const range_seg_t *r2 = x2;
1004	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1005	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1006
1007	if (rs_size1 < rs_size2)
1008		return (-1);
1009	if (rs_size1 > rs_size2)
1010		return (1);
1011
1012	if (r1->rs_start < r2->rs_start)
1013		return (-1);
1014
1015	if (r1->rs_start > r2->rs_start)
1016		return (1);
1017
1018	return (0);
1019}
1020
1021/*
1022 * Create any block allocator specific components. The current allocators
1023 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
1024 */
1025static void
1026metaslab_rt_create(range_tree_t *rt, void *arg)
1027{
1028	metaslab_t *msp = arg;
1029
1030	ASSERT3P(rt->rt_arg, ==, msp);
1031	ASSERT(msp->ms_tree == NULL);
1032
1033	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
1034	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1035}
1036
1037/*
1038 * Destroy the block allocator specific components.
1039 */
1040static void
1041metaslab_rt_destroy(range_tree_t *rt, void *arg)
1042{
1043	metaslab_t *msp = arg;
1044
1045	ASSERT3P(rt->rt_arg, ==, msp);
1046	ASSERT3P(msp->ms_tree, ==, rt);
1047	ASSERT0(avl_numnodes(&msp->ms_size_tree));
1048
1049	avl_destroy(&msp->ms_size_tree);
1050}
1051
1052static void
1053metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
1054{
1055	metaslab_t *msp = arg;
1056
1057	ASSERT3P(rt->rt_arg, ==, msp);
1058	ASSERT3P(msp->ms_tree, ==, rt);
1059	VERIFY(!msp->ms_condensing);
1060	avl_add(&msp->ms_size_tree, rs);
1061}
1062
1063static void
1064metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
1065{
1066	metaslab_t *msp = arg;
1067
1068	ASSERT3P(rt->rt_arg, ==, msp);
1069	ASSERT3P(msp->ms_tree, ==, rt);
1070	VERIFY(!msp->ms_condensing);
1071	avl_remove(&msp->ms_size_tree, rs);
1072}
1073
1074static void
1075metaslab_rt_vacate(range_tree_t *rt, void *arg)
1076{
1077	metaslab_t *msp = arg;
1078
1079	ASSERT3P(rt->rt_arg, ==, msp);
1080	ASSERT3P(msp->ms_tree, ==, rt);
1081
1082	/*
1083	 * Normally one would walk the tree freeing nodes along the way.
1084	 * Since the nodes are shared with the range trees we can avoid
1085	 * walking all nodes and just reinitialize the avl tree. The nodes
1086	 * will be freed by the range tree, so we don't want to free them here.
1087	 */
1088	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
1089	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1090}
1091
1092static range_tree_ops_t metaslab_rt_ops = {
1093	metaslab_rt_create,
1094	metaslab_rt_destroy,
1095	metaslab_rt_add,
1096	metaslab_rt_remove,
1097	metaslab_rt_vacate
1098};
1099
1100/*
1101 * ==========================================================================
1102 * Metaslab block operations
1103 * ==========================================================================
1104 */
1105
1106/*
1107 * Return the maximum contiguous segment within the metaslab.
1108 */
1109uint64_t
1110metaslab_block_maxsize(metaslab_t *msp)
1111{
1112	avl_tree_t *t = &msp->ms_size_tree;
1113	range_seg_t *rs;
1114
1115	if (t == NULL || (rs = avl_last(t)) == NULL)
1116		return (0ULL);
1117
1118	return (rs->rs_end - rs->rs_start);
1119}
1120
1121uint64_t
1122metaslab_block_alloc(metaslab_t *msp, uint64_t size)
1123{
1124	uint64_t start;
1125	range_tree_t *rt = msp->ms_tree;
1126
1127	VERIFY(!msp->ms_condensing);
1128
1129	start = msp->ms_ops->msop_alloc(msp, size);
1130	if (start != -1ULL) {
1131		vdev_t *vd = msp->ms_group->mg_vd;
1132
1133		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
1134		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
1135		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
1136		range_tree_remove(rt, start, size);
1137	}
1138	return (start);
1139}
1140
1141/*
1142 * ==========================================================================
1143 * Common allocator routines
1144 * ==========================================================================
1145 */
1146
1147/*
1148 * This is a helper function that can be used by the allocator to find
1149 * a suitable block to allocate. This will search the specified AVL
1150 * tree looking for a block that matches the specified criteria.
1151 */
1152static uint64_t
1153metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1154    uint64_t align)
1155{
1156	range_seg_t *rs, rsearch;
1157	avl_index_t where;
1158
1159	rsearch.rs_start = *cursor;
1160	rsearch.rs_end = *cursor + size;
1161
1162	rs = avl_find(t, &rsearch, &where);
1163	if (rs == NULL)
1164		rs = avl_nearest(t, where, AVL_AFTER);
1165
1166	while (rs != NULL) {
1167		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1168
1169		if (offset + size <= rs->rs_end) {
1170			*cursor = offset + size;
1171			return (offset);
1172		}
1173		rs = AVL_NEXT(t, rs);
1174	}
1175
1176	/*
1177	 * If we know we've searched the whole map (*cursor == 0), give up.
1178	 * Otherwise, reset the cursor to the beginning and try again.
1179	 */
1180	if (*cursor == 0)
1181		return (-1ULL);
1182
1183	*cursor = 0;
1184	return (metaslab_block_picker(t, cursor, size, align));
1185}
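/*
 * Worked example (illustrative): with *cursor at 0x10000, a request for a
 * 0x2000-byte block aligned to 0x2000 scans free segments from that offset;
 * if the first candidate is [0x13000, 0x18000), the block is placed at
 * P2ROUNDUP(0x13000, 0x2000) = 0x14000 and the cursor advances to 0x16000.
 * If nothing fits before the end of the tree, the cursor is reset to 0 and
 * the search is retried once from the beginning.
 */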
1186
1187/*
1188 * ==========================================================================
1189 * The first-fit block allocator
1190 * ==========================================================================
1191 */
1192static uint64_t
1193metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1194{
1195	/*
1196	 * Find the largest power of 2 block size that evenly divides the
1197	 * requested size. This is used to try to allocate blocks with similar
1198	 * alignment from the same area of the metaslab (i.e. same cursor
1199	 * bucket), but it does not guarantee that allocations of other sizes
1200	 * will not exist in the same region.
1201	 */
1202	uint64_t align = size & -size;
1203	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1204	avl_tree_t *t = &msp->ms_tree->rt_root;
1205
1206	return (metaslab_block_picker(t, cursor, size, align));
1207}
1208
1209static metaslab_ops_t metaslab_ff_ops = {
1210	metaslab_ff_alloc
1211};
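/*
 * Worked example (illustrative): "size & -size" isolates the lowest set bit
 * of the request, so a 24K (0x6000) allocation uses an 8K (0x2000) alignment
 * and the cursor in ms_lbas[highbit64(0x2000) - 1] = ms_lbas[13], while a
 * 128K request uses a 128K alignment and its own cursor bucket.
 */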
1212
1213/*
1214 * ==========================================================================
1215 * Dynamic block allocator -
1216 * Uses the first fit allocation scheme until space gets low and then
1217 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1218 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
1219 * ==========================================================================
1220 */
1221static uint64_t
1222metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1223{
1224	/*
1225	 * Find the largest power of 2 block size that evenly divides the
1226	 * requested size. This is used to try to allocate blocks with similar
1227	 * alignment from the same area of the metaslab (i.e. same cursor
1228	 * bucket), but it does not guarantee that allocations of other sizes
1229	 * will not exist in the same region.
1230	 */
1231	uint64_t align = size & -size;
1232	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1233	range_tree_t *rt = msp->ms_tree;
1234	avl_tree_t *t = &rt->rt_root;
1235	uint64_t max_size = metaslab_block_maxsize(msp);
1236	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1237
1238	ASSERT(MUTEX_HELD(&msp->ms_lock));
1239	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1240
1241	if (max_size < size)
1242		return (-1ULL);
1243
1244	/*
1245	 * If we're running low on space switch to using the size
1246	 * sorted AVL tree (best-fit).
1247	 */
1248	if (max_size < metaslab_df_alloc_threshold ||
1249	    free_pct < metaslab_df_free_pct) {
1250		t = &msp->ms_size_tree;
1251		*cursor = 0;
1252	}
1253
1254	return (metaslab_block_picker(t, cursor, size, 1ULL));
1255}
1256
1257static metaslab_ops_t metaslab_df_ops = {
1258	metaslab_df_alloc
1259};
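/*
 * Worked example (illustrative): with the defaults above
 * (metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE = 128K and
 * metaslab_df_free_pct = 4), a metaslab whose largest free segment has
 * dropped below 128K, or whose free space is under 4% of its size, abandons
 * the offset-ordered first-fit search and allocates best-fit from the
 * size-ordered tree instead.
 */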
1260
1261/*
1262 * ==========================================================================
1263 * Cursor fit block allocator -
1264 * Select the largest region in the metaslab, set the cursor to the beginning
1265 * of the range and the cursor_end to the end of the range. As allocations
1266 * are made advance the cursor. Continue allocating from the cursor until
1267 * the range is exhausted and then find a new range.
1268 * ==========================================================================
1269 */
1270static uint64_t
1271metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1272{
1273	range_tree_t *rt = msp->ms_tree;
1274	avl_tree_t *t = &msp->ms_size_tree;
1275	uint64_t *cursor = &msp->ms_lbas[0];
1276	uint64_t *cursor_end = &msp->ms_lbas[1];
1277	uint64_t offset = 0;
1278
1279	ASSERT(MUTEX_HELD(&msp->ms_lock));
1280	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1281
1282	ASSERT3U(*cursor_end, >=, *cursor);
1283
1284	if ((*cursor + size) > *cursor_end) {
1285		range_seg_t *rs;
1286
1287		rs = avl_last(&msp->ms_size_tree);
1288		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1289			return (-1ULL);
1290
1291		*cursor = rs->rs_start;
1292		*cursor_end = rs->rs_end;
1293	}
1294
1295	offset = *cursor;
1296	*cursor += size;
1297
1298	return (offset);
1299}
1300
1301static metaslab_ops_t metaslab_cf_ops = {
1302	metaslab_cf_alloc
1303};
1304
1305/*
1306 * ==========================================================================
1307 * New dynamic fit allocator -
1308 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1309 * contiguous blocks. If no region is found then just use the largest segment
1310 * that remains.
1311 * ==========================================================================
1312 */
1313
1314/*
1315 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1316 * to request from the allocator.
1317 */
1318uint64_t metaslab_ndf_clump_shift = 4;
1319
1320static uint64_t
1321metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1322{
1323	avl_tree_t *t = &msp->ms_tree->rt_root;
1324	avl_index_t where;
1325	range_seg_t *rs, rsearch;
1326	uint64_t hbit = highbit64(size);
1327	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1328	uint64_t max_size = metaslab_block_maxsize(msp);
1329
1330	ASSERT(MUTEX_HELD(&msp->ms_lock));
1331	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1332
1333	if (max_size < size)
1334		return (-1ULL);
1335
1336	rsearch.rs_start = *cursor;
1337	rsearch.rs_end = *cursor + size;
1338
1339	rs = avl_find(t, &rsearch, &where);
1340	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
1341		t = &msp->ms_size_tree;
1342
1343		rsearch.rs_start = 0;
1344		rsearch.rs_end = MIN(max_size,
1345		    1ULL << (hbit + metaslab_ndf_clump_shift));
1346		rs = avl_find(t, &rsearch, &where);
1347		if (rs == NULL)
1348			rs = avl_nearest(t, where, AVL_AFTER);
1349		ASSERT(rs != NULL);
1350	}
1351
1352	if ((rs->rs_end - rs->rs_start) >= size) {
1353		*cursor = rs->rs_start + size;
1354		return (rs->rs_start);
1355	}
1356	return (-1ULL);
1357}
1358
1359static metaslab_ops_t metaslab_ndf_ops = {
1360	metaslab_ndf_alloc
1361};
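/*
 * Worked example (illustrative): for an 8K request, hbit = highbit64(8K) = 14,
 * so if the cursor region cannot satisfy it the allocator searches the
 * size-ordered tree for a segment of up to MIN(max_size, 1 << (14 + 4)) =
 * 256K, leaving room for a clump of further allocations of the same size.
 */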
1362
1363metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1364
1365/*
1366 * ==========================================================================
1367 * Metaslabs
1368 * ==========================================================================
1369 */
1370
1371/*
1372 * Wait for any in-progress metaslab loads to complete.
1373 */
1374void
1375metaslab_load_wait(metaslab_t *msp)
1376{
1377	ASSERT(MUTEX_HELD(&msp->ms_lock));
1378
1379	while (msp->ms_loading) {
1380		ASSERT(!msp->ms_loaded);
1381		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1382	}
1383}
1384
1385int
1386metaslab_load(metaslab_t *msp)
1387{
1388	int error = 0;
1389
1390	ASSERT(MUTEX_HELD(&msp->ms_lock));
1391	ASSERT(!msp->ms_loaded);
1392	ASSERT(!msp->ms_loading);
1393
1394	msp->ms_loading = B_TRUE;
1395
1396	/*
1397	 * If the space map has not been allocated yet, then treat
1398	 * all the space in the metaslab as free and add it to the
1399	 * ms_tree.
1400	 */
1401	if (msp->ms_sm != NULL)
1402		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
1403	else
1404		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
1405
1406	msp->ms_loaded = (error == 0);
1407	msp->ms_loading = B_FALSE;
1408
1409	if (msp->ms_loaded) {
1410		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1411			range_tree_walk(msp->ms_defertree[t],
1412			    range_tree_remove, msp->ms_tree);
1413		}
1414	}
1415	cv_broadcast(&msp->ms_load_cv);
1416	return (error);
1417}
1418
1419void
1420metaslab_unload(metaslab_t *msp)
1421{
1422	ASSERT(MUTEX_HELD(&msp->ms_lock));
1423	range_tree_vacate(msp->ms_tree, NULL, NULL);
1424	msp->ms_loaded = B_FALSE;
1425	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1426}
1427
1428int
1429metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1430    metaslab_t **msp)
1431{
1432	vdev_t *vd = mg->mg_vd;
1433	objset_t *mos = vd->vdev_spa->spa_meta_objset;
1434	metaslab_t *ms;
1435	int error;
1436
1437	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1438	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1439	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1440	ms->ms_id = id;
1441	ms->ms_start = id << vd->vdev_ms_shift;
1442	ms->ms_size = 1ULL << vd->vdev_ms_shift;
1443
1444	/*
1445	 * We only open space map objects that already exist. All others
1446	 * will be opened when we finally allocate an object for it.
1447	 */
1448	if (object != 0) {
1449		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1450		    ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
1451
1452		if (error != 0) {
1453			kmem_free(ms, sizeof (metaslab_t));
1454			return (error);
1455		}
1456
1457		ASSERT(ms->ms_sm != NULL);
1458	}
1459
1460	/*
1461	 * We create the main range tree here, but we don't create the
1462	 * alloctree and freetree until metaslab_sync_done().  This serves
1463	 * two purposes: it allows metaslab_sync_done() to detect the
1464	 * addition of new space; and for debugging, it ensures that we'd
1465	 * data fault on any attempt to use this metaslab before it's ready.
1466	 */
1467	ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
1468	metaslab_group_add(mg, ms);
1469
1470	ms->ms_fragmentation = metaslab_fragmentation(ms);
1471	ms->ms_ops = mg->mg_class->mc_ops;
1472
1473	/*
1474	 * If we're opening an existing pool (txg == 0) or creating
1475	 * a new one (txg == TXG_INITIAL), all space is available now.
1476	 * If we're adding space to an existing pool, the new space
1477	 * does not become available until after this txg has synced.
1478	 */
1479	if (txg <= TXG_INITIAL)
1480		metaslab_sync_done(ms, 0);
1481
1482	/*
1483	 * If metaslab_debug_load is set and we're initializing a metaslab
1484 * that has an allocated space_map object then load its space
1485 * map so that we can verify frees.
1486	 */
1487	if (metaslab_debug_load && ms->ms_sm != NULL) {
1488		mutex_enter(&ms->ms_lock);
1489		VERIFY0(metaslab_load(ms));
1490		mutex_exit(&ms->ms_lock);
1491	}
1492
1493	if (txg != 0) {
1494		vdev_dirty(vd, 0, NULL, txg);
1495		vdev_dirty(vd, VDD_METASLAB, ms, txg);
1496	}
1497
1498	*msp = ms;
1499
1500	return (0);
1501}
1502
1503void
1504metaslab_fini(metaslab_t *msp)
1505{
1506	metaslab_group_t *mg = msp->ms_group;
1507
1508	metaslab_group_remove(mg, msp);
1509
1510	mutex_enter(&msp->ms_lock);
1511
1512	VERIFY(msp->ms_group == NULL);
1513	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1514	    0, -msp->ms_size);
1515	space_map_close(msp->ms_sm);
1516
1517	metaslab_unload(msp);
1518	range_tree_destroy(msp->ms_tree);
1519
1520	for (int t = 0; t < TXG_SIZE; t++) {
1521		range_tree_destroy(msp->ms_alloctree[t]);
1522		range_tree_destroy(msp->ms_freetree[t]);
1523	}
1524
1525	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1526		range_tree_destroy(msp->ms_defertree[t]);
1527	}
1528
1529	ASSERT0(msp->ms_deferspace);
1530
1531	mutex_exit(&msp->ms_lock);
1532	cv_destroy(&msp->ms_load_cv);
1533	mutex_destroy(&msp->ms_lock);
1534
1535	kmem_free(msp, sizeof (metaslab_t));
1536}
1537
1538#define	FRAGMENTATION_TABLE_SIZE	17
1539
1540/*
1541 * This table defines a segment size based fragmentation metric that will
1542 * allow each metaslab to derive its own fragmentation value. This is done
1543 * by calculating the space in each bucket of the spacemap histogram and
1544 * multiplying that by the fragmentation metric in this table. Doing
1545 * this for all buckets and dividing it by the total amount of free
1546 * space in this metaslab (i.e. the total free space in all buckets) gives
1547 * us the fragmentation metric. This means that a high fragmentation metric
1548 * equates to most of the free space being comprised of small segments.
1549 * Conversely, if the metric is low, then most of the free space is in
1550 * large segments. A 10% change in fragmentation equates to approximately
1551 * double the number of segments.
1552 *
1553 * This table defines 0% fragmented space using 16MB segments. Testing has
1554 * shown that segments that are greater than or equal to 16MB do not suffer
1555 * from drastic performance problems. Using this value, we derive the rest
1556 * of the table. Since the fragmentation value is never stored on disk, it
1557 * is possible to change these calculations in the future.
1558 */
1559int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1560	100,	/* 512B	*/
1561	100,	/* 1K	*/
1562	98,	/* 2K	*/
1563	95,	/* 4K	*/
1564	90,	/* 8K	*/
1565	80,	/* 16K	*/
1566	70,	/* 32K	*/
1567	60,	/* 64K	*/
1568	50,	/* 128K	*/
1569	40,	/* 256K	*/
1570	30,	/* 512K	*/
1571	20,	/* 1M	*/
1572	15,	/* 2M	*/
1573	10,	/* 4M	*/
1574	5,	/* 8M	*/
1575	0	/* 16M	*/
1576};
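/*
 * Worked example (illustrative): a metaslab with 100G free, 75G of it in
 * 512K segments (factor 30) and 25G in segments of 16M or more (factor 0),
 * reports (75 * 30 + 25 * 0) / 100 = 22% fragmentation after integer
 * division, whereas the same 100G held entirely in 8K segments would
 * report 90%.
 */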
1577
1578/*
1579 * Calculate the metaslab's fragmentation metric. A return value
1580 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1581 * not support this metric. Otherwise, the return value should be in the
1582 * range [0, 100].
1583 */
1584static uint64_t
1585metaslab_fragmentation(metaslab_t *msp)
1586{
1587	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1588	uint64_t fragmentation = 0;
1589	uint64_t total = 0;
1590	boolean_t feature_enabled = spa_feature_is_enabled(spa,
1591	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
1592
1593	if (!feature_enabled)
1594		return (ZFS_FRAG_INVALID);
1595
1596	/*
1597	 * A null space map means that the entire metaslab is free
1598	 * and thus is not fragmented.
1599	 */
1600	if (msp->ms_sm == NULL)
1601		return (0);
1602
1603	/*
1604	 * If this metaslab's space_map has not been upgraded, flag it
1605	 * so that we upgrade next time we encounter it.
1606	 */
1607	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1608		uint64_t txg = spa_syncing_txg(spa);
1609		vdev_t *vd = msp->ms_group->mg_vd;
1610
1611		if (spa_writeable(spa)) {
1612			msp->ms_condense_wanted = B_TRUE;
1613			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1614			spa_dbgmsg(spa, "txg %llu, requesting force condense: "
1615			    "msp %p, vd %p", txg, msp, vd);
1616		}
1617		return (ZFS_FRAG_INVALID);
1618	}
1619
1620	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1621		uint64_t space = 0;
1622		uint8_t shift = msp->ms_sm->sm_shift;
1623		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1624		    FRAGMENTATION_TABLE_SIZE - 1);
1625
1626		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1627			continue;
1628
1629		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1630		total += space;
1631
1632		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1633		fragmentation += space * zfs_frag_table[idx];
1634	}
1635
1636	if (total > 0)
1637		fragmentation /= total;
1638	ASSERT3U(fragmentation, <=, 100);
1639	return (fragmentation);
1640}
1641
1642/*
1643 * Compute a weight -- a selection preference value -- for the given metaslab.
1644 * This is based on the amount of free space, the level of fragmentation,
1645 * the LBA range, and whether the metaslab is loaded.
1646 */
1647static uint64_t
1648metaslab_weight(metaslab_t *msp)
1649{
1650	metaslab_group_t *mg = msp->ms_group;
1651	vdev_t *vd = mg->mg_vd;
1652	uint64_t weight, space;
1653
1654	ASSERT(MUTEX_HELD(&msp->ms_lock));
1655
1656	/*
1657	 * This vdev is in the process of being removed so there is nothing
1658	 * for us to do here.
1659	 */
1660	if (vd->vdev_removing) {
1661		ASSERT0(space_map_allocated(msp->ms_sm));
1662		ASSERT0(vd->vdev_ms_shift);
1663		return (0);
1664	}
1665
1666	/*
1667	 * The baseline weight is the metaslab's free space.
1668	 */
1669	space = msp->ms_size - space_map_allocated(msp->ms_sm);
1670
1671	msp->ms_fragmentation = metaslab_fragmentation(msp);
1672	if (metaslab_fragmentation_factor_enabled &&
1673	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1674		/*
1675		 * Use the fragmentation information to inversely scale
1676		 * down the baseline weight. We need to ensure that we
1677		 * don't exclude this metaslab completely when it's 100%
1678		 * fragmented. To avoid this we reduce the fragmented value
1679		 * by 1.
1680		 */
1681		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1682
1683		/*
1684		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1685		 * this metaslab again. The fragmentation metric may have
1686		 * decreased the space to something smaller than
1687		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1688		 * so that we can consume any remaining space.
1689		 */
1690		if (space > 0 && space < SPA_MINBLOCKSIZE)
1691			space = SPA_MINBLOCKSIZE;
1692	}
1693	weight = space;
1694
1695	/*
1696	 * Modern disks have uniform bit density and constant angular velocity.
1697	 * Therefore, the outer recording zones are faster (higher bandwidth)
1698	 * than the inner zones by the ratio of outer to inner track diameter,
1699	 * which is typically around 2:1.  We account for this by assigning
1700	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1701	 * In effect, this means that we'll select the metaslab with the most
1702	 * free bandwidth rather than simply the one with the most free space.
1703	 */
1704	if (metaslab_lba_weighting_enabled) {
1705		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1706		ASSERT(weight >= space && weight <= 2 * space);
1707	}
1708
1709	/*
1710	 * If this metaslab is one we're actively using, adjust its
1711	 * weight to make it preferable to any inactive metaslab so
1712	 * we'll polish it off. If the fragmentation on this metaslab
1713 * has exceeded our threshold, then don't mark it active.
1714	 */
1715	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1716	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1717		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1718	}
1719
1720	return (weight);
1721}
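/*
 * Worked example (illustrative): on a vdev with 200 metaslabs, a metaslab
 * with 10G free gets a baseline weight of 10G; the LBA weighting above then
 * scales metaslab 0 to 2 * 10G - (0 * 10G) / 200 = 20G and metaslab 199 to
 * roughly 10.05G, so outer (faster) metaslabs with equal free space sort
 * ahead of inner ones.
 */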
1722
1723static int
1724metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1725{
1726	ASSERT(MUTEX_HELD(&msp->ms_lock));
1727
1728	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1729		metaslab_load_wait(msp);
1730		if (!msp->ms_loaded) {
1731			int error = metaslab_load(msp);
1732			if (error) {
1733				metaslab_group_sort(msp->ms_group, msp, 0);
1734				return (error);
1735			}
1736		}
1737
1738		metaslab_group_sort(msp->ms_group, msp,
1739		    msp->ms_weight | activation_weight);
1740	}
1741	ASSERT(msp->ms_loaded);
1742	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1743
1744	return (0);
1745}
1746
1747static void
1748metaslab_passivate(metaslab_t *msp, uint64_t size)
1749{
1750	/*
1751	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1752	 * this metaslab again.  In that case, it had better be empty,
1753	 * or we would be leaving space on the table.
1754	 */
1755	ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1756	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1757	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1758}
1759
1760static void
1761metaslab_preload(void *arg)
1762{
1763	metaslab_t *msp = arg;
1764	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1765
1766	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
1767
1768	mutex_enter(&msp->ms_lock);
1769	metaslab_load_wait(msp);
1770	if (!msp->ms_loaded)
1771		(void) metaslab_load(msp);
1772
1773	/*
1774	 * Set the ms_access_txg value so that we don't unload it right away.
1775	 */
1776	msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
1777	mutex_exit(&msp->ms_lock);
1778}
1779
1780static void
1781metaslab_group_preload(metaslab_group_t *mg)
1782{
1783	spa_t *spa = mg->mg_vd->vdev_spa;
1784	metaslab_t *msp;
1785	avl_tree_t *t = &mg->mg_metaslab_tree;
1786	int m = 0;
1787
1788	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1789		taskq_wait(mg->mg_taskq);
1790		return;
1791	}
1792
1793	mutex_enter(&mg->mg_lock);
1794	/*
1795	 * Load the next potential metaslabs
1796	 */
1797	msp = avl_first(t);
1798	while (msp != NULL) {
1799		metaslab_t *msp_next = AVL_NEXT(t, msp);
1800
1801		/*
1802		 * We preload at most metaslab_preload_limit metaslabs. If a
1803		 * metaslab is being forced to condense we preload it anyway,
1804		 * which ensures that the forced condense happens in the next
1805		 * txg.
1806		 */
1807		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
1808			msp = msp_next;
1809			continue;
1810		}
1811
1812		/*
1813		 * We must drop the metaslab group lock here to preserve
1814		 * lock ordering with the ms_lock (when grabbing both
1815		 * the mg_lock and the ms_lock, the ms_lock must be taken
1816		 * first).  As a result, it is possible that the ordering
1817		 * of the metaslabs within the avl tree may change before
1818		 * we reacquire the lock. The metaslab cannot be removed from
1819		 * the tree while we're in syncing context so it is safe to
1820		 * drop the mg_lock here. If the metaslabs are reordered
1821		 * nothing will break -- we just may end up loading a
1822		 * less than optimal one.
1823		 */
1824		mutex_exit(&mg->mg_lock);
1825		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1826		    msp, TQ_SLEEP) != 0);
1827		mutex_enter(&mg->mg_lock);
1828		msp = msp_next;
1829	}
1830	mutex_exit(&mg->mg_lock);
1831}
1832
1833/*
1834 * Determine if the space map's on-disk footprint is past our tolerance
1835 * for inefficiency. We would like to use the following criteria to make
1836 * our decision:
1837 *
1838 * 1. The size of the space map object should not dramatically increase as a
1839 * result of writing out the free space range tree.
1840 *
1841 * 2. The on-disk space map representation is at least zfs_condense_pct/100
1842 * times the size of its minimal form, derived from the in-core free range
1843 * tree (i.e. zfs_condense_pct = 110 and minimal = 1MB means on-disk >= 1.1MB).
1844 *
1845 * 3. The on-disk size of the space map should actually decrease.
1846 *
1847 * Checking the first condition is tricky since we don't want to walk
1848 * the entire AVL tree calculating the estimated on-disk size. Instead we
1849 * use the size-ordered range tree in the metaslab and calculate the
1850 * size required to write out the largest segment in our free tree. If the
1851 * size required to represent that segment on disk is larger than the space
1852 * map object then we avoid condensing this map.
1853 *
1854 * To determine the second criterion we use a best-case estimate and assume
1855 * each segment can be represented on-disk as a single 64-bit entry. We refer
1856 * to this best-case estimate as the space map's minimal form.
1857 *
1858 * Unfortunately, we cannot compute the on-disk size of the space map in this
1859 * context because we cannot accurately compute the effects of compression, etc.
1860 * Instead, we apply the heuristic described in the block comment for
1861 * zfs_metaslab_condense_block_threshold - we only condense if the space used
1862 * is greater than a threshold number of blocks.
1863 */
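/*
 * Illustrative example of the checks below (assumed numbers): with 10,000
 * segments in the free tree, the minimal form is 10,000 * 8 = 80,000 bytes.
 * With zfs_condense_pct = 200 we condense only if the on-disk space map is
 * at least 160,000 bytes, is at least as large as the on-disk size of the
 * largest single free segment, and spans more than
 * zfs_metaslab_condense_block_threshold blocks.
 */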
1864static boolean_t
1865metaslab_should_condense(metaslab_t *msp)
1866{
1867	space_map_t *sm = msp->ms_sm;
1868	range_seg_t *rs;
1869	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
1870	dmu_object_info_t doi;
1871	uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
1872
1873	ASSERT(MUTEX_HELD(&msp->ms_lock));
1874	ASSERT(msp->ms_loaded);
1875
1876	/*
1877	 * Use the ms_size_tree range tree, which is ordered by size, to
1878	 * obtain the largest segment in the free tree. We always condense
1879	 * metaslabs that are empty and metaslabs for which a condense
1880	 * request has been made.
1881	 */
1882	rs = avl_last(&msp->ms_size_tree);
1883	if (rs == NULL || msp->ms_condense_wanted)
1884		return (B_TRUE);
1885
1886	/*
1887	 * Calculate the number of 64-bit entries this segment would
1888	 * require when written to disk. If this single segment would be
1889	 * larger on-disk than the entire current on-disk structure, then
1890	 * clearly condensing will increase the on-disk structure size.
1891	 */
1892	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1893	entries = size / (MIN(size, SM_RUN_MAX));
1894	segsz = entries * sizeof (uint64_t);
1895
1896	optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
1897	object_size = space_map_length(msp->ms_sm);
1898
1899	dmu_object_info_from_db(sm->sm_dbuf, &doi);
1900	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
1901
1902	return (segsz <= object_size &&
1903	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
1904	    object_size > zfs_metaslab_condense_block_threshold * record_size);
1905}
1906
1907/*
1908 * Condense the on-disk space map representation to its minimized form.
1909 * The minimized form consists of a small number of allocations followed by
1910 * the entries of the free range tree.
1911 */
1912static void
1913metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1914{
1915	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1916	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1917	range_tree_t *condense_tree;
1918	space_map_t *sm = msp->ms_sm;
1919
1920	ASSERT(MUTEX_HELD(&msp->ms_lock));
1921	ASSERT3U(spa_sync_pass(spa), ==, 1);
1922	ASSERT(msp->ms_loaded);
1923
1924
1925	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
1926	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
1927	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
1928	    msp->ms_group->mg_vd->vdev_spa->spa_name,
1929	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
1930	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
1931
1932	msp->ms_condense_wanted = B_FALSE;
1933
1934	/*
1935	 * Create a range tree that is 100% allocated. We remove segments
1936	 * that have been freed in this txg, any deferred frees that exist,
1937	 * and any allocation in the future. Removing segments should be
1938	 * a relatively inexpensive operation since we expect these trees to
1939	 * have a small number of nodes.
1940	 */
1941	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1942	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1943
1944	/*
1945	 * Remove what's been freed in this txg from the condense_tree.
1946	 * Since we're in sync_pass 1, we know that all the frees from
1947	 * this txg are in the freetree.
1948	 */
1949	range_tree_walk(freetree, range_tree_remove, condense_tree);
1950
1951	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1952		range_tree_walk(msp->ms_defertree[t],
1953		    range_tree_remove, condense_tree);
1954	}
1955
1956	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1957		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1958		    range_tree_remove, condense_tree);
1959	}
1960
1961	/*
1962	 * We're about to drop the metaslab's lock, thus allowing
1963	 * other consumers to change its content. Set the
1964	 * metaslab's ms_condensing flag to ensure that
1965	 * allocations on this metaslab do not occur while we're
1966	 * in the middle of committing it to disk. This is only critical
1967	 * for the ms_tree as all other range trees use per txg
1968	 * views of their content.
1969	 */
1970	msp->ms_condensing = B_TRUE;
1971
1972	mutex_exit(&msp->ms_lock);
1973	space_map_truncate(sm, tx);
1974	mutex_enter(&msp->ms_lock);
1975
1976	/*
1977	 * While we would ideally like to create a space_map representation
1978	 * that consists only of allocation records, doing so can be
1979	 * prohibitively expensive because the in-core free tree can be
1980	 * large, and therefore computationally expensive to subtract
1981	 * from the condense_tree. Instead we sync out two trees, a cheap
1982	 * allocation only tree followed by the in-core free tree. While not
1983	 * optimal, this is typically close to optimal, and much cheaper to
1984	 * compute.
1985	 */
1986	space_map_write(sm, condense_tree, SM_ALLOC, tx);
1987	range_tree_vacate(condense_tree, NULL, NULL);
1988	range_tree_destroy(condense_tree);
1989
1990	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1991	msp->ms_condensing = B_FALSE;
1992}
1993
1994/*
1995 * Write a metaslab to disk in the context of the specified transaction group.
1996 */
1997void
1998metaslab_sync(metaslab_t *msp, uint64_t txg)
1999{
2000	metaslab_group_t *mg = msp->ms_group;
2001	vdev_t *vd = mg->mg_vd;
2002	spa_t *spa = vd->vdev_spa;
2003	objset_t *mos = spa_meta_objset(spa);
2004	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
2005	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
2006	range_tree_t **freed_tree =
2007	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
2008	dmu_tx_t *tx;
2009	uint64_t object = space_map_object(msp->ms_sm);
2010
2011	ASSERT(!vd->vdev_ishole);
2012
2013	/*
2014	 * This metaslab has just been added so there's no work to do now.
2015	 */
2016	if (*freetree == NULL) {
2017		ASSERT3P(alloctree, ==, NULL);
2018		return;
2019	}
2020
2021	ASSERT3P(alloctree, !=, NULL);
2022	ASSERT3P(*freetree, !=, NULL);
2023	ASSERT3P(*freed_tree, !=, NULL);
2024
2025	/*
2026	 * Normally, we don't want to process a metaslab if there
2027	 * are no allocations or frees to perform. However, if the metaslab
2028	 * is being forced to condense we need to let it through.
2029	 */
2030	if (range_tree_space(alloctree) == 0 &&
2031	    range_tree_space(*freetree) == 0 &&
2032	    !msp->ms_condense_wanted)
2033		return;
2034
2035	/*
2036	 * The only state that can actually be changing concurrently with
2037	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
2038	 * be modifying this txg's alloctree, freetree, freed_tree, or
2039	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
2040	 * space_map ASSERTs. We drop it whenever we call into the DMU,
2041	 * because the DMU can call down to us (e.g. via zio_free()) at
2042	 * any time.
2043	 */
2044
2045	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2046
2047	if (msp->ms_sm == NULL) {
2048		uint64_t new_object;
2049
2050		new_object = space_map_alloc(mos, tx);
2051		VERIFY3U(new_object, !=, 0);
2052
2053		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2054		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
2055		    &msp->ms_lock));
2056		ASSERT(msp->ms_sm != NULL);
2057	}
2058
2059	mutex_enter(&msp->ms_lock);
2060
2061	/*
2062	 * Note: metaslab_condense() clears the space_map's histogram.
2063	 * Therefore we must verify and remove this histogram before
2064	 * condensing.
2065	 */
2066	metaslab_group_histogram_verify(mg);
2067	metaslab_class_histogram_verify(mg->mg_class);
2068	metaslab_group_histogram_remove(mg, msp);
2069
2070	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
2071	    metaslab_should_condense(msp)) {
2072		metaslab_condense(msp, txg, tx);
2073	} else {
2074		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
2075		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
2076	}
2077
2078	if (msp->ms_loaded) {
2079		/*
2080		 * When the space map is loaded, we have an accurate
2081		 * histogram in the range tree. This gives us an opportunity
2082		 * to bring the space map's histogram up-to-date so we clear
2083		 * it first before updating it.
2084		 */
2085		space_map_histogram_clear(msp->ms_sm);
2086		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
2087	} else {
2088		/*
2089		 * Since the space map is not loaded we simply update the
2090		 * existing histogram with what was freed in this txg. This
2091		 * means that the on-disk histogram may not have an accurate
2092		 * view of the free space but it's close enough to allow
2093		 * us to make allocation decisions.
2094		 */
2095		space_map_histogram_add(msp->ms_sm, *freetree, tx);
2096	}
2097	metaslab_group_histogram_add(mg, msp);
2098	metaslab_group_histogram_verify(mg);
2099	metaslab_class_histogram_verify(mg->mg_class);
2100
2101	/*
2102	 * For sync pass 1, we avoid traversing this txg's free range tree
2103	 * and instead will just swap the pointers for freetree and
2104	 * freed_tree. We can safely do this since the freed_tree is
2105	 * guaranteed to be empty on the initial pass.
2106	 */
2107	if (spa_sync_pass(spa) == 1) {
2108		range_tree_swap(freetree, freed_tree);
2109	} else {
2110		range_tree_vacate(*freetree, range_tree_add, *freed_tree);
2111	}
2112	range_tree_vacate(alloctree, NULL, NULL);
2113
2114	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2115	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
2116
2117	mutex_exit(&msp->ms_lock);
2118
2119	if (object != space_map_object(msp->ms_sm)) {
2120		object = space_map_object(msp->ms_sm);
2121		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2122		    msp->ms_id, sizeof (uint64_t), &object, tx);
2123	}
2124	dmu_tx_commit(tx);
2125}
2126
2127/*
2128 * Called after a transaction group has completely synced to mark
2129 * all of the metaslab's free space as usable.
2130 */
2131void
2132metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2133{
2134	metaslab_group_t *mg = msp->ms_group;
2135	vdev_t *vd = mg->mg_vd;
2136	range_tree_t **freed_tree;
2137	range_tree_t **defer_tree;
2138	int64_t alloc_delta, defer_delta;
2139
2140	ASSERT(!vd->vdev_ishole);
2141
2142	mutex_enter(&msp->ms_lock);
2143
2144	/*
2145	 * If this metaslab is just becoming available, initialize its
2146	 * alloctrees, freetrees, and defertrees, and add its capacity to
2147	 * the vdev.
2148	 */
2149	if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
2150		for (int t = 0; t < TXG_SIZE; t++) {
2151			ASSERT(msp->ms_alloctree[t] == NULL);
2152			ASSERT(msp->ms_freetree[t] == NULL);
2153
2154			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
2155			    &msp->ms_lock);
2156			msp->ms_freetree[t] = range_tree_create(NULL, msp,
2157			    &msp->ms_lock);
2158		}
2159
2160		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2161			ASSERT(msp->ms_defertree[t] == NULL);
2162
2163			msp->ms_defertree[t] = range_tree_create(NULL, msp,
2164			    &msp->ms_lock);
2165		}
2166
2167		vdev_space_update(vd, 0, 0, msp->ms_size);
2168	}
2169
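	/*
	 * Frees from a given txg are deferred and only returned to the
	 * allocatable ms_tree TXG_DEFER_SIZE txgs later; txg % TXG_DEFER_SIZE
	 * selects the defer slot whose contents are recycled in this txg.
	 */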
2170	freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
2171	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
2172
2173	alloc_delta = space_map_alloc_delta(msp->ms_sm);
2174	defer_delta = range_tree_space(*freed_tree) -
2175	    range_tree_space(*defer_tree);
2176
2177	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2178
2179	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2180	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
2181
2182	/*
2183	 * If there's a metaslab_load() in progress, wait for it to complete
2184	 * so that we have a consistent view of the in-core space map.
2185	 */
2186	metaslab_load_wait(msp);
2187
2188	/*
2189	 * Move the frees from the defer_tree back to the free
2190	 * range tree (if it's loaded). Swap the freed_tree and the
2191	 * defer_tree -- this is safe to do because we've just emptied out
2192	 * the defer_tree.
2193	 */
2194	range_tree_vacate(*defer_tree,
2195	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2196	range_tree_swap(freed_tree, defer_tree);
2197
2198	space_map_update(msp->ms_sm);
2199
2200	msp->ms_deferspace += defer_delta;
2201	ASSERT3S(msp->ms_deferspace, >=, 0);
2202	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2203	if (msp->ms_deferspace != 0) {
2204		/*
2205		 * Keep syncing this metaslab until all deferred frees
2206		 * are back in circulation.
2207		 */
2208		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2209	}
2210
2211	if (msp->ms_loaded && msp->ms_access_txg < txg) {
2212		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2213			VERIFY0(range_tree_space(
2214			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
2215		}
2216
2217		if (!metaslab_debug_unload)
2218			metaslab_unload(msp);
2219	}
2220
2221	metaslab_group_sort(mg, msp, metaslab_weight(msp));
2222	mutex_exit(&msp->ms_lock);
2223}
2224
2225void
2226metaslab_sync_reassess(metaslab_group_t *mg)
2227{
2228	metaslab_group_alloc_update(mg);
2229	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2230
2231	/*
2232	 * Preload the next potential metaslabs
2233	 */
2234	metaslab_group_preload(mg);
2235}
2236
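/*
 * Distance, in bytes (rounded to whole metaslabs), between this metaslab and
 * the given DVA's offset. A DVA on a different vdev is treated as maximally
 * distant (1ULL << 63).
 */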
2237static uint64_t
2238metaslab_distance(metaslab_t *msp, dva_t *dva)
2239{
2240	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2241	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2242	uint64_t start = msp->ms_id;
2243
2244	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2245		return (1ULL << 63);
2246
2247	if (offset < start)
2248		return ((start - offset) << ms_shift);
2249	if (offset > start)
2250		return ((offset - start) << ms_shift);
2251	return (0);
2252}
2253
2254/*
2255 * ==========================================================================
2256 * Metaslab block operations
2257 * ==========================================================================
2258 */
2259
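/*
 * Track in-flight asynchronous allocations per metaslab group by adding and
 * removing a reference on mg_alloc_queue_depth; the allocation throttle
 * consults this count when deciding whether a group should accept additional
 * allocations.
 */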
2260static void
2261metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2262{
2263	if (!(flags & METASLAB_ASYNC_ALLOC) ||
2264	    flags & METASLAB_DONT_THROTTLE)
2265		return;
2266
2267	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2268	if (!mg->mg_class->mc_alloc_throttle_enabled)
2269		return;
2270
2271	(void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2272}
2273
2274void
2275metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2276{
2277	if (!(flags & METASLAB_ASYNC_ALLOC) ||
2278	    flags & METASLAB_DONT_THROTTLE)
2279		return;
2280
2281	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2282	if (!mg->mg_class->mc_alloc_throttle_enabled)
2283		return;
2284
2285	(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2286}
2287
2288void
2289metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2290{
2291#ifdef ZFS_DEBUG
2292	const dva_t *dva = bp->blk_dva;
2293	int ndvas = BP_GET_NDVAS(bp);
2294
2295	for (int d = 0; d < ndvas; d++) {
2296		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2297		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2298		VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2299	}
2300#endif
2301}
2302
2303static uint64_t
2304metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
2305    uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2306{
2307	spa_t *spa = mg->mg_vd->vdev_spa;
2308	metaslab_t *msp = NULL;
2309	uint64_t offset = -1ULL;
2310	avl_tree_t *t = &mg->mg_metaslab_tree;
2311	uint64_t activation_weight;
2312	uint64_t target_distance;
2313	int i;
2314
2315	activation_weight = METASLAB_WEIGHT_PRIMARY;
2316	for (i = 0; i < d; i++) {
2317		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2318			activation_weight = METASLAB_WEIGHT_SECONDARY;
2319			break;
2320		}
2321	}
2322
2323	for (;;) {
2324		boolean_t was_active;
2325
2326		mutex_enter(&mg->mg_lock);
2327		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
2328			if (msp->ms_weight < asize) {
2329				spa_dbgmsg(spa, "%s: failed to meet weight "
2330				    "requirement: vdev %llu, txg %llu, mg %p, "
2331				    "msp %p, asize %llu, "
2332				    "weight %llu", spa_name(spa),
2333				    mg->mg_vd->vdev_id, txg,
2334				    mg, msp, asize, msp->ms_weight);
2335				mutex_exit(&mg->mg_lock);
2336				return (-1ULL);
2337			}
2338
2339			/*
2340			 * If the selected metaslab is condensing, skip it.
2341			 */
2342			if (msp->ms_condensing)
2343				continue;
2344
2345			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2346			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2347				break;
2348
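			/*
			 * For a secondary (ditto) allocation, require this
			 * metaslab to be at least min_distance away from the
			 * DVAs already allocated for this block; a metaslab
			 * with nothing allocated must be 1.5x min_distance
			 * away.
			 */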
2349			target_distance = min_distance +
2350			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
2351			    min_distance >> 1);
2352
2353			for (i = 0; i < d; i++)
2354				if (metaslab_distance(msp, &dva[i]) <
2355				    target_distance)
2356					break;
2357			if (i == d)
2358				break;
2359		}
2360		mutex_exit(&mg->mg_lock);
2361		if (msp == NULL)
2362			return (-1ULL);
2363
2364		mutex_enter(&msp->ms_lock);
2365
2366		/*
2367		 * Ensure that the metaslab we have selected is still
2368		 * capable of handling our request. It's possible that
2369		 * another thread may have changed the weight while we
2370		 * were blocked on the metaslab lock.
2371		 */
2372		if (msp->ms_weight < asize || (was_active &&
2373		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
2374		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
2375			mutex_exit(&msp->ms_lock);
2376			continue;
2377		}
2378
2379		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2380		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
2381			metaslab_passivate(msp,
2382			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2383			mutex_exit(&msp->ms_lock);
2384			continue;
2385		}
2386
2387		if (metaslab_activate(msp, activation_weight) != 0) {
2388			mutex_exit(&msp->ms_lock);
2389			continue;
2390		}
2391
2392		/*
2393		 * If this metaslab is currently condensing then pick again as
2394		 * we can't manipulate this metaslab until it's committed
2395		 * to disk.
2396		 */
2397		if (msp->ms_condensing) {
2398			mutex_exit(&msp->ms_lock);
2399			continue;
2400		}
2401
2402		if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
2403			break;
2404
2405		metaslab_passivate(msp, metaslab_block_maxsize(msp));
2406		mutex_exit(&msp->ms_lock);
2407	}
2408
2409	if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2410		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2411
2412	range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
2413	msp->ms_access_txg = txg + metaslab_unload_delay;
2414
2415	mutex_exit(&msp->ms_lock);
2416	return (offset);
2417}
2418
2419/*
2420 * Allocate a block for the specified i/o.
2421 */
2422static int
2423metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2424    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
2425{
2426	metaslab_group_t *mg, *rotor;
2427	vdev_t *vd;
2428	int dshift = 3;
2429	int all_zero;
2430	int zio_lock = B_FALSE;
2431	boolean_t allocatable;
2432	uint64_t asize;
2433	uint64_t distance;
2434
2435	ASSERT(!DVA_IS_VALID(&dva[d]));
2436
2437	/*
2438	 * For testing, make some blocks above a certain size be gang blocks.
2439	 */
2440	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
2441		return (SET_ERROR(ENOSPC));
2442
2443	/*
2444	 * Start at the rotor and loop through all mgs until we find something.
2445	 * Note that there's no locking on mc_rotor or mc_aliquot because
2446	 * nothing actually breaks if we miss a few updates -- we just won't
2447	 * allocate quite as evenly.  It all balances out over time.
2448	 *
2449	 * If we are doing ditto or log blocks, try to spread them across
2450	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
2451	 * allocated all of our ditto blocks, then try and spread them out on
2452	 * that vdev as much as possible.  If it turns out to not be possible,
2453	 * gradually lower our standards until anything becomes acceptable.
2454	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
2455	 * gives us hope of containing our fault domains to something we're
2456	 * able to reason about.  Otherwise, any two top-level vdev failures
2457	 * will guarantee the loss of data.  With consecutive allocation,
2458	 * only two adjacent top-level vdev failures will result in data loss.
2459	 *
2460	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
2461	 * ourselves on the same vdev as our gang block header.  That
2462	 * way, we can hope for locality in vdev_cache, plus it makes our
2463	 * fault domains something tractable.
2464	 */
2465	if (hintdva) {
2466		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
2467
2468		/*
2469		 * It's possible the vdev we're using as the hint no
2470		 * longer exists (i.e. removed). Consult the rotor when
2471		 * all else fails.
2472		 */
2473		if (vd != NULL) {
2474			mg = vd->vdev_mg;
2475
2476			if (flags & METASLAB_HINTBP_AVOID &&
2477			    mg->mg_next != NULL)
2478				mg = mg->mg_next;
2479		} else {
2480			mg = mc->mc_rotor;
2481		}
2482	} else if (d != 0) {
2483		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
2484		mg = vd->vdev_mg->mg_next;
2485	} else {
2486		mg = mc->mc_rotor;
2487	}
2488
2489	/*
2490	 * If the hint put us into the wrong metaslab class, or into a
2491	 * metaslab group that has been passivated, just follow the rotor.
2492	 */
2493	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
2494		mg = mc->mc_rotor;
2495
2496	rotor = mg;
2497top:
2498	all_zero = B_TRUE;
2499	do {
2500		ASSERT(mg->mg_activation_count == 1);
2501		vd = mg->mg_vd;
2502
2503		/*
2504		 * Don't allocate from faulted devices.
2505		 */
2506		if (zio_lock) {
2507			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
2508			allocatable = vdev_allocatable(vd);
2509			spa_config_exit(spa, SCL_ZIO, FTAG);
2510		} else {
2511			allocatable = vdev_allocatable(vd);
2512		}
2513
2514		/*
2515		 * Determine if the selected metaslab group is eligible
2516		 * for allocations. If we're ganging then don't allow
2517		 * this metaslab group to skip allocations since that would
2518		 * inadvertently return ENOSPC and suspend the pool
2519		 * even though space is still available.
2520		 */
2521		if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) {
2522			allocatable = metaslab_group_allocatable(mg, rotor,
2523			    psize);
2524		}
2525
2526		if (!allocatable)
2527			goto next;
2528
2529		ASSERT(mg->mg_initialized);
2530
2531		/*
2532		 * Avoid writing single-copy data to a failing vdev.
2533		 */
2534		if ((vd->vdev_stat.vs_write_errors > 0 ||
2535		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
2536		    d == 0 && dshift == 3 && vd->vdev_children == 0) {
2537			all_zero = B_FALSE;
2538			goto next;
2539		}
2540
2541		ASSERT(mg->mg_class == mc);
2542
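		/*
		 * The ditto-block spread distance starts at 1/8 of the
		 * vdev's size (dshift == 3) and is halved each time we
		 * retry from "top"; once it shrinks to a single metaslab
		 * it is treated as zero.
		 */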
2543		distance = vd->vdev_asize >> dshift;
2544		if (distance <= (1ULL << vd->vdev_ms_shift))
2545			distance = 0;
2546		else
2547			all_zero = B_FALSE;
2548
2549		asize = vdev_psize_to_asize(vd, psize);
2550		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
2551
2552		uint64_t offset = metaslab_group_alloc(mg, asize, txg,
2553		    distance, dva, d);
2554
2555		mutex_enter(&mg->mg_lock);
2556		if (offset == -1ULL) {
2557			mg->mg_failed_allocations++;
2558			if (asize == SPA_GANGBLOCKSIZE) {
2559				/*
2560				 * This metaslab group was unable to allocate
2561				 * the minimum gang block size so it must be
2562				 * out of space. We must notify the allocation
2563				 * throttle to start skipping allocation
2564				 * attempts to this metaslab group until more
2565				 * space becomes available.
2566				 *
2567				 * Note: this failure cannot be caused by the
2568				 * allocation throttle since the allocation
2569				 * throttle is only responsible for skipping
2570				 * devices and not failing block allocations.
2571				 */
2572				mg->mg_no_free_space = B_TRUE;
2573			}
2574		}
2575		mg->mg_allocations++;
2576		mutex_exit(&mg->mg_lock);
2577
2578		if (offset != -1ULL) {
2579			/*
2580			 * If we've just selected this metaslab group,
2581			 * figure out whether the corresponding vdev is
2582			 * over- or under-used relative to the pool,
2583			 * and set an allocation bias to even it out.
2584			 */
2585			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
2586				vdev_stat_t *vs = &vd->vdev_stat;
2587				int64_t vu, cu;
2588
2589				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
2590				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
2591
2592				/*
2593				 * Calculate how much more or less we should
2594				 * try to allocate from this device during
2595				 * this iteration around the rotor.
2596				 * For example, if a device is 80% full
2597				 * and the pool is 20% full then we should
2598				 * reduce allocations by 60% on this device.
2599				 *
2600				 * mg_bias = (20 - 80) * 512K / 100 = -307K
2601				 *
2602				 * This reduces allocations by 307K for this
2603				 * iteration.
2604				 */
2605				mg->mg_bias = ((cu - vu) *
2606				    (int64_t)mg->mg_aliquot) / 100;
2607			} else if (!metaslab_bias_enabled) {
2608				mg->mg_bias = 0;
2609			}
2610
2611			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
2612			    mg->mg_aliquot + mg->mg_bias) {
2613				mc->mc_rotor = mg->mg_next;
2614				mc->mc_aliquot = 0;
2615			}
2616
2617			DVA_SET_VDEV(&dva[d], vd->vdev_id);
2618			DVA_SET_OFFSET(&dva[d], offset);
2619			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
2620			DVA_SET_ASIZE(&dva[d], asize);
2621
2622			return (0);
2623		}
2624next:
2625		mc->mc_rotor = mg->mg_next;
2626		mc->mc_aliquot = 0;
2627	} while ((mg = mg->mg_next) != rotor);
2628
2629	if (!all_zero) {
2630		dshift++;
2631		ASSERT(dshift < 64);
2632		goto top;
2633	}
2634
2635	if (!allocatable && !zio_lock) {
2636		dshift = 3;
2637		zio_lock = B_TRUE;
2638		goto top;
2639	}
2640
2641	bzero(&dva[d], sizeof (dva_t));
2642
2643	return (SET_ERROR(ENOSPC));
2644}
2645
2646/*
2647 * Free the block represented by DVA in the context of the specified
2648 * transaction group.
2649 */
2650static void
2651metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
2652{
2653	uint64_t vdev = DVA_GET_VDEV(dva);
2654	uint64_t offset = DVA_GET_OFFSET(dva);
2655	uint64_t size = DVA_GET_ASIZE(dva);
2656	vdev_t *vd;
2657	metaslab_t *msp;
2658
2659	ASSERT(DVA_IS_VALID(dva));
2660
2661	if (txg > spa_freeze_txg(spa))
2662		return;
2663
2664	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2665	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
2666		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
2667		    (u_longlong_t)vdev, (u_longlong_t)offset);
2668		ASSERT(0);
2669		return;
2670	}
2671
2672	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2673
2674	if (DVA_GET_GANG(dva))
2675		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2676
2677	mutex_enter(&msp->ms_lock);
2678
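	/*
	 * When "now" is set the block is handed straight back to the in-core
	 * ms_tree, undoing an allocation made in this txg; otherwise it is
	 * queued in this txg's freetree and processed by metaslab_sync().
	 */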
2679	if (now) {
2680		range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
2681		    offset, size);
2682
2683		VERIFY(!msp->ms_condensing);
2684		VERIFY3U(offset, >=, msp->ms_start);
2685		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
2686		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
2687		    msp->ms_size);
2688		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2689		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2690		range_tree_add(msp->ms_tree, offset, size);
2691	} else {
2692		if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
2693			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2694		range_tree_add(msp->ms_freetree[txg & TXG_MASK],
2695		    offset, size);
2696	}
2697
2698	mutex_exit(&msp->ms_lock);
2699}
2700
2701/*
2702 * Intent log support: upon opening the pool after a crash, notify the SPA
2703 * of blocks that the intent log has allocated for immediate write, but
2704 * which are still considered free by the SPA because the last transaction
2705 * group didn't commit yet.
2706 */
2707static int
2708metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
2709{
2710	uint64_t vdev = DVA_GET_VDEV(dva);
2711	uint64_t offset = DVA_GET_OFFSET(dva);
2712	uint64_t size = DVA_GET_ASIZE(dva);
2713	vdev_t *vd;
2714	metaslab_t *msp;
2715	int error = 0;
2716
2717	ASSERT(DVA_IS_VALID(dva));
2718
2719	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2720	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
2721		return (SET_ERROR(ENXIO));
2722
2723	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2724
2725	if (DVA_GET_GANG(dva))
2726		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2727
2728	mutex_enter(&msp->ms_lock);
2729
2730	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
2731		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
2732
2733	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
2734		error = SET_ERROR(ENOENT);
2735
2736	if (error || txg == 0) {	/* txg == 0 indicates dry run */
2737		mutex_exit(&msp->ms_lock);
2738		return (error);
2739	}
2740
2741	VERIFY(!msp->ms_condensing);
2742	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2743	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2744	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
2745	range_tree_remove(msp->ms_tree, offset, size);
2746
2747	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
2748		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2749			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2750		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2751	}
2752
2753	mutex_exit(&msp->ms_lock);
2754
2755	return (0);
2756}
2757
2758/*
2759 * Reserve some allocation slots. The reservation system must be called
2760 * before we call into the allocator. If there aren't any available slots
2761 * then the I/O will be throttled until an I/O completes and its slots are
2762 * freed up. The function returns true if it was successful in placing
2763 * the reservation.
2764 */
2765boolean_t
2766metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
2767    int flags)
2768{
2769	uint64_t available_slots = 0;
2770	boolean_t slot_reserved = B_FALSE;
2771
2772	ASSERT(mc->mc_alloc_throttle_enabled);
2773	mutex_enter(&mc->mc_lock);
2774
2775	uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
2776	if (reserved_slots < mc->mc_alloc_max_slots)
2777		available_slots = mc->mc_alloc_max_slots - reserved_slots;
2778
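	/*
	 * Gang allocations are allowed to reserve slots even when none are
	 * available; they belong to an allocation that has already been
	 * admitted, so throttling them here would not help.
	 */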
2779	if (slots <= available_slots || GANG_ALLOCATION(flags)) {
2780		/*
2781		 * We reserve the slots individually so that we can unreserve
2782		 * them individually when an I/O completes.
2783		 */
2784		for (int d = 0; d < slots; d++) {
2785			reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
2786		}
2787		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
2788		slot_reserved = B_TRUE;
2789	}
2790
2791	mutex_exit(&mc->mc_lock);
2792	return (slot_reserved);
2793}
2794
2795void
2796metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
2797{
2798	ASSERT(mc->mc_alloc_throttle_enabled);
2799	mutex_enter(&mc->mc_lock);
2800	for (int d = 0; d < slots; d++) {
2801		(void) refcount_remove(&mc->mc_alloc_slots, zio);
2802	}
2803	mutex_exit(&mc->mc_lock);
2804}
2805
2806int
2807metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2808    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio)
2809{
2810	dva_t *dva = bp->blk_dva;
2811	dva_t *hintdva = hintbp->blk_dva;
2812	int error = 0;
2813
2814	ASSERT(bp->blk_birth == 0);
2815	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2816
2817	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2818
2819	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
2820		spa_config_exit(spa, SCL_ALLOC, FTAG);
2821		return (SET_ERROR(ENOSPC));
2822	}
2823
2824	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
2825	ASSERT(BP_GET_NDVAS(bp) == 0);
2826	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2827
2828	for (int d = 0; d < ndvas; d++) {
2829		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2830		    txg, flags);
2831		if (error != 0) {
2832			for (d--; d >= 0; d--) {
2833				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2834				metaslab_group_alloc_decrement(spa,
2835				    DVA_GET_VDEV(&dva[d]), zio, flags);
2836				bzero(&dva[d], sizeof (dva_t));
2837			}
2838			spa_config_exit(spa, SCL_ALLOC, FTAG);
2839			return (error);
2840		} else {
2841			/*
2842			 * Update the metaslab group's queue depth
2843			 * based on the newly allocated dva.
2844			 */
2845			metaslab_group_alloc_increment(spa,
2846			    DVA_GET_VDEV(&dva[d]), zio, flags);
2847		}
2848
2849	}
2850	ASSERT(error == 0);
2851	ASSERT(BP_GET_NDVAS(bp) == ndvas);
2852
2853	spa_config_exit(spa, SCL_ALLOC, FTAG);
2854
2855	BP_SET_BIRTH(bp, txg, txg);
2856
2857	return (0);
2858}
2859
2860void
2861metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
2862{
2863	const dva_t *dva = bp->blk_dva;
2864	int ndvas = BP_GET_NDVAS(bp);
2865
2866	ASSERT(!BP_IS_HOLE(bp));
2867	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
2868
2869	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
2870
2871	for (int d = 0; d < ndvas; d++)
2872		metaslab_free_dva(spa, &dva[d], txg, now);
2873
2874	spa_config_exit(spa, SCL_FREE, FTAG);
2875}
2876
2877int
2878metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
2879{
2880	const dva_t *dva = bp->blk_dva;
2881	int ndvas = BP_GET_NDVAS(bp);
2882	int error = 0;
2883
2884	ASSERT(!BP_IS_HOLE(bp));
2885
2886	if (txg != 0) {
2887		/*
2888		 * First do a dry run to make sure all DVAs are claimable,
2889		 * so we don't have to unwind from partial failures below.
2890		 */
2891		if ((error = metaslab_claim(spa, bp, 0)) != 0)
2892			return (error);
2893	}
2894
2895	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2896
2897	for (int d = 0; d < ndvas; d++)
2898		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
2899			break;
2900
2901	spa_config_exit(spa, SCL_ALLOC, FTAG);
2902
2903	ASSERT(error == 0 || txg == 0);
2904
2905	return (error);
2906}
2907
2908void
2909metaslab_check_free(spa_t *spa, const blkptr_t *bp)
2910{
2911	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
2912		return;
2913
2914	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2915	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
2916		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
2917		vdev_t *vd = vdev_lookup_top(spa, vdev);
2918		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
2919		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2920		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2921
2922		if (msp->ms_loaded)
2923			range_tree_verify(msp->ms_tree, offset, size);
2924
2925		for (int j = 0; j < TXG_SIZE; j++)
2926			range_tree_verify(msp->ms_freetree[j], offset, size);
2927		for (int j = 0; j < TXG_DEFER_SIZE; j++)
2928			range_tree_verify(msp->ms_defertree[j], offset, size);
2929	}
2930	spa_config_exit(spa, SCL_VDEV, FTAG);
2931}
2932