metaslab.c revision 269416
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 */
26
27#include <sys/zfs_context.h>
28#include <sys/dmu.h>
29#include <sys/dmu_tx.h>
30#include <sys/space_map.h>
31#include <sys/metaslab_impl.h>
32#include <sys/vdev_impl.h>
33#include <sys/zio.h>
34#include <sys/spa_impl.h>
35
36SYSCTL_DECL(_vfs_zfs);
37SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
38
39/*
40 * Allow allocations to switch to gang blocks quickly. We do this to
41 * avoid having to load lots of space_maps in a given txg. There are,
42 * however, some cases where we want to avoid "fast" ganging and instead
43 * we want to do an exhaustive search of all metaslabs on this device.
44 * Currently we don't allow any gang, slog, or dump device related allocations
45 * to "fast" gang.
46 */
47#define	CAN_FASTGANG(flags) \
48	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
49	METASLAB_GANG_AVOID)))
50
51#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
52#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
53#define	METASLAB_ACTIVE_MASK		\
54	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
55
56uint64_t metaslab_aliquot = 512ULL << 10;
57uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
58TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang);
59SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
60    &metaslab_gang_bang, 0,
61    "Force gang block allocation for blocks larger than or equal to this value");
62
63/*
64 * The in-core space map representation is more compact than its on-disk form.
65 * The zfs_condense_pct determines how much more compact the in-core
66 * space_map representation must be before we compact it on-disk.
67 * Values should be greater than or equal to 100.
68 */
69int zfs_condense_pct = 200;
70TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct);
71SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
72    &zfs_condense_pct, 0,
73    "Condense on-disk spacemap when it is more than this many percents"
74    " of in-memory counterpart");
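
/*
 * For example (illustrative numbers): with the default value of 200, a space
 * map whose free segments would minimally encode into roughly 100KB of
 * 64-bit entries is not condensed until its on-disk length reaches roughly
 * 200KB (the other criteria checked in metaslab_should_condense() below must
 * also hold).
 */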
75
76/*
77 * Condensing a metaslab is not guaranteed to actually reduce the amount of
78 * space used on disk. In particular, a space map uses data in increments of
79 * MAX(1 << ashift, SPACE_MAP_INITIAL_BLOCKSIZE), so a metaslab might use the
80 * same number of blocks after condensing. Since the goal of condensing is to
81 * reduce the number of IOPs required to read the space map, we only want to
82 * condense when we can be sure we will reduce the number of blocks used by the
83 * space map. Unfortunately, we cannot precisely compute whether or not this is
84 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
85 * we apply the following heuristic: do not condense a spacemap unless the
86 * uncondensed size consumes more than zfs_metaslab_condense_block_threshold
87 * blocks.
88 */
89int zfs_metaslab_condense_block_threshold = 4;
90
91/*
92 * The zfs_mg_noalloc_threshold defines which metaslab groups should
93 * be eligible for allocation. The value is defined as a percentage of
94 * free space. Metaslab groups that have more free space than
95 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
96 * a metaslab group's free space is less than or equal to the
97 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
98 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
99 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
100 * groups are allowed to accept allocations. Gang blocks are always
101 * eligible to allocate on any metaslab group. The default value of 0 means
102 * no metaslab group will be excluded based on this criterion.
103 */
104int zfs_mg_noalloc_threshold = 0;
105TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold);
106SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
107    &zfs_mg_noalloc_threshold, 0,
108    "Percentage of metaslab group size that should be free"
109    " to make it eligible for allocation");
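
/*
 * For example, if this tunable were raised to 30, a metaslab group with only
 * 20% free capacity would be skipped for normal-class allocations as long as
 * at least one other group in the class still had more than 30% free. Once
 * every group dropped to 30% or below, all groups would accept allocations
 * again.
 */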
110
111/*
112 * When set will load all metaslabs when pool is first opened.
113 */
114int metaslab_debug_load = 0;
115TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load);
116SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
117    &metaslab_debug_load, 0,
118    "Load all metaslabs when pool is first opened");
119
120/*
121 * When set will prevent metaslabs from being unloaded.
122 */
123int metaslab_debug_unload = 0;
124TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload);
125SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
126    &metaslab_debug_unload, 0,
127    "Prevent metaslabs from being unloaded");
128
129/*
130 * Minimum size which forces the dynamic allocator to change
131 * its allocation strategy.  Once the space map cannot satisfy
132 * an allocation of this size then it switches to using a more
133 * aggressive strategy (i.e. search by size rather than offset).
134 */
135uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
136TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold",
137    &metaslab_df_alloc_threshold);
138SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
139    &metaslab_df_alloc_threshold, 0,
140    "Minimum size which forces the dynamic allocator to change it's allocation strategy");
141
142/*
143 * The minimum free space, in percent, which must be available
144 * in a space map to continue allocations in a first-fit fashion.
145 * Once the space_map's free space drops below this level we dynamically
146 * switch to using best-fit allocations.
147 */
148int metaslab_df_free_pct = 4;
149TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct);
150SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
151    &metaslab_df_free_pct, 0,
152    "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion");
153
154/*
155 * A metaslab is considered "free" if it contains a contiguous
156 * segment which is greater than metaslab_min_alloc_size.
157 */
158uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
159TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size",
160    &metaslab_min_alloc_size);
161SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
162    &metaslab_min_alloc_size, 0,
163    "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size");
164
165/*
166 * Percentage of all cpus that can be used by the metaslab taskq.
167 */
168int metaslab_load_pct = 50;
169TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct);
170SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
171    &metaslab_load_pct, 0,
172    "Percentage of cpus that can be used by the metaslab taskq");
173
174/*
175 * Determines how many txgs a metaslab may remain loaded without having any
176 * allocations from it. As long as a metaslab continues to be used we will
177 * keep it loaded.
178 */
179int metaslab_unload_delay = TXG_SIZE * 2;
180TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay);
181SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
182    &metaslab_unload_delay, 0,
183    "Number of TXGs that an unused metaslab can be kept in memory");
184
185/*
186 * Should we be willing to write data to degraded vdevs?
187 */
188boolean_t zfs_write_to_degraded = B_FALSE;
189SYSCTL_INT(_vfs_zfs, OID_AUTO, write_to_degraded, CTLFLAG_RWTUN,
190    &zfs_write_to_degraded, 0, "Allow writing data to degraded vdevs");
191TUNABLE_INT("vfs.zfs.write_to_degraded", &zfs_write_to_degraded);
192
193/*
194 * Max number of metaslabs per group to preload.
195 */
196int metaslab_preload_limit = SPA_DVAS_PER_BP;
197TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit);
198SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
199    &metaslab_preload_limit, 0,
200    "Max number of metaslabs per group to preload");
201
202/*
203 * Enable/disable preloading of metaslabs.
204 */
205boolean_t metaslab_preload_enabled = B_TRUE;
206TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled);
207SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
208    &metaslab_preload_enabled, 0,
209    "Max number of metaslabs per group to preload");
210
211/*
212 * Enable/disable additional weight factor for each metaslab.
213 */
214boolean_t metaslab_weight_factor_enable = B_FALSE;
215TUNABLE_INT("vfs.zfs.metaslab.weight_factor_enable",
216    &metaslab_weight_factor_enable);
217SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, weight_factor_enable, CTLFLAG_RWTUN,
218    &metaslab_weight_factor_enable, 0,
219    "Enable additional weight factor for each metaslab");
220
221
222/*
223 * ==========================================================================
224 * Metaslab classes
225 * ==========================================================================
226 */
227metaslab_class_t *
228metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
229{
230	metaslab_class_t *mc;
231
232	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
233
234	mc->mc_spa = spa;
235	mc->mc_rotor = NULL;
236	mc->mc_ops = ops;
237
238	return (mc);
239}
240
241void
242metaslab_class_destroy(metaslab_class_t *mc)
243{
244	ASSERT(mc->mc_rotor == NULL);
245	ASSERT(mc->mc_alloc == 0);
246	ASSERT(mc->mc_deferred == 0);
247	ASSERT(mc->mc_space == 0);
248	ASSERT(mc->mc_dspace == 0);
249
250	kmem_free(mc, sizeof (metaslab_class_t));
251}
252
253int
254metaslab_class_validate(metaslab_class_t *mc)
255{
256	metaslab_group_t *mg;
257	vdev_t *vd;
258
259	/*
260	 * Must hold one of the spa_config locks.
261	 */
262	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
263	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
264
265	if ((mg = mc->mc_rotor) == NULL)
266		return (0);
267
268	do {
269		vd = mg->mg_vd;
270		ASSERT(vd->vdev_mg != NULL);
271		ASSERT3P(vd->vdev_top, ==, vd);
272		ASSERT3P(mg->mg_class, ==, mc);
273		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
274	} while ((mg = mg->mg_next) != mc->mc_rotor);
275
276	return (0);
277}
278
279void
280metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
281    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
282{
283	atomic_add_64(&mc->mc_alloc, alloc_delta);
284	atomic_add_64(&mc->mc_deferred, defer_delta);
285	atomic_add_64(&mc->mc_space, space_delta);
286	atomic_add_64(&mc->mc_dspace, dspace_delta);
287}
288
289void
290metaslab_class_minblocksize_update(metaslab_class_t *mc)
291{
292	metaslab_group_t *mg;
293	vdev_t *vd;
294	uint64_t minashift = UINT64_MAX;
295
296	if ((mg = mc->mc_rotor) == NULL) {
297		mc->mc_minblocksize = SPA_MINBLOCKSIZE;
298		return;
299	}
300
301	do {
302		vd = mg->mg_vd;
303		if (vd->vdev_ashift < minashift)
304			minashift = vd->vdev_ashift;
305	} while ((mg = mg->mg_next) != mc->mc_rotor);
306
307	mc->mc_minblocksize = 1ULL << minashift;
308}
309
310uint64_t
311metaslab_class_get_alloc(metaslab_class_t *mc)
312{
313	return (mc->mc_alloc);
314}
315
316uint64_t
317metaslab_class_get_deferred(metaslab_class_t *mc)
318{
319	return (mc->mc_deferred);
320}
321
322uint64_t
323metaslab_class_get_space(metaslab_class_t *mc)
324{
325	return (mc->mc_space);
326}
327
328uint64_t
329metaslab_class_get_dspace(metaslab_class_t *mc)
330{
331	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
332}
333
334uint64_t
335metaslab_class_get_minblocksize(metaslab_class_t *mc)
336{
337	return (mc->mc_minblocksize);
338}
339
340/*
341 * ==========================================================================
342 * Metaslab groups
343 * ==========================================================================
344 */
345static int
346metaslab_compare(const void *x1, const void *x2)
347{
348	const metaslab_t *m1 = x1;
349	const metaslab_t *m2 = x2;
350
351	if (m1->ms_weight < m2->ms_weight)
352		return (1);
353	if (m1->ms_weight > m2->ms_weight)
354		return (-1);
355
356	/*
357	 * If the weights are identical, use the offset to force uniqueness.
358	 */
359	if (m1->ms_start < m2->ms_start)
360		return (-1);
361	if (m1->ms_start > m2->ms_start)
362		return (1);
363
364	ASSERT3P(m1, ==, m2);
365
366	return (0);
367}
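
/*
 * Because a heavier metaslab compares as "smaller", avl_first() on
 * mg_metaslab_tree yields the highest-weight metaslab; ties are broken by
 * ms_start so that distinct metaslabs never compare equal.
 */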
368
369/*
370 * Update the allocatable flag and the metaslab group's capacity.
371 * The allocatable flag is set to true if the capacity is below
372 * the zfs_mg_noalloc_threshold. If a metaslab group transitions
373 * from allocatable to non-allocatable or vice versa then the metaslab
374 * group's class is updated to reflect the transition.
375 */
376static void
377metaslab_group_alloc_update(metaslab_group_t *mg)
378{
379	vdev_t *vd = mg->mg_vd;
380	metaslab_class_t *mc = mg->mg_class;
381	vdev_stat_t *vs = &vd->vdev_stat;
382	boolean_t was_allocatable;
383
384	ASSERT(vd == vd->vdev_top);
385
386	mutex_enter(&mg->mg_lock);
387	was_allocatable = mg->mg_allocatable;
388
389	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
390	    (vs->vs_space + 1);
391
392	mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
393
394	/*
395	 * The mc_alloc_groups maintains a count of the number of
396	 * groups in this metaslab class that are still above the
397	 * zfs_mg_noalloc_threshold. This is used by the allocating
398	 * threads to determine if they should avoid allocations to
399	 * a given group. The allocator will avoid allocations to a group
400	 * if that group has reached or is below the zfs_mg_noalloc_threshold
401	 * and there are still other groups that are above the threshold.
402	 * When a group transitions from allocatable to non-allocatable or
403	 * vice versa we update the metaslab class to reflect that change.
404	 * When the mc_alloc_groups value drops to 0 that means that all
405	 * groups have reached the zfs_mg_noalloc_threshold making all groups
406	 * eligible for allocations. This effectively means that all devices
407	 * are balanced again.
408	 */
409	if (was_allocatable && !mg->mg_allocatable)
410		mc->mc_alloc_groups--;
411	else if (!was_allocatable && mg->mg_allocatable)
412		mc->mc_alloc_groups++;
413	mutex_exit(&mg->mg_lock);
414}
415
416metaslab_group_t *
417metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
418{
419	metaslab_group_t *mg;
420
421	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
422	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
423	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
424	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
425	mg->mg_vd = vd;
426	mg->mg_class = mc;
427	mg->mg_activation_count = 0;
428
429	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
430	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
431
432	return (mg);
433}
434
435void
436metaslab_group_destroy(metaslab_group_t *mg)
437{
438	ASSERT(mg->mg_prev == NULL);
439	ASSERT(mg->mg_next == NULL);
440	/*
441	 * We may have gone below zero with the activation count
442	 * either because we never activated in the first place or
443	 * because we're done, and possibly removing the vdev.
444	 */
445	ASSERT(mg->mg_activation_count <= 0);
446
447	taskq_destroy(mg->mg_taskq);
448	avl_destroy(&mg->mg_metaslab_tree);
449	mutex_destroy(&mg->mg_lock);
450	kmem_free(mg, sizeof (metaslab_group_t));
451}
452
453void
454metaslab_group_activate(metaslab_group_t *mg)
455{
456	metaslab_class_t *mc = mg->mg_class;
457	metaslab_group_t *mgprev, *mgnext;
458
459	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
460
461	ASSERT(mc->mc_rotor != mg);
462	ASSERT(mg->mg_prev == NULL);
463	ASSERT(mg->mg_next == NULL);
464	ASSERT(mg->mg_activation_count <= 0);
465
466	if (++mg->mg_activation_count <= 0)
467		return;
468
469	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
470	metaslab_group_alloc_update(mg);
471
472	if ((mgprev = mc->mc_rotor) == NULL) {
473		mg->mg_prev = mg;
474		mg->mg_next = mg;
475	} else {
476		mgnext = mgprev->mg_next;
477		mg->mg_prev = mgprev;
478		mg->mg_next = mgnext;
479		mgprev->mg_next = mg;
480		mgnext->mg_prev = mg;
481	}
482	mc->mc_rotor = mg;
483	metaslab_class_minblocksize_update(mc);
484}
485
486void
487metaslab_group_passivate(metaslab_group_t *mg)
488{
489	metaslab_class_t *mc = mg->mg_class;
490	metaslab_group_t *mgprev, *mgnext;
491
492	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
493
494	if (--mg->mg_activation_count != 0) {
495		ASSERT(mc->mc_rotor != mg);
496		ASSERT(mg->mg_prev == NULL);
497		ASSERT(mg->mg_next == NULL);
498		ASSERT(mg->mg_activation_count < 0);
499		return;
500	}
501
502	taskq_wait(mg->mg_taskq);
503
504	mgprev = mg->mg_prev;
505	mgnext = mg->mg_next;
506
507	if (mg == mgnext) {
508		mc->mc_rotor = NULL;
509	} else {
510		mc->mc_rotor = mgnext;
511		mgprev->mg_next = mgnext;
512		mgnext->mg_prev = mgprev;
513	}
514
515	mg->mg_prev = NULL;
516	mg->mg_next = NULL;
517	metaslab_class_minblocksize_update(mc);
518}
519
520static void
521metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
522{
523	mutex_enter(&mg->mg_lock);
524	ASSERT(msp->ms_group == NULL);
525	msp->ms_group = mg;
526	msp->ms_weight = 0;
527	avl_add(&mg->mg_metaslab_tree, msp);
528	mutex_exit(&mg->mg_lock);
529}
530
531static void
532metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
533{
534	mutex_enter(&mg->mg_lock);
535	ASSERT(msp->ms_group == mg);
536	avl_remove(&mg->mg_metaslab_tree, msp);
537	msp->ms_group = NULL;
538	mutex_exit(&mg->mg_lock);
539}
540
541static void
542metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
543{
544	/*
545	 * Although in principle the weight can be any value, in
546	 * practice we do not use values in the range [1, 510].
547	 */
548	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
549	ASSERT(MUTEX_HELD(&msp->ms_lock));
550
551	mutex_enter(&mg->mg_lock);
552	ASSERT(msp->ms_group == mg);
553	avl_remove(&mg->mg_metaslab_tree, msp);
554	msp->ms_weight = weight;
555	avl_add(&mg->mg_metaslab_tree, msp);
556	mutex_exit(&mg->mg_lock);
557}
558
559/*
560 * Determine if a given metaslab group should skip allocations. A metaslab
561 * group should avoid allocations if its used capacity has crossed the
562 * zfs_mg_noalloc_threshold and there is at least one metaslab group
563 * that can still handle allocations.
564 */
565static boolean_t
566metaslab_group_allocatable(metaslab_group_t *mg)
567{
568	vdev_t *vd = mg->mg_vd;
569	spa_t *spa = vd->vdev_spa;
570	metaslab_class_t *mc = mg->mg_class;
571
572	/*
573	 * A metaslab group is considered allocatable if its free capacity
574	 * is greater than the set value of zfs_mg_noalloc_threshold, it's
575	 * associated with a slog, or there are no other metaslab groups
576	 * with free capacity greater than zfs_mg_noalloc_threshold.
577	 */
578	return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
579	    mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
580}
581
582/*
583 * ==========================================================================
584 * Range tree callbacks
585 * ==========================================================================
586 */
587
588/*
589 * Comparison function for the private size-ordered tree. Tree is sorted
590 * by size, larger sizes at the end of the tree.
591 */
592static int
593metaslab_rangesize_compare(const void *x1, const void *x2)
594{
595	const range_seg_t *r1 = x1;
596	const range_seg_t *r2 = x2;
597	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
598	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
599
600	if (rs_size1 < rs_size2)
601		return (-1);
602	if (rs_size1 > rs_size2)
603		return (1);
604
605	if (r1->rs_start < r2->rs_start)
606		return (-1);
607
608	if (r1->rs_start > r2->rs_start)
609		return (1);
610
611	return (0);
612}
613
614/*
615 * Create any block allocator specific components. The current allocators
616 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
617 */
618static void
619metaslab_rt_create(range_tree_t *rt, void *arg)
620{
621	metaslab_t *msp = arg;
622
623	ASSERT3P(rt->rt_arg, ==, msp);
624	ASSERT(msp->ms_tree == NULL);
625
626	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
627	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
628}
629
630/*
631 * Destroy the block allocator specific components.
632 */
633static void
634metaslab_rt_destroy(range_tree_t *rt, void *arg)
635{
636	metaslab_t *msp = arg;
637
638	ASSERT3P(rt->rt_arg, ==, msp);
639	ASSERT3P(msp->ms_tree, ==, rt);
640	ASSERT0(avl_numnodes(&msp->ms_size_tree));
641
642	avl_destroy(&msp->ms_size_tree);
643}
644
645static void
646metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
647{
648	metaslab_t *msp = arg;
649
650	ASSERT3P(rt->rt_arg, ==, msp);
651	ASSERT3P(msp->ms_tree, ==, rt);
652	VERIFY(!msp->ms_condensing);
653	avl_add(&msp->ms_size_tree, rs);
654}
655
656static void
657metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
658{
659	metaslab_t *msp = arg;
660
661	ASSERT3P(rt->rt_arg, ==, msp);
662	ASSERT3P(msp->ms_tree, ==, rt);
663	VERIFY(!msp->ms_condensing);
664	avl_remove(&msp->ms_size_tree, rs);
665}
666
667static void
668metaslab_rt_vacate(range_tree_t *rt, void *arg)
669{
670	metaslab_t *msp = arg;
671
672	ASSERT3P(rt->rt_arg, ==, msp);
673	ASSERT3P(msp->ms_tree, ==, rt);
674
675	/*
676	 * Normally one would walk the tree freeing nodes along the way.
677	 * Since the nodes are shared with the range trees we can avoid
678	 * walking all nodes and just reinitialize the avl tree. The nodes
679	 * will be freed by the range tree, so we don't want to free them here.
680	 */
681	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
682	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
683}
684
685static range_tree_ops_t metaslab_rt_ops = {
686	metaslab_rt_create,
687	metaslab_rt_destroy,
688	metaslab_rt_add,
689	metaslab_rt_remove,
690	metaslab_rt_vacate
691};
692
693/*
694 * ==========================================================================
695 * Metaslab block operations
696 * ==========================================================================
697 */
698
699/*
700 * Return the maximum contiguous segment within the metaslab.
701 */
702uint64_t
703metaslab_block_maxsize(metaslab_t *msp)
704{
705	avl_tree_t *t = &msp->ms_size_tree;
706	range_seg_t *rs;
707
708	if (t == NULL || (rs = avl_last(t)) == NULL)
709		return (0ULL);
710
711	return (rs->rs_end - rs->rs_start);
712}
713
714uint64_t
715metaslab_block_alloc(metaslab_t *msp, uint64_t size)
716{
717	uint64_t start;
718	range_tree_t *rt = msp->ms_tree;
719
720	VERIFY(!msp->ms_condensing);
721
722	start = msp->ms_ops->msop_alloc(msp, size);
723	if (start != -1ULL) {
724		vdev_t *vd = msp->ms_group->mg_vd;
725
726		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
727		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
728		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
729		range_tree_remove(rt, start, size);
730	}
731	return (start);
732}
733
734/*
735 * ==========================================================================
736 * Common allocator routines
737 * ==========================================================================
738 */
739
740/*
741 * This is a helper function that can be used by the allocator to find
742 * a suitable block to allocate. This will search the specified AVL
743 * tree looking for a block that matches the specified criteria.
744 */
745static uint64_t
746metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
747    uint64_t align)
748{
749	range_seg_t *rs, rsearch;
750	avl_index_t where;
751
752	rsearch.rs_start = *cursor;
753	rsearch.rs_end = *cursor + size;
754
755	rs = avl_find(t, &rsearch, &where);
756	if (rs == NULL)
757		rs = avl_nearest(t, where, AVL_AFTER);
758
759	while (rs != NULL) {
760		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
761
762		if (offset + size <= rs->rs_end) {
763			*cursor = offset + size;
764			return (offset);
765		}
766		rs = AVL_NEXT(t, rs);
767	}
768
769	/*
770	 * If we know we've searched the whole map (*cursor == 0), give up.
771	 * Otherwise, reset the cursor to the beginning and try again.
772	 */
773	if (*cursor == 0)
774		return (-1ULL);
775
776	*cursor = 0;
777	return (metaslab_block_picker(t, cursor, size, align));
778}
779
780/*
781 * ==========================================================================
782 * The first-fit block allocator
783 * ==========================================================================
784 */
785static uint64_t
786metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
787{
788	/*
789	 * Find the largest power of 2 block size that evenly divides the
790	 * requested size. This is used to try to allocate blocks with similar
791	 * alignment from the same area of the metaslab (i.e. same cursor
792	 * bucket), but it does not guarantee that allocations of other
793	 * sizes will not be present in the same region.
794	 */
795	uint64_t align = size & -size;
796	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
797	avl_tree_t *t = &msp->ms_tree->rt_root;
798
799	return (metaslab_block_picker(t, cursor, size, align));
800}
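
/*
 * For example, a 24K request has align = 24K & -24K = 8K, so it shares the
 * 8K cursor bucket (ms_lbas[highbit64(8K) - 1], i.e. ms_lbas[13]) with other
 * 8K-aligned allocations.
 */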
801
802/* ARGSUSED */
803static boolean_t
804metaslab_ff_fragmented(metaslab_t *msp)
805{
806	return (B_TRUE);
807}
808
809static metaslab_ops_t metaslab_ff_ops = {
810	metaslab_ff_alloc,
811	metaslab_ff_fragmented
812};
813
814/*
815 * ==========================================================================
816 * Dynamic block allocator -
817 * Uses the first fit allocation scheme until space gets low and then
818 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
819 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
820 * ==========================================================================
821 */
822static uint64_t
823metaslab_df_alloc(metaslab_t *msp, uint64_t size)
824{
825	/*
826	 * Find the largest power of 2 block size that evenly divides the
827	 * requested size. This is used to try to allocate blocks with similar
828	 * alignment from the same area of the metaslab (i.e. same cursor
829	 * bucket), but it does not guarantee that allocations of other
830	 * sizes will not be present in the same region.
831	 */
832	uint64_t align = size & -size;
833	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
834	range_tree_t *rt = msp->ms_tree;
835	avl_tree_t *t = &rt->rt_root;
836	uint64_t max_size = metaslab_block_maxsize(msp);
837	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
838
839	ASSERT(MUTEX_HELD(&msp->ms_lock));
840	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
841
842	if (max_size < size)
843		return (-1ULL);
844
845	/*
846	 * If we're running low on space switch to using the size
847	 * sorted AVL tree (best-fit).
848	 */
849	if (max_size < metaslab_df_alloc_threshold ||
850	    free_pct < metaslab_df_free_pct) {
851		t = &msp->ms_size_tree;
852		*cursor = 0;
853	}
854
855	return (metaslab_block_picker(t, cursor, size, 1ULL));
856}
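
/*
 * With the defaults above (metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE,
 * metaslab_df_free_pct = 4), the switch to the size-sorted (best-fit) tree
 * happens once the largest free segment drops below SPA_MAXBLOCKSIZE or less
 * than 4% of the metaslab's space remains free.
 */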
857
858static boolean_t
859metaslab_df_fragmented(metaslab_t *msp)
860{
861	range_tree_t *rt = msp->ms_tree;
862	uint64_t max_size = metaslab_block_maxsize(msp);
863	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
864
865	if (max_size >= metaslab_df_alloc_threshold &&
866	    free_pct >= metaslab_df_free_pct)
867		return (B_FALSE);
868
869	return (B_TRUE);
870}
871
872static metaslab_ops_t metaslab_df_ops = {
873	metaslab_df_alloc,
874	metaslab_df_fragmented
875};
876
877/*
878 * ==========================================================================
879 * Cursor fit block allocator -
880 * Select the largest region in the metaslab, set the cursor to the beginning
881 * of the range and the cursor_end to the end of the range. As allocations
882 * are made advance the cursor. Continue allocating from the cursor until
883 * the range is exhausted and then find a new range.
884 * ==========================================================================
885 */
886static uint64_t
887metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
888{
889	range_tree_t *rt = msp->ms_tree;
890	avl_tree_t *t = &msp->ms_size_tree;
891	uint64_t *cursor = &msp->ms_lbas[0];
892	uint64_t *cursor_end = &msp->ms_lbas[1];
893	uint64_t offset = 0;
894
895	ASSERT(MUTEX_HELD(&msp->ms_lock));
896	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
897
898	ASSERT3U(*cursor_end, >=, *cursor);
899
900	if ((*cursor + size) > *cursor_end) {
901		range_seg_t *rs;
902
903		rs = avl_last(&msp->ms_size_tree);
904		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
905			return (-1ULL);
906
907		*cursor = rs->rs_start;
908		*cursor_end = rs->rs_end;
909	}
910
911	offset = *cursor;
912	*cursor += size;
913
914	return (offset);
915}
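
/*
 * The cursor pair lives in ms_lbas[0] (cursor) and ms_lbas[1] (cursor_end).
 * For example, once a 1MB segment has been selected from the size tree,
 * eight consecutive 128K allocations can be carved from it before a new
 * largest segment must be chosen.
 */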
916
917static boolean_t
918metaslab_cf_fragmented(metaslab_t *msp)
919{
920	return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
921}
922
923static metaslab_ops_t metaslab_cf_ops = {
924	metaslab_cf_alloc,
925	metaslab_cf_fragmented
926};
927
928/*
929 * ==========================================================================
930 * New dynamic fit allocator -
931 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
932 * contiguous blocks. If no region is found then just use the largest segment
933 * that remains.
934 * ==========================================================================
935 */
936
937/*
938 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
939 * to request from the allocator.
940 */
941uint64_t metaslab_ndf_clump_shift = 4;
942
943static uint64_t
944metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
945{
946	avl_tree_t *t = &msp->ms_tree->rt_root;
947	avl_index_t where;
948	range_seg_t *rs, rsearch;
949	uint64_t hbit = highbit64(size);
950	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
951	uint64_t max_size = metaslab_block_maxsize(msp);
952
953	ASSERT(MUTEX_HELD(&msp->ms_lock));
954	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
955
956	if (max_size < size)
957		return (-1ULL);
958
959	rsearch.rs_start = *cursor;
960	rsearch.rs_end = *cursor + size;
961
962	rs = avl_find(t, &rsearch, &where);
963	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
964		t = &msp->ms_size_tree;
965
966		rsearch.rs_start = 0;
967		rsearch.rs_end = MIN(max_size,
968		    1ULL << (hbit + metaslab_ndf_clump_shift));
969		rs = avl_find(t, &rsearch, &where);
970		if (rs == NULL)
971			rs = avl_nearest(t, where, AVL_AFTER);
972		ASSERT(rs != NULL);
973	}
974
975	if ((rs->rs_end - rs->rs_start) >= size) {
976		*cursor = rs->rs_start + size;
977		return (rs->rs_start);
978	}
979	return (-1ULL);
980}
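
/*
 * For example, with metaslab_ndf_clump_shift = 4, a 16K request (hbit = 15)
 * that cannot be satisfied at its cursor re-searches the size tree with a
 * target of MIN(max_size, 1ULL << (15 + 4)), i.e. up to 512K, falling back
 * to the next segment in size order if no exact match exists.
 */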
981
982static boolean_t
983metaslab_ndf_fragmented(metaslab_t *msp)
984{
985	return (metaslab_block_maxsize(msp) <=
986	    (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
987}
988
989static metaslab_ops_t metaslab_ndf_ops = {
990	metaslab_ndf_alloc,
991	metaslab_ndf_fragmented
992};
993
994metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
995
996/*
997 * ==========================================================================
998 * Metaslabs
999 * ==========================================================================
1000 */
1001
1002/*
1003 * Wait for any in-progress metaslab loads to complete.
1004 */
1005void
1006metaslab_load_wait(metaslab_t *msp)
1007{
1008	ASSERT(MUTEX_HELD(&msp->ms_lock));
1009
1010	while (msp->ms_loading) {
1011		ASSERT(!msp->ms_loaded);
1012		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1013	}
1014}
1015
1016int
1017metaslab_load(metaslab_t *msp)
1018{
1019	int error = 0;
1020
1021	ASSERT(MUTEX_HELD(&msp->ms_lock));
1022	ASSERT(!msp->ms_loaded);
1023	ASSERT(!msp->ms_loading);
1024
1025	msp->ms_loading = B_TRUE;
1026
1027	/*
1028	 * If the space map has not been allocated yet, then treat
1029	 * all the space in the metaslab as free and add it to the
1030	 * ms_tree.
1031	 */
1032	if (msp->ms_sm != NULL)
1033		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
1034	else
1035		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
1036
1037	msp->ms_loaded = (error == 0);
1038	msp->ms_loading = B_FALSE;
1039
1040	if (msp->ms_loaded) {
1041		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1042			range_tree_walk(msp->ms_defertree[t],
1043			    range_tree_remove, msp->ms_tree);
1044		}
1045	}
1046	cv_broadcast(&msp->ms_load_cv);
1047	return (error);
1048}
1049
1050void
1051metaslab_unload(metaslab_t *msp)
1052{
1053	ASSERT(MUTEX_HELD(&msp->ms_lock));
1054	range_tree_vacate(msp->ms_tree, NULL, NULL);
1055	msp->ms_loaded = B_FALSE;
1056	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1057}
1058
1059metaslab_t *
1060metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
1061{
1062	vdev_t *vd = mg->mg_vd;
1063	objset_t *mos = vd->vdev_spa->spa_meta_objset;
1064	metaslab_t *msp;
1065
1066	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1067	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1068	cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
1069	msp->ms_id = id;
1070	msp->ms_start = id << vd->vdev_ms_shift;
1071	msp->ms_size = 1ULL << vd->vdev_ms_shift;
1072
1073	/*
1074	 * We only open space map objects that already exist. All others
1075	 * will be opened when we finally allocate an object for it.
1076	 */
1077	if (object != 0) {
1078		VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
1079		    msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
1080		ASSERT(msp->ms_sm != NULL);
1081	}
1082
1083	/*
1084	 * We create the main range tree here, but we don't create the
1085	 * alloctree and freetree until metaslab_sync_done().  This serves
1086	 * two purposes: it allows metaslab_sync_done() to detect the
1087	 * addition of new space; and for debugging, it ensures that we'd take
1088	 * a data fault on any attempt to use this metaslab before it's ready.
1089	 */
1090	msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
1091	metaslab_group_add(mg, msp);
1092
1093	msp->ms_ops = mg->mg_class->mc_ops;
1094
1095	/*
1096	 * If we're opening an existing pool (txg == 0) or creating
1097	 * a new one (txg == TXG_INITIAL), all space is available now.
1098	 * If we're adding space to an existing pool, the new space
1099	 * does not become available until after this txg has synced.
1100	 */
1101	if (txg <= TXG_INITIAL)
1102		metaslab_sync_done(msp, 0);
1103
1104	/*
1105	 * If metaslab_debug_load is set and we're initializing a metaslab
1106	 * that has an allocated space_map object then load its space
1107	 * map so that we can verify frees.
1108	 */
1109	if (metaslab_debug_load && msp->ms_sm != NULL) {
1110		mutex_enter(&msp->ms_lock);
1111		VERIFY0(metaslab_load(msp));
1112		mutex_exit(&msp->ms_lock);
1113	}
1114
1115	if (txg != 0) {
1116		vdev_dirty(vd, 0, NULL, txg);
1117		vdev_dirty(vd, VDD_METASLAB, msp, txg);
1118	}
1119
1120	return (msp);
1121}
1122
1123void
1124metaslab_fini(metaslab_t *msp)
1125{
1126	metaslab_group_t *mg = msp->ms_group;
1127
1128	metaslab_group_remove(mg, msp);
1129
1130	mutex_enter(&msp->ms_lock);
1131
1132	VERIFY(msp->ms_group == NULL);
1133	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1134	    0, -msp->ms_size);
1135	space_map_close(msp->ms_sm);
1136
1137	metaslab_unload(msp);
1138	range_tree_destroy(msp->ms_tree);
1139
1140	for (int t = 0; t < TXG_SIZE; t++) {
1141		range_tree_destroy(msp->ms_alloctree[t]);
1142		range_tree_destroy(msp->ms_freetree[t]);
1143	}
1144
1145	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1146		range_tree_destroy(msp->ms_defertree[t]);
1147	}
1148
1149	ASSERT0(msp->ms_deferspace);
1150
1151	mutex_exit(&msp->ms_lock);
1152	cv_destroy(&msp->ms_load_cv);
1153	mutex_destroy(&msp->ms_lock);
1154
1155	kmem_free(msp, sizeof (metaslab_t));
1156}
1157
1158/*
1159 * Apply a weighting factor based on the histogram information for this
1160 * metaslab. The current weighting factor is somewhat arbitrary and requires
1161 * additional investigation. The implementation provides a measure of
1162 * "weighted" free space and gives a higher weighting for larger contiguous
1163 * regions. The weighting factor is determined by counting the number of
1164 * sm_shift sectors that exist in each region represented by the histogram.
1165 * That value is then multiplied by the power of 2 exponent and the sm_shift
1166 * value.
1167 *
1168 * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
1169 * metaslab has an sm_shift value of 9 (512B):
1170 *
1171 * 1) calculate the number of sm_shift sectors in the region:
1172 *	2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1173 * 2) multiply by the power of 2 exponent and the sm_shift value:
1174 *	16384 * 21 * 9 = 3096576
1175 * This value will be added to the weighting of the metaslab.
1176 */
1177static uint64_t
1178metaslab_weight_factor(metaslab_t *msp)
1179{
1180	uint64_t factor = 0;
1181	uint64_t sectors;
1182	int i;
1183
1184	/*
1185	 * A null space map means that the entire metaslab is free,
1186	 * calculate a weight factor that spans the entire size of the
1187	 * metaslab.
1188	 */
1189	if (msp->ms_sm == NULL) {
1190		vdev_t *vd = msp->ms_group->mg_vd;
1191
1192		i = highbit64(msp->ms_size) - 1;
1193		sectors = msp->ms_size >> vd->vdev_ashift;
1194		return (sectors * i * vd->vdev_ashift);
1195	}
1196
1197	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1198		return (0);
1199
1200	for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1201		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1202			continue;
1203
1204		/*
1205		 * Determine the number of sm_shift sectors in the region
1206		 * indicated by the histogram. For example, given an
1207		 * sm_shift value of 9 (512 bytes) and i = 4 then we know
1208		 * that we're looking at an 8K region in the histogram
1209		 * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1210		 * number of sm_shift sectors (512 bytes in this example),
1211		 * we would take 8192 / 512 = 16. Since the histogram
1212		 * is offset by sm_shift we can simply use the value
1213		 * of i to calculate this (i.e. 2^i = 16 where i = 4).
1214		 */
1215		sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i;
1216		factor += (i + msp->ms_sm->sm_shift) * sectors;
1217	}
1218	return (factor * msp->ms_sm->sm_shift);
1219}
1220
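/*
 * The bandwidth weighting applied in metaslab_weight() below scales the free
 * space by a factor that falls linearly from 2x for the lowest metaslab
 * (ms_id 0) down to just over 1x for the highest one:
 *
 *	weight = 2 * space - (ms_id * space) / vdev_ms_count
 *
 * e.g. a metaslab halfway up the vdev is weighted at roughly 1.5x its
 * free space.
 */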
1221static uint64_t
1222metaslab_weight(metaslab_t *msp)
1223{
1224	metaslab_group_t *mg = msp->ms_group;
1225	vdev_t *vd = mg->mg_vd;
1226	uint64_t weight, space;
1227
1228	ASSERT(MUTEX_HELD(&msp->ms_lock));
1229
1230	/*
1231	 * This vdev is in the process of being removed so there is nothing
1232	 * for us to do here.
1233	 */
1234	if (vd->vdev_removing) {
1235		ASSERT0(space_map_allocated(msp->ms_sm));
1236		ASSERT0(vd->vdev_ms_shift);
1237		return (0);
1238	}
1239
1240	/*
1241	 * The baseline weight is the metaslab's free space.
1242	 */
1243	space = msp->ms_size - space_map_allocated(msp->ms_sm);
1244	weight = space;
1245
1246	/*
1247	 * Modern disks have uniform bit density and constant angular velocity.
1248	 * Therefore, the outer recording zones are faster (higher bandwidth)
1249	 * than the inner zones by the ratio of outer to inner track diameter,
1250	 * which is typically around 2:1.  We account for this by assigning
1251	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1252	 * In effect, this means that we'll select the metaslab with the most
1253	 * free bandwidth rather than simply the one with the most free space.
1254	 */
1255	weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1256	ASSERT(weight >= space && weight <= 2 * space);
1257
1258	msp->ms_factor = metaslab_weight_factor(msp);
1259	if (metaslab_weight_factor_enable)
1260		weight += msp->ms_factor;
1261
1262	if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) {
1263		/*
1264		 * If this metaslab is one we're actively using, adjust its
1265		 * weight to make it preferable to any inactive metaslab so
1266		 * we'll polish it off.
1267		 */
1268		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1269	}
1270
1271	return (weight);
1272}
1273
1274static int
1275metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1276{
1277	ASSERT(MUTEX_HELD(&msp->ms_lock));
1278
1279	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1280		metaslab_load_wait(msp);
1281		if (!msp->ms_loaded) {
1282			int error = metaslab_load(msp);
1283			if (error) {
1284				metaslab_group_sort(msp->ms_group, msp, 0);
1285				return (error);
1286			}
1287		}
1288
1289		metaslab_group_sort(msp->ms_group, msp,
1290		    msp->ms_weight | activation_weight);
1291	}
1292	ASSERT(msp->ms_loaded);
1293	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1294
1295	return (0);
1296}
1297
1298static void
1299metaslab_passivate(metaslab_t *msp, uint64_t size)
1300{
1301	/*
1302	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1303	 * this metaslab again.  In that case, it had better be empty,
1304	 * or we would be leaving space on the table.
1305	 */
1306	ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1307	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1308	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1309}
1310
1311static void
1312metaslab_preload(void *arg)
1313{
1314	metaslab_t *msp = arg;
1315	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1316
1317	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
1318
1319	mutex_enter(&msp->ms_lock);
1320	metaslab_load_wait(msp);
1321	if (!msp->ms_loaded)
1322		(void) metaslab_load(msp);
1323
1324	/*
1325	 * Set the ms_access_txg value so that we don't unload it right away.
1326	 */
1327	msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
1328	mutex_exit(&msp->ms_lock);
1329}
1330
1331static void
1332metaslab_group_preload(metaslab_group_t *mg)
1333{
1334	spa_t *spa = mg->mg_vd->vdev_spa;
1335	metaslab_t *msp;
1336	avl_tree_t *t = &mg->mg_metaslab_tree;
1337	int m = 0;
1338
1339	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1340		taskq_wait(mg->mg_taskq);
1341		return;
1342	}
1343
1344	mutex_enter(&mg->mg_lock);
1345	/*
1346	 * Load the next potential metaslabs
1347	 */
1348	msp = avl_first(t);
1349	while (msp != NULL) {
1350		metaslab_t *msp_next = AVL_NEXT(t, msp);
1351
1352		/* If we have reached our preload limit then we're done */
1353		if (++m > metaslab_preload_limit)
1354			break;
1355
1356		/*
1357		 * We must drop the metaslab group lock here to preserve
1358		 * lock ordering with the ms_lock (when grabbing both
1359		 * the mg_lock and the ms_lock, the ms_lock must be taken
1360		 * first).  As a result, it is possible that the ordering
1361		 * of the metaslabs within the avl tree may change before
1362		 * we reacquire the lock. The metaslab cannot be removed from
1363		 * the tree while we're in syncing context so it is safe to
1364		 * drop the mg_lock here. If the metaslabs are reordered
1365		 * nothing will break -- we just may end up loading a
1366		 * less than optimal one.
1367		 */
1368		mutex_exit(&mg->mg_lock);
1369		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1370		    msp, TQ_SLEEP) != 0);
1371		mutex_enter(&mg->mg_lock);
1372		msp = msp_next;
1373	}
1374	mutex_exit(&mg->mg_lock);
1375}
1376
1377/*
1378 * Determine if the space map's on-disk footprint is past our tolerance
1379 * for inefficiency. We would like to use the following criteria to make
1380 * our decision:
1381 *
1382 * 1. The size of the space map object should not dramatically increase as a
1383 * result of writing out the free space range tree.
1384 *
1385 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
1386 * times the size of the free space range tree representation
1387 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
1388 *
1389 * 3. The on-disk size of the space map should actually decrease.
1390 *
1391 * Checking the first condition is tricky since we don't want to walk
1392 * the entire AVL tree calculating the estimated on-disk size. Instead we
1393 * use the size-ordered range tree in the metaslab and calculate the
1394 * size required to write out the largest segment in our free tree. If the
1395 * size required to represent that segment on disk is larger than the space
1396 * map object then we avoid condensing this map.
1397 *
1398 * To determine the second criterion we use a best-case estimate and assume
1399 * each segment can be represented on-disk as a single 64-bit entry. We refer
1400 * to this best-case estimate as the space map's minimal form.
1401 *
1402 * Unfortunately, we cannot compute the on-disk size of the space map in this
1403 * context because we cannot accurately compute the effects of compression, etc.
1404 * Instead, we apply the heuristic described in the block comment for
1405 * zfs_metaslab_condense_block_threshold - we only condense if the space used
1406 * is greater than a threshold number of blocks.
1407 */
1408static boolean_t
1409metaslab_should_condense(metaslab_t *msp)
1410{
1411	space_map_t *sm = msp->ms_sm;
1412	range_seg_t *rs;
1413	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
1414	dmu_object_info_t doi;
1415	uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
1416
1417	ASSERT(MUTEX_HELD(&msp->ms_lock));
1418	ASSERT(msp->ms_loaded);
1419
1420	/*
1421	 * Use the ms_size_tree range tree, which is ordered by size, to
1422	 * obtain the largest segment in the free tree. If the tree is empty
1423	 * then we should condense the map.
1424	 */
1425	rs = avl_last(&msp->ms_size_tree);
1426	if (rs == NULL)
1427		return (B_TRUE);
1428
1429	/*
1430	 * Calculate the number of 64-bit entries this segment would
1431	 * require when written to disk. If this single segment would be
1432	 * larger on-disk than the entire current on-disk structure, then
1433	 * clearly condensing will increase the on-disk structure size.
1434	 */
1435	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1436	entries = size / (MIN(size, SM_RUN_MAX));
1437	segsz = entries * sizeof (uint64_t);
1438
1439	optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
1440	object_size = space_map_length(msp->ms_sm);
1441
1442	dmu_object_info_from_db(sm->sm_dbuf, &doi);
1443	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
1444
1445	return (segsz <= object_size &&
1446	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
1447	    object_size > zfs_metaslab_condense_block_threshold * record_size);
1448}
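
/*
 * As a rough illustration: a loaded metaslab with 10,000 free segments has
 * an optimal (minimal) size of 80,000 bytes. With the default
 * zfs_condense_pct of 200 it is condensed only once the space map's on-disk
 * length reaches 160,000 bytes, spans more than four space map blocks, and
 * is at least as large as the encoding of the largest single free segment.
 */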
1449
1450/*
1451 * Condense the on-disk space map representation to its minimized form.
1452 * The minimized form consists of a small number of allocations followed by
1453 * the entries of the free range tree.
1454 */
1455static void
1456metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1457{
1458	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1459	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1460	range_tree_t *condense_tree;
1461	space_map_t *sm = msp->ms_sm;
1462
1463	ASSERT(MUTEX_HELD(&msp->ms_lock));
1464	ASSERT3U(spa_sync_pass(spa), ==, 1);
1465	ASSERT(msp->ms_loaded);
1466
1467	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1468	    "smp size %llu, segments %lu", txg, msp->ms_id, msp,
1469	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root));
1470
1471	/*
1472	 * Create a range tree that is 100% allocated. We remove segments
1473	 * that have been freed in this txg, any deferred frees that exist,
1474	 * and any allocation in the future. Removing segments should be
1475	 * a relatively inexpensive operation since we expect these trees to
1476	 * have a small number of nodes.
1477	 */
1478	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1479	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1480
1481	/*
1482	 * Remove what's been freed in this txg from the condense_tree.
1483	 * Since we're in sync_pass 1, we know that all the frees from
1484	 * this txg are in the freetree.
1485	 */
1486	range_tree_walk(freetree, range_tree_remove, condense_tree);
1487
1488	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1489		range_tree_walk(msp->ms_defertree[t],
1490		    range_tree_remove, condense_tree);
1491	}
1492
1493	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1494		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1495		    range_tree_remove, condense_tree);
1496	}
1497
1498	/*
1499	 * We're about to drop the metaslab's lock thus allowing
1500	 * other consumers to change its content. Set the
1501	 * metaslab's ms_condensing flag to ensure that
1502	 * allocations on this metaslab do not occur while we're
1503	 * in the middle of committing it to disk. This is only critical
1504	 * for the ms_tree as all other range trees use per txg
1505	 * views of their content.
1506	 */
1507	msp->ms_condensing = B_TRUE;
1508
1509	mutex_exit(&msp->ms_lock);
1510	space_map_truncate(sm, tx);
1511	mutex_enter(&msp->ms_lock);
1512
1513	/*
1514	 * While we would ideally like to create a space_map representation
1515	 * that consists only of allocation records, doing so can be
1516	 * prohibitively expensive because the in-core free tree can be
1517	 * large, and therefore computationally expensive to subtract
1518	 * from the condense_tree. Instead we sync out two trees, a cheap
1519	 * allocation only tree followed by the in-core free tree. While not
1520	 * optimal, this is typically close to optimal, and much cheaper to
1521	 * compute.
1522	 */
1523	space_map_write(sm, condense_tree, SM_ALLOC, tx);
1524	range_tree_vacate(condense_tree, NULL, NULL);
1525	range_tree_destroy(condense_tree);
1526
1527	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1528	msp->ms_condensing = B_FALSE;
1529}
1530
1531/*
1532 * Write a metaslab to disk in the context of the specified transaction group.
1533 */
1534void
1535metaslab_sync(metaslab_t *msp, uint64_t txg)
1536{
1537	metaslab_group_t *mg = msp->ms_group;
1538	vdev_t *vd = mg->mg_vd;
1539	spa_t *spa = vd->vdev_spa;
1540	objset_t *mos = spa_meta_objset(spa);
1541	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
1542	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
1543	range_tree_t **freed_tree =
1544	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1545	dmu_tx_t *tx;
1546	uint64_t object = space_map_object(msp->ms_sm);
1547
1548	ASSERT(!vd->vdev_ishole);
1549
1550	/*
1551	 * This metaslab has just been added so there's no work to do now.
1552	 */
1553	if (*freetree == NULL) {
1554		ASSERT3P(alloctree, ==, NULL);
1555		return;
1556	}
1557
1558	ASSERT3P(alloctree, !=, NULL);
1559	ASSERT3P(*freetree, !=, NULL);
1560	ASSERT3P(*freed_tree, !=, NULL);
1561
1562	if (range_tree_space(alloctree) == 0 &&
1563	    range_tree_space(*freetree) == 0)
1564		return;
1565
1566	/*
1567	 * The only state that can actually be changing concurrently with
1568	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
1569	 * be modifying this txg's alloctree, freetree, freed_tree, or
1570	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
1571	 * space_map ASSERTs. We drop it whenever we call into the DMU,
1572	 * because the DMU can call down to us (e.g. via zio_free()) at
1573	 * any time.
1574	 */
1575
1576	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1577
1578	if (msp->ms_sm == NULL) {
1579		uint64_t new_object;
1580
1581		new_object = space_map_alloc(mos, tx);
1582		VERIFY3U(new_object, !=, 0);
1583
1584		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
1585		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
1586		    &msp->ms_lock));
1587		ASSERT(msp->ms_sm != NULL);
1588	}
1589
1590	mutex_enter(&msp->ms_lock);
1591
1592	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1593	    metaslab_should_condense(msp)) {
1594		metaslab_condense(msp, txg, tx);
1595	} else {
1596		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1597		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1598	}
1599
1600	range_tree_vacate(alloctree, NULL, NULL);
1601
1602	if (msp->ms_loaded) {
1603		/*
1604		 * When the space map is loaded, we have an accurate
1605		 * histogram in the range tree. This gives us an opportunity
1606		 * to bring the space map's histogram up-to-date so we clear
1607		 * it first before updating it.
1608		 */
1609		space_map_histogram_clear(msp->ms_sm);
1610		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1611	} else {
1612		/*
1613		 * Since the space map is not loaded we simply update the
1614		 * existing histogram with what was freed in this txg. This
1615		 * means that the on-disk histogram may not have an accurate
1616		 * view of the free space but it's close enough to allow
1617		 * us to make allocation decisions.
1618		 */
1619		space_map_histogram_add(msp->ms_sm, *freetree, tx);
1620	}
1621
1622	/*
1623	 * For sync pass 1, we avoid traversing this txg's free range tree
1624	 * and instead will just swap the pointers for freetree and
1625	 * freed_tree. We can safely do this since the freed_tree is
1626	 * guaranteed to be empty on the initial pass.
1627	 */
1628	if (spa_sync_pass(spa) == 1) {
1629		range_tree_swap(freetree, freed_tree);
1630	} else {
1631		range_tree_vacate(*freetree, range_tree_add, *freed_tree);
1632	}
1633
1634	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1635	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1636
1637	mutex_exit(&msp->ms_lock);
1638
1639	if (object != space_map_object(msp->ms_sm)) {
1640		object = space_map_object(msp->ms_sm);
1641		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
1642		    msp->ms_id, sizeof (uint64_t), &object, tx);
1643	}
1644	dmu_tx_commit(tx);
1645}
1646
1647/*
1648 * Called after a transaction group has completely synced to mark
1649 * all of the metaslab's free space as usable.
1650 */
1651void
1652metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1653{
1654	metaslab_group_t *mg = msp->ms_group;
1655	vdev_t *vd = mg->mg_vd;
1656	range_tree_t **freed_tree;
1657	range_tree_t **defer_tree;
1658	int64_t alloc_delta, defer_delta;
1659
1660	ASSERT(!vd->vdev_ishole);
1661
1662	mutex_enter(&msp->ms_lock);
1663
1664	/*
1665	 * If this metaslab is just becoming available, initialize its
1666	 * alloctrees, freetrees, and defertree and add its capacity to
1667	 * the vdev.
1668	 */
1669	if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
1670		for (int t = 0; t < TXG_SIZE; t++) {
1671			ASSERT(msp->ms_alloctree[t] == NULL);
1672			ASSERT(msp->ms_freetree[t] == NULL);
1673
1674			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
1675			    &msp->ms_lock);
1676			msp->ms_freetree[t] = range_tree_create(NULL, msp,
1677			    &msp->ms_lock);
1678		}
1679
1680		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1681			ASSERT(msp->ms_defertree[t] == NULL);
1682
1683			msp->ms_defertree[t] = range_tree_create(NULL, msp,
1684			    &msp->ms_lock);
1685		}
1686
1687		vdev_space_update(vd, 0, 0, msp->ms_size);
1688	}
1689
1690	freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1691	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
1692
1693	alloc_delta = space_map_alloc_delta(msp->ms_sm);
1694	defer_delta = range_tree_space(*freed_tree) -
1695	    range_tree_space(*defer_tree);
1696
1697	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1698
1699	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1700	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1701
1702	/*
1703	 * If there's a metaslab_load() in progress, wait for it to complete
1704	 * so that we have a consistent view of the in-core space map.
1705	 */
1706	metaslab_load_wait(msp);
1707
1708	/*
1709	 * Move the frees from the defer_tree back to the free
1710	 * range tree (if it's loaded). Swap the freed_tree and the
1711	 * defer_tree -- this is safe to do because we've just emptied out
1712	 * the defer_tree.
1713	 */
1714	range_tree_vacate(*defer_tree,
1715	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
1716	range_tree_swap(freed_tree, defer_tree);
1717
1718	space_map_update(msp->ms_sm);
1719
1720	msp->ms_deferspace += defer_delta;
1721	ASSERT3S(msp->ms_deferspace, >=, 0);
1722	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
1723	if (msp->ms_deferspace != 0) {
1724		/*
1725		 * Keep syncing this metaslab until all deferred frees
1726		 * are back in circulation.
1727		 */
1728		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1729	}
1730
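	/*
	 * If the metaslab is loaded but hasn't satisfied an allocation for
	 * metaslab_unload_delay txgs (ms_access_txg has lapsed), verify that
	 * no still-open txg has pending allocations and then unload its
	 * range tree to reclaim memory, unless metaslab_debug_unload says to
	 * keep it resident.
	 */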
1731	if (msp->ms_loaded && msp->ms_access_txg < txg) {
1732		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1733			VERIFY0(range_tree_space(
1734			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
1735		}
1736
1737		if (!metaslab_debug_unload)
1738			metaslab_unload(msp);
1739	}
1740
1741	metaslab_group_sort(mg, msp, metaslab_weight(msp));
1742	mutex_exit(&msp->ms_lock);
1743
1744}
1745
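/*
 * Called after a vdev's metaslabs have synced (from vdev_sync_done(), as of
 * this revision) to recompute whether the group should accept new
 * allocations and to start preloading the metaslabs most likely to be
 * activated next.
 */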
1746void
1747metaslab_sync_reassess(metaslab_group_t *mg)
1748{
1749	metaslab_group_alloc_update(mg);
1750
1751	/*
1752	 * Preload the next potential metaslabs
1753	 */
1754	metaslab_group_preload(mg);
1755}
1756
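/*
 * Return the distance, in bytes, between this metaslab and the metaslab that
 * holds the given DVA, or 1ULL << 63 if the DVA lives on a different vdev.
 * metaslab_group_alloc() uses this to keep the copies of a block far apart
 * on the same vdev.  For example, with a (hypothetical) vdev_ms_shift of 34
 * (16GB metaslabs), a DVA sitting in metaslab 3 is (7 - 3) << 34 = 64GB away
 * from metaslab 7.
 */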
1757static uint64_t
1758metaslab_distance(metaslab_t *msp, dva_t *dva)
1759{
1760	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1761	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1762	uint64_t start = msp->ms_id;
1763
1764	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1765		return (1ULL << 63);
1766
1767	if (offset < start)
1768		return ((start - offset) << ms_shift);
1769	if (offset > start)
1770		return ((offset - start) << ms_shift);
1771	return (0);
1772}
1773
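/*
 * Search this group, in weight order, for a metaslab that can satisfy an
 * allocation of asize bytes; return the allocated offset or -1ULL on
 * failure.  The chosen metaslab is activated as PRIMARY unless one of the
 * block's existing DVAs already lives on this vdev, in which case it is
 * activated as SECONDARY and must also lie at least min_distance away from
 * each DVA already allocated for the block (1.5 * min_distance if the
 * metaslab has nothing allocated yet).  Metaslabs that are currently
 * condensing are skipped.
 */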
1774static uint64_t
1775metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1776    uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
1777{
1778	spa_t *spa = mg->mg_vd->vdev_spa;
1779	metaslab_t *msp = NULL;
1780	uint64_t offset = -1ULL;
1781	avl_tree_t *t = &mg->mg_metaslab_tree;
1782	uint64_t activation_weight;
1783	uint64_t target_distance;
1784	int i;
1785
1786	activation_weight = METASLAB_WEIGHT_PRIMARY;
1787	for (i = 0; i < d; i++) {
1788		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1789			activation_weight = METASLAB_WEIGHT_SECONDARY;
1790			break;
1791		}
1792	}
1793
1794	for (;;) {
1795		boolean_t was_active;
1796
1797		mutex_enter(&mg->mg_lock);
1798		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1799			if (msp->ms_weight < asize) {
1800				spa_dbgmsg(spa, "%s: failed to meet weight "
1801				    "requirement: vdev %llu, txg %llu, mg %p, "
1802				    "msp %p, psize %llu, asize %llu, "
1803				    "weight %llu", spa_name(spa),
1804				    mg->mg_vd->vdev_id, txg,
1805				    mg, msp, psize, asize, msp->ms_weight);
1806				mutex_exit(&mg->mg_lock);
1807				return (-1ULL);
1808			}
1809
1810			/*
1811			 * If the selected metaslab is condensing, skip it.
1812			 */
1813			if (msp->ms_condensing)
1814				continue;
1815
1816			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1817			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1818				break;
1819
1820			target_distance = min_distance +
1821			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
1822			    min_distance >> 1);
1823
1824			for (i = 0; i < d; i++)
1825				if (metaslab_distance(msp, &dva[i]) <
1826				    target_distance)
1827					break;
1828			if (i == d)
1829				break;
1830		}
1831		mutex_exit(&mg->mg_lock);
1832		if (msp == NULL)
1833			return (-1ULL);
1834
1835		mutex_enter(&msp->ms_lock);
1836
1837		/*
1838		 * Ensure that the metaslab we have selected is still
1839		 * capable of handling our request. It's possible that
1840		 * another thread may have changed the weight while we
1841		 * were blocked on the metaslab lock.
1842		 */
1843		if (msp->ms_weight < asize || (was_active &&
1844		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1845		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1846			mutex_exit(&msp->ms_lock);
1847			continue;
1848		}
1849
1850		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1851		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
1852			metaslab_passivate(msp,
1853			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1854			mutex_exit(&msp->ms_lock);
1855			continue;
1856		}
1857
1858		if (metaslab_activate(msp, activation_weight) != 0) {
1859			mutex_exit(&msp->ms_lock);
1860			continue;
1861		}
1862
1863		/*
1864		 * If this metaslab is currently condensing then pick again as
1865		 * we can't manipulate this metaslab until it's committed
1866		 * to disk.
1867		 */
1868		if (msp->ms_condensing) {
1869			mutex_exit(&msp->ms_lock);
1870			continue;
1871		}
1872
1873		if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
1874			break;
1875
1876		metaslab_passivate(msp, metaslab_block_maxsize(msp));
1877		mutex_exit(&msp->ms_lock);
1878	}
1879
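	/*
	 * This is the first allocation from this metaslab in this txg if its
	 * alloctree is still empty; dirty the metaslab so that
	 * metaslab_sync() will push the allocation out to the space map.
	 */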
1880	if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
1881		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1882
1883	range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
1884	msp->ms_access_txg = txg + metaslab_unload_delay;
1885
1886	mutex_exit(&msp->ms_lock);
1887
1888	return (offset);
1889}
1890
1891/*
1892 * Allocate a block for the specified i/o.
1893 */
1894static int
1895metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1896    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1897{
1898	metaslab_group_t *mg, *rotor;
1899	vdev_t *vd;
1900	int dshift = 3;
1901	int all_zero;
1902	int zio_lock = B_FALSE;
1903	boolean_t allocatable;
1904	uint64_t offset = -1ULL;
1905	uint64_t asize;
1906	uint64_t distance;
1907
1908	ASSERT(!DVA_IS_VALID(&dva[d]));
1909
1910	/*
1911	 * For testing, make some blocks above a certain size be gang blocks.
1912	 */
1913	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1914		return (SET_ERROR(ENOSPC));
1915
1916	/*
1917	 * Start at the rotor and loop through all mgs until we find something.
1918	 * Note that there's no locking on mc_rotor or mc_aliquot because
1919	 * nothing actually breaks if we miss a few updates -- we just won't
1920	 * allocate quite as evenly.  It all balances out over time.
1921	 *
1922	 * If we are doing ditto or log blocks, try to spread them across
1923	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
1924	 * allocated all of our ditto blocks, then try and spread them out on
1925	 * that vdev as much as possible.  If it turns out to not be possible,
1926	 * gradually lower our standards until anything becomes acceptable.
1927	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1928	 * gives us hope of containing our fault domains to something we're
1929	 * able to reason about.  Otherwise, any two top-level vdev failures
1930	 * will guarantee the loss of data.  With consecutive allocation,
1931	 * only two adjacent top-level vdev failures will result in data loss.
1932	 *
1933	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
1934	 * ourselves on the same vdev as our gang block header.  That
1935	 * way, we can hope for locality in vdev_cache, plus it makes our
1936	 * fault domains something tractable.
1937	 */
1938	if (hintdva) {
1939		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1940
1941		/*
1942		 * It's possible the vdev we're using as the hint no
1943		 * longer exists (i.e. removed). Consult the rotor when
1944		 * all else fails.
1945		 */
1946		if (vd != NULL) {
1947			mg = vd->vdev_mg;
1948
1949			if (flags & METASLAB_HINTBP_AVOID &&
1950			    mg->mg_next != NULL)
1951				mg = mg->mg_next;
1952		} else {
1953			mg = mc->mc_rotor;
1954		}
1955	} else if (d != 0) {
1956		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1957		mg = vd->vdev_mg->mg_next;
1958	} else {
1959		mg = mc->mc_rotor;
1960	}
1961
1962	/*
1963	 * If the hint put us into the wrong metaslab class, or into a
1964	 * metaslab group that has been passivated, just follow the rotor.
1965	 */
1966	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1967		mg = mc->mc_rotor;
1968
1969	rotor = mg;
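	/*
	 * Each pass of the rotor loop below requires candidate metaslabs to
	 * be at least vdev_asize >> dshift away from the block's existing
	 * DVAs; dshift starts at 3, i.e. one eighth of the vdev.  If the
	 * allocation fails while some group still imposed a nonzero
	 * distance, dshift is bumped (halving the requirement) and we go
	 * around again.  As a last resort we retry once more holding
	 * SCL_ZIO, so that vdev state can't change underneath the
	 * vdev_allocatable() checks.
	 */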
1970top:
1971	all_zero = B_TRUE;
1972	do {
1973		ASSERT(mg->mg_activation_count == 1);
1974
1975		vd = mg->mg_vd;
1976
1977		/*
1978		 * Don't allocate from faulted devices.
1979		 */
1980		if (zio_lock) {
1981			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1982			allocatable = vdev_allocatable(vd);
1983			spa_config_exit(spa, SCL_ZIO, FTAG);
1984		} else {
1985			allocatable = vdev_allocatable(vd);
1986		}
1987
1988		/*
1989		 * Determine if the selected metaslab group is eligible
1990		 * for allocations. If we're ganging or have requested
1991		 * an allocation for the smallest gang block size
1992		 * then we don't want to avoid allocating to this
1993		 * metaslab group. If we're in this condition we should
1994		 * try to allocate from any device possible so that we
1995		 * don't inadvertently return ENOSPC and suspend the pool
1996		 * even though space is still available.
1997		 */
1998		if (allocatable && CAN_FASTGANG(flags) &&
1999		    psize > SPA_GANGBLOCKSIZE)
2000			allocatable = metaslab_group_allocatable(mg);
2001
2002		if (!allocatable)
2003			goto next;
2004
2005		/*
2006		 * Avoid writing single-copy data to a failing vdev
2007		 * unless the user instructs us that it is okay.
2008		 */
2009		if ((vd->vdev_stat.vs_write_errors > 0 ||
2010		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
2011		    d == 0 && dshift == 3 &&
2012		    !(zfs_write_to_degraded && vd->vdev_state ==
2013		    VDEV_STATE_DEGRADED)) {
2014			all_zero = B_FALSE;
2015			goto next;
2016		}
2017
2018		ASSERT(mg->mg_class == mc);
2019
2020		distance = vd->vdev_asize >> dshift;
2021		if (distance <= (1ULL << vd->vdev_ms_shift))
2022			distance = 0;
2023		else
2024			all_zero = B_FALSE;
2025
2026		asize = vdev_psize_to_asize(vd, psize);
2027		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
2028
2029		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
2030		    dva, d);
2031		if (offset != -1ULL) {
2032			/*
2033			 * If we've just selected this metaslab group,
2034			 * figure out whether the corresponding vdev is
2035			 * over- or under-used relative to the pool,
2036			 * and set an allocation bias to even it out.
2037			 */
2038			if (mc->mc_aliquot == 0) {
2039				vdev_stat_t *vs = &vd->vdev_stat;
2040				int64_t vu, cu;
2041
2042				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
2043				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
2044
2045				/*
2046				 * Calculate how much more or less we should
2047				 * try to allocate from this device during
2048				 * this iteration around the rotor.
2049				 * For example, if a device is 80% full
2050				 * and the pool is 20% full then we should
2051				 * reduce allocations by 60% on this device.
2052				 *
2053				 * mg_bias = (20 - 80) * 512K / 100 = -307K
2054				 *
2055				 * This reduces allocations by 307K for this
2056				 * iteration.
2057				 */
2058				mg->mg_bias = ((cu - vu) *
2059				    (int64_t)mg->mg_aliquot) / 100;
2060			}
2061
2062			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
2063			    mg->mg_aliquot + mg->mg_bias) {
2064				mc->mc_rotor = mg->mg_next;
2065				mc->mc_aliquot = 0;
2066			}
2067
2068			DVA_SET_VDEV(&dva[d], vd->vdev_id);
2069			DVA_SET_OFFSET(&dva[d], offset);
2070			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
2071			DVA_SET_ASIZE(&dva[d], asize);
2072
2073			return (0);
2074		}
2075next:
2076		mc->mc_rotor = mg->mg_next;
2077		mc->mc_aliquot = 0;
2078	} while ((mg = mg->mg_next) != rotor);
2079
2080	if (!all_zero) {
2081		dshift++;
2082		ASSERT(dshift < 64);
2083		goto top;
2084	}
2085
2086	if (!allocatable && !zio_lock) {
2087		dshift = 3;
2088		zio_lock = B_TRUE;
2089		goto top;
2090	}
2091
2092	bzero(&dva[d], sizeof (dva_t));
2093
2094	return (SET_ERROR(ENOSPC));
2095}
2096
2097/*
2098 * Free the block represented by DVA in the context of the specified
2099 * transaction group.
2100 */
2101static void
2102metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
2103{
2104	uint64_t vdev = DVA_GET_VDEV(dva);
2105	uint64_t offset = DVA_GET_OFFSET(dva);
2106	uint64_t size = DVA_GET_ASIZE(dva);
2107	vdev_t *vd;
2108	metaslab_t *msp;
2109
2110	ASSERT(DVA_IS_VALID(dva));
2111
2112	if (txg > spa_freeze_txg(spa))
2113		return;
2114
2115	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2116	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
2117		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
2118		    (u_longlong_t)vdev, (u_longlong_t)offset);
2119		ASSERT(0);
2120		return;
2121	}
2122
2123	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2124
2125	if (DVA_GET_GANG(dva))
2126		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2127
2128	mutex_enter(&msp->ms_lock);
2129
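	/*
	 * A "now" free undoes an allocation made in this same txg: the
	 * segment is removed from the txg's alloctree and returned directly
	 * to ms_tree.  Otherwise the free is queued on this txg's freetree
	 * and flows through the sync/defer pipeline before the space becomes
	 * allocatable again.
	 */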
2130	if (now) {
2131		range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
2132		    offset, size);
2133
2134		VERIFY(!msp->ms_condensing);
2135		VERIFY3U(offset, >=, msp->ms_start);
2136		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
2137		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
2138		    msp->ms_size);
2139		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2140		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2141		range_tree_add(msp->ms_tree, offset, size);
2142	} else {
2143		if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
2144			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2145		range_tree_add(msp->ms_freetree[txg & TXG_MASK],
2146		    offset, size);
2147	}
2148
2149	mutex_exit(&msp->ms_lock);
2150}
2151
2152/*
2153 * Intent log support: upon opening the pool after a crash, notify the SPA
2154 * of blocks that the intent log has allocated for immediate write, but
2155 * which are still considered free by the SPA because the last transaction
2156 * group didn't commit yet.
2157 */
2158static int
2159metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
2160{
2161	uint64_t vdev = DVA_GET_VDEV(dva);
2162	uint64_t offset = DVA_GET_OFFSET(dva);
2163	uint64_t size = DVA_GET_ASIZE(dva);
2164	vdev_t *vd;
2165	metaslab_t *msp;
2166	int error = 0;
2167
2168	ASSERT(DVA_IS_VALID(dva));
2169
2170	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2171	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
2172		return (SET_ERROR(ENXIO));
2173
2174	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2175
2176	if (DVA_GET_GANG(dva))
2177		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2178
2179	mutex_enter(&msp->ms_lock);
2180
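	/*
	 * Activate the metaslab (with SECONDARY weight) primarily so that
	 * its range tree is loaded before the range_tree_contains() check
	 * below; an unloaded metaslab can't tell us whether the claimed
	 * block is still free.
	 */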
2181	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
2182		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
2183
2184	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
2185		error = SET_ERROR(ENOENT);
2186
2187	if (error || txg == 0) {	/* txg == 0 indicates dry run */
2188		mutex_exit(&msp->ms_lock);
2189		return (error);
2190	}
2191
2192	VERIFY(!msp->ms_condensing);
2193	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2194	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2195	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
2196	range_tree_remove(msp->ms_tree, offset, size);
2197
2198	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
2199		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2200			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2201		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2202	}
2203
2204	mutex_exit(&msp->ms_lock);
2205
2206	return (0);
2207}
2208
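/*
 * Allocate ndvas DVAs (copies) for bp by calling metaslab_alloc_dva() once
 * per copy.  If any copy fails, the DVAs allocated so far are freed again
 * (with now == B_TRUE, since they were born in this txg) and the error is
 * returned; on success the block pointer's birth txg is set.
 */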
2209int
2210metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2211    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
2212{
2213	dva_t *dva = bp->blk_dva;
2214	dva_t *hintdva = hintbp->blk_dva;
2215	int error = 0;
2216
2217	ASSERT(bp->blk_birth == 0);
2218	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2219
2220	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2221
2222	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
2223		spa_config_exit(spa, SCL_ALLOC, FTAG);
2224		return (SET_ERROR(ENOSPC));
2225	}
2226
2227	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
2228	ASSERT(BP_GET_NDVAS(bp) == 0);
2229	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2230
2231	for (int d = 0; d < ndvas; d++) {
2232		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2233		    txg, flags);
2234		if (error != 0) {
2235			for (d--; d >= 0; d--) {
2236				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2237				bzero(&dva[d], sizeof (dva_t));
2238			}
2239			spa_config_exit(spa, SCL_ALLOC, FTAG);
2240			return (error);
2241		}
2242	}
2243	ASSERT(error == 0);
2244	ASSERT(BP_GET_NDVAS(bp) == ndvas);
2245
2246	spa_config_exit(spa, SCL_ALLOC, FTAG);
2247
2248	BP_SET_BIRTH(bp, txg, txg);
2249
2250	return (0);
2251}
2252
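/*
 * Free every DVA of bp in the given txg.  When "now" is set the block was
 * born in this txg, so its space goes straight back to the metaslabs;
 * otherwise the frees flow through the deferred-free pipeline described
 * above metaslab_sync_done().
 */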
2253void
2254metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
2255{
2256	const dva_t *dva = bp->blk_dva;
2257	int ndvas = BP_GET_NDVAS(bp);
2258
2259	ASSERT(!BP_IS_HOLE(bp));
2260	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
2261
2262	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
2263
2264	for (int d = 0; d < ndvas; d++)
2265		metaslab_free_dva(spa, &dva[d], txg, now);
2266
2267	spa_config_exit(spa, SCL_FREE, FTAG);
2268}
2269
2270int
2271metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
2272{
2273	const dva_t *dva = bp->blk_dva;
2274	int ndvas = BP_GET_NDVAS(bp);
2275	int error = 0;
2276
2277	ASSERT(!BP_IS_HOLE(bp));
2278
2279	if (txg != 0) {
2280		/*
2281		 * First do a dry run to make sure all DVAs are claimable,
2282		 * so we don't have to unwind from partial failures below.
2283		 */
2284		if ((error = metaslab_claim(spa, bp, 0)) != 0)
2285			return (error);
2286	}
2287
2288	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2289
2290	for (int d = 0; d < ndvas; d++)
2291		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
2292			break;
2293
2294	spa_config_exit(spa, SCL_ALLOC, FTAG);
2295
2296	ASSERT(error == 0 || txg == 0);
2297
2298	return (error);
2299}
2300
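/*
 * Debugging aid, enabled by the ZFS_DEBUG_ZIO_FREE flag: before a block is
 * freed, check that none of its DVAs already appears in the metaslab's
 * in-core free tree, freetrees, or defertrees, which would mean the block
 * is being freed twice.
 */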
2301void
2302metaslab_check_free(spa_t *spa, const blkptr_t *bp)
2303{
2304	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
2305		return;
2306
2307	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2308	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
2309		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
2310		vdev_t *vd = vdev_lookup_top(spa, vdev);
2311		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
2312		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2313		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2314
2315		if (msp->ms_loaded)
2316			range_tree_verify(msp->ms_tree, offset, size);
2317
2318		for (int j = 0; j < TXG_SIZE; j++)
2319			range_tree_verify(msp->ms_freetree[j], offset, size);
2320		for (int j = 0; j < TXG_DEFER_SIZE; j++)
2321			range_tree_verify(msp->ms_defertree[j], offset, size);
2322	}
2323	spa_config_exit(spa, SCL_VDEV, FTAG);
2324}
2325