metaslab.c revision 262093
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2013 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 */
26
27#include <sys/zfs_context.h>
28#include <sys/dmu.h>
29#include <sys/dmu_tx.h>
30#include <sys/space_map.h>
31#include <sys/metaslab_impl.h>
32#include <sys/vdev_impl.h>
33#include <sys/zio.h>
34#include <sys/spa_impl.h>
35
36SYSCTL_DECL(_vfs_zfs);
37SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
38
39/*
40 * Allow allocations to switch to gang blocks quickly. We do this to
41 * avoid having to load lots of space_maps in a given txg. There are,
42 * however, some cases where we want to avoid "fast" ganging and instead
43 * we want to do an exhaustive search of all metaslabs on this device.
44 * Currently we don't allow any gang, zil, or dump device related allocations
45 * to "fast" gang.
46 */
47#define	CAN_FASTGANG(flags) \
48	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
49	METASLAB_GANG_AVOID)))
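/*
 * For example (illustrative): an allocation whose flags contain none of the
 * gang-related bits above satisfies CAN_FASTGANG() and may "fast" gang,
 * while one tagged METASLAB_GANG_HEADER, METASLAB_GANG_CHILD or
 * METASLAB_GANG_AVOID does not, and instead falls back to the exhaustive
 * search described above.
 */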
50
51#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
52#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
53#define	METASLAB_ACTIVE_MASK		\
54	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
55
56uint64_t metaslab_aliquot = 512ULL << 10;
57uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
58TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang);
59SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
60    &metaslab_gang_bang, 0,
61    "Force gang block allocation for blocks larger than or equal to this value");
62
63/*
64 * The in-core space map representation is more compact than its on-disk form.
65 * The zfs_condense_pct determines how much more compact the in-core
66 * space_map representation must be before we compact it on-disk.
67 * Values should be greater than or equal to 100.
68 */
69int zfs_condense_pct = 200;
70TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct);
71SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
72    &zfs_condense_pct, 0,
73    "Condense on-disk spacemap when it is larger than this percentage"
74    " of its in-memory counterpart");
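/*
 * For example, with the default value of 200 the on-disk space map must be
 * at least twice the size of its minimal form (one 64-bit entry per in-core
 * free segment) before metaslab_should_condense() considers rewriting it.
 */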
75
76/*
77 * This value defines the number of allowed allocation failures per vdev.
78 * If a device reaches this threshold in a given txg then we consider skipping
79 * allocations on that device. The value of zfs_mg_alloc_failures is computed
80 * in zio_init() unless it has been overridden in /etc/system.
81 */
82int zfs_mg_alloc_failures = 0;
83TUNABLE_INT("vfs.zfs.mg_alloc_failures", &zfs_mg_alloc_failures);
84SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_alloc_failures, CTLFLAG_RWTUN,
85    &zfs_mg_alloc_failures, 0,
86    "Number of allowed allocation failures per vdev");
87
88/*
89 * The zfs_mg_noalloc_threshold defines which metaslab groups should
90 * be eligible for allocation. The value is defined as a percentage of
91 * a free space. Metaslab groups that have more free space than
92 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
93 * a metaslab group's free space is less than or equal to the
94 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
95 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
96 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
97 * groups are allowed to accept allocations. Gang blocks are always
98 * eligible to allocate on any metaslab group. The default value of 0 means
99 * no metaslab group will be excluded based on this criterion.
100 */
101int zfs_mg_noalloc_threshold = 0;
102TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold);
103SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
104    &zfs_mg_noalloc_threshold, 0,
105    "Percentage of metaslab group size that should be free"
106    " to make it eligible for allocation");
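/*
 * For example, setting this to 30 makes the allocator skip any group whose
 * free capacity has fallen to 30% or below, for as long as at least one
 * other group in the same class still has more than 30% free.
 */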
107
108/*
109 * When set will load all metaslabs when pool is first opened.
110 */
111int metaslab_debug_load = 0;
112TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load);
113SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
114    &metaslab_debug_load, 0,
115    "Load all metaslabs when pool is first opened");
116
117/*
118 * When set will prevent metaslabs from being unloaded.
119 */
120int metaslab_debug_unload = 0;
121TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload);
122SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
123    &metaslab_debug_unload, 0,
124    "Prevent metaslabs from being unloaded");
125
126/*
127 * Minimum size which forces the dynamic allocator to change
128 * its allocation strategy.  Once the space map cannot satisfy
129 * an allocation of this size it switches to a more aggressive
130 * strategy (i.e. search by size rather than offset).
131 */
132uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
133TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold",
134    &metaslab_df_alloc_threshold);
135SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
136    &metaslab_df_alloc_threshold, 0,
137    "Minimum size which forces the dynamic allocator to change its allocation strategy");
138
139/*
140 * The minimum free space, in percent, which must be available
141 * in a space map to continue allocations in a first-fit fashion.
142 * Once the space_map's free space drops below this level we dynamically
143 * switch to using best-fit allocations.
144 */
145int metaslab_df_free_pct = 4;
146TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct);
147SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
148    &metaslab_df_free_pct, 0,
149    "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion");
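/*
 * Taken together with vfs.zfs.metaslab.df_alloc_threshold above, the
 * defaults keep the dynamic allocator in first-fit mode while the largest
 * free segment is at least SPA_MAXBLOCKSIZE and at least 4% of the
 * metaslab is free; otherwise metaslab_df_alloc() switches to the
 * size-ordered (best-fit) tree.
 */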
150
151/*
152 * A metaslab is considered "free" if it contains a contiguous
153 * segment which is greater than metaslab_min_alloc_size.
154 */
155uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
156TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size",
157    &metaslab_min_alloc_size);
158SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
159    &metaslab_min_alloc_size, 0,
160    "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size");
161
162/*
163 * Percentage of all cpus that can be used by the metaslab taskq.
164 */
165int metaslab_load_pct = 50;
166TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct);
167SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
168    &metaslab_load_pct, 0,
169    "Percentage of cpus that can be used by the metaslab taskq");
170
171/*
172 * Determines how many txgs a metaslab may remain loaded without having any
173 * allocations from it. As long as a metaslab continues to be used we will
174 * keep it loaded.
175 */
176int metaslab_unload_delay = TXG_SIZE * 2;
177TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay);
178SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
179    &metaslab_unload_delay, 0,
180    "Number of TXGs that an unused metaslab can be kept in memory");
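/*
 * With TXG_SIZE == 4 the default works out to 8 txgs of inactivity before
 * metaslab_sync_done() may unload an idle metaslab.
 */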
181
182/*
183 * Should we be willing to write data to degraded vdevs?
184 */
185boolean_t zfs_write_to_degraded = B_FALSE;
186SYSCTL_INT(_vfs_zfs, OID_AUTO, write_to_degraded, CTLFLAG_RWTUN,
187    &zfs_write_to_degraded, 0, "Allow writing data to degraded vdevs");
188TUNABLE_INT("vfs.zfs.write_to_degraded", &zfs_write_to_degraded);
189
190/*
191 * Max number of metaslabs per group to preload.
192 */
193int metaslab_preload_limit = SPA_DVAS_PER_BP;
194TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit);
195SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
196    &metaslab_preload_limit, 0,
197    "Max number of metaslabs per group to preload");
198
199/*
200 * Enable/disable preloading of metaslabs.
201 */
202boolean_t metaslab_preload_enabled = B_TRUE;
203TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled);
204SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
205    &metaslab_preload_enabled, 0,
206    "Enable/disable preloading of metaslabs");
207
208/*
209 * Enable/disable additional weight factor for each metaslab.
210 */
211boolean_t metaslab_weight_factor_enable = B_FALSE;
212TUNABLE_INT("vfs.zfs.metaslab.weight_factor_enable",
213    &metaslab_weight_factor_enable);
214SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, weight_factor_enable, CTLFLAG_RWTUN,
215    &metaslab_weight_factor_enable, 0,
216    "Enable additional weight factor for each metaslab");
217
218
219/*
220 * ==========================================================================
221 * Metaslab classes
222 * ==========================================================================
223 */
224metaslab_class_t *
225metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
226{
227	metaslab_class_t *mc;
228
229	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
230
231	mc->mc_spa = spa;
232	mc->mc_rotor = NULL;
233	mc->mc_ops = ops;
234
235	return (mc);
236}
237
238void
239metaslab_class_destroy(metaslab_class_t *mc)
240{
241	ASSERT(mc->mc_rotor == NULL);
242	ASSERT(mc->mc_alloc == 0);
243	ASSERT(mc->mc_deferred == 0);
244	ASSERT(mc->mc_space == 0);
245	ASSERT(mc->mc_dspace == 0);
246
247	kmem_free(mc, sizeof (metaslab_class_t));
248}
249
250int
251metaslab_class_validate(metaslab_class_t *mc)
252{
253	metaslab_group_t *mg;
254	vdev_t *vd;
255
256	/*
257	 * Must hold one of the spa_config locks.
258	 */
259	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
260	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
261
262	if ((mg = mc->mc_rotor) == NULL)
263		return (0);
264
265	do {
266		vd = mg->mg_vd;
267		ASSERT(vd->vdev_mg != NULL);
268		ASSERT3P(vd->vdev_top, ==, vd);
269		ASSERT3P(mg->mg_class, ==, mc);
270		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
271	} while ((mg = mg->mg_next) != mc->mc_rotor);
272
273	return (0);
274}
275
276void
277metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
278    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
279{
280	atomic_add_64(&mc->mc_alloc, alloc_delta);
281	atomic_add_64(&mc->mc_deferred, defer_delta);
282	atomic_add_64(&mc->mc_space, space_delta);
283	atomic_add_64(&mc->mc_dspace, dspace_delta);
284}
285
286void
287metaslab_class_minblocksize_update(metaslab_class_t *mc)
288{
289	metaslab_group_t *mg;
290	vdev_t *vd;
291	uint64_t minashift = UINT64_MAX;
292
293	if ((mg = mc->mc_rotor) == NULL) {
294		mc->mc_minblocksize = SPA_MINBLOCKSIZE;
295		return;
296	}
297
298	do {
299		vd = mg->mg_vd;
300		if (vd->vdev_ashift < minashift)
301			minashift = vd->vdev_ashift;
302	} while ((mg = mg->mg_next) != mc->mc_rotor);
303
304	mc->mc_minblocksize = 1ULL << minashift;
305}
306
307uint64_t
308metaslab_class_get_alloc(metaslab_class_t *mc)
309{
310	return (mc->mc_alloc);
311}
312
313uint64_t
314metaslab_class_get_deferred(metaslab_class_t *mc)
315{
316	return (mc->mc_deferred);
317}
318
319uint64_t
320metaslab_class_get_space(metaslab_class_t *mc)
321{
322	return (mc->mc_space);
323}
324
325uint64_t
326metaslab_class_get_dspace(metaslab_class_t *mc)
327{
328	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
329}
330
331uint64_t
332metaslab_class_get_minblocksize(metaslab_class_t *mc)
333{
334	return (mc->mc_minblocksize);
335}
336
337/*
338 * ==========================================================================
339 * Metaslab groups
340 * ==========================================================================
341 */
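/*
 * AVL comparator for the per-group metaslab tree: heavier metaslabs sort
 * first, and the start offset breaks ties so that entries are unique.
 */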
342static int
343metaslab_compare(const void *x1, const void *x2)
344{
345	const metaslab_t *m1 = x1;
346	const metaslab_t *m2 = x2;
347
348	if (m1->ms_weight < m2->ms_weight)
349		return (1);
350	if (m1->ms_weight > m2->ms_weight)
351		return (-1);
352
353	/*
354	 * If the weights are identical, use the offset to force uniqueness.
355	 */
356	if (m1->ms_start < m2->ms_start)
357		return (-1);
358	if (m1->ms_start > m2->ms_start)
359		return (1);
360
361	ASSERT3P(m1, ==, m2);
362
363	return (0);
364}
365
366/*
367 * Update the allocatable flag and the metaslab group's capacity.
368 * The allocatable flag is set to true if the group's free capacity is
369 * above the zfs_mg_noalloc_threshold. If a metaslab group transitions
370 * from allocatable to non-allocatable or vice versa then the metaslab
371 * group's class is updated to reflect the transition.
372 */
373static void
374metaslab_group_alloc_update(metaslab_group_t *mg)
375{
376	vdev_t *vd = mg->mg_vd;
377	metaslab_class_t *mc = mg->mg_class;
378	vdev_stat_t *vs = &vd->vdev_stat;
379	boolean_t was_allocatable;
380
381	ASSERT(vd == vd->vdev_top);
382
383	mutex_enter(&mg->mg_lock);
384	was_allocatable = mg->mg_allocatable;
385
386	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
387	    (vs->vs_space + 1);
388
389	mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
390
391	/*
392	 * The mc_alloc_groups maintains a count of the number of
393	 * groups in this metaslab class that are still above the
394	 * zfs_mg_noalloc_threshold. This is used by the allocating
395	 * threads to determine if they should avoid allocations to
396	 * a given group. The allocator will avoid allocations to a group
397	 * if that group has reached or is below the zfs_mg_noalloc_threshold
398	 * and there are still other groups that are above the threshold.
399	 * When a group transitions from allocatable to non-allocatable or
400	 * vice versa we update the metaslab class to reflect that change.
401	 * When the mc_alloc_groups value drops to 0 that means that all
402	 * groups have reached the zfs_mg_noalloc_threshold making all groups
403	 * eligible for allocations. This effectively means that all devices
404	 * are balanced again.
405	 */
406	if (was_allocatable && !mg->mg_allocatable)
407		mc->mc_alloc_groups--;
408	else if (!was_allocatable && mg->mg_allocatable)
409		mc->mc_alloc_groups++;
410	mutex_exit(&mg->mg_lock);
411}
412
413metaslab_group_t *
414metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
415{
416	metaslab_group_t *mg;
417
418	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
419	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
420	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
421	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
422	mg->mg_vd = vd;
423	mg->mg_class = mc;
424	mg->mg_activation_count = 0;
425
426	mg->mg_taskq = taskq_create("metaslab_group_tasksq", metaslab_load_pct,
427	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
428
429	return (mg);
430}
431
432void
433metaslab_group_destroy(metaslab_group_t *mg)
434{
435	ASSERT(mg->mg_prev == NULL);
436	ASSERT(mg->mg_next == NULL);
437	/*
438	 * We may have gone below zero with the activation count
439	 * either because we never activated in the first place or
440	 * because we're done, and possibly removing the vdev.
441	 */
442	ASSERT(mg->mg_activation_count <= 0);
443
444	avl_destroy(&mg->mg_metaslab_tree);
445	mutex_destroy(&mg->mg_lock);
446	kmem_free(mg, sizeof (metaslab_group_t));
447}
448
449void
450metaslab_group_activate(metaslab_group_t *mg)
451{
452	metaslab_class_t *mc = mg->mg_class;
453	metaslab_group_t *mgprev, *mgnext;
454
455	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
456
457	ASSERT(mc->mc_rotor != mg);
458	ASSERT(mg->mg_prev == NULL);
459	ASSERT(mg->mg_next == NULL);
460	ASSERT(mg->mg_activation_count <= 0);
461
462	if (++mg->mg_activation_count <= 0)
463		return;
464
465	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
466	metaslab_group_alloc_update(mg);
467
468	if ((mgprev = mc->mc_rotor) == NULL) {
469		mg->mg_prev = mg;
470		mg->mg_next = mg;
471	} else {
472		mgnext = mgprev->mg_next;
473		mg->mg_prev = mgprev;
474		mg->mg_next = mgnext;
475		mgprev->mg_next = mg;
476		mgnext->mg_prev = mg;
477	}
478	mc->mc_rotor = mg;
479	metaslab_class_minblocksize_update(mc);
480}
481
482void
483metaslab_group_passivate(metaslab_group_t *mg)
484{
485	metaslab_class_t *mc = mg->mg_class;
486	metaslab_group_t *mgprev, *mgnext;
487
488	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
489
490	if (--mg->mg_activation_count != 0) {
491		ASSERT(mc->mc_rotor != mg);
492		ASSERT(mg->mg_prev == NULL);
493		ASSERT(mg->mg_next == NULL);
494		ASSERT(mg->mg_activation_count < 0);
495		return;
496	}
497
498	taskq_wait(mg->mg_taskq);
499
500	mgprev = mg->mg_prev;
501	mgnext = mg->mg_next;
502
503	if (mg == mgnext) {
504		mc->mc_rotor = NULL;
505	} else {
506		mc->mc_rotor = mgnext;
507		mgprev->mg_next = mgnext;
508		mgnext->mg_prev = mgprev;
509	}
510
511	mg->mg_prev = NULL;
512	mg->mg_next = NULL;
513	metaslab_class_minblocksize_update(mc);
514}
515
516static void
517metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
518{
519	mutex_enter(&mg->mg_lock);
520	ASSERT(msp->ms_group == NULL);
521	msp->ms_group = mg;
522	msp->ms_weight = 0;
523	avl_add(&mg->mg_metaslab_tree, msp);
524	mutex_exit(&mg->mg_lock);
525}
526
527static void
528metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
529{
530	mutex_enter(&mg->mg_lock);
531	ASSERT(msp->ms_group == mg);
532	avl_remove(&mg->mg_metaslab_tree, msp);
533	msp->ms_group = NULL;
534	mutex_exit(&mg->mg_lock);
535}
536
537static void
538metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
539{
540	/*
541	 * Although in principle the weight can be any value, in
542	 * practice we do not use values in the range [1, 510].
543	 */
544	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
545	ASSERT(MUTEX_HELD(&msp->ms_lock));
546
547	mutex_enter(&mg->mg_lock);
548	ASSERT(msp->ms_group == mg);
549	avl_remove(&mg->mg_metaslab_tree, msp);
550	msp->ms_weight = weight;
551	avl_add(&mg->mg_metaslab_tree, msp);
552	mutex_exit(&mg->mg_lock);
553}
554
555/*
556 * Determine if a given metaslab group should skip allocations. A metaslab
557 * group should avoid allocations if its used capacity has crossed the
558 * zfs_mg_noalloc_threshold and there is at least one metaslab group
559 * that can still handle allocations.
560 */
561static boolean_t
562metaslab_group_allocatable(metaslab_group_t *mg)
563{
564	vdev_t *vd = mg->mg_vd;
565	spa_t *spa = vd->vdev_spa;
566	metaslab_class_t *mc = mg->mg_class;
567
568	/*
569	 * A metaslab group is considered allocatable if its free capacity
570	 * is greater than the set value of zfs_mg_noalloc_threshold, it's
571	 * associated with a slog, or there are no other metaslab groups
572	 * with free capacity greater than zfs_mg_noalloc_threshold.
573	 */
574	return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
575	    mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
576}
577
578/*
579 * ==========================================================================
580 * Range tree callbacks
581 * ==========================================================================
582 */
583
584/*
585 * Comparison function for the private size-ordered tree. Tree is sorted
586 * by size, larger sizes at the end of the tree.
587 */
588static int
589metaslab_rangesize_compare(const void *x1, const void *x2)
590{
591	const range_seg_t *r1 = x1;
592	const range_seg_t *r2 = x2;
593	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
594	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
595
596	if (rs_size1 < rs_size2)
597		return (-1);
598	if (rs_size1 > rs_size2)
599		return (1);
600
601	if (r1->rs_start < r2->rs_start)
602		return (-1);
603
604	if (r1->rs_start > r2->rs_start)
605		return (1);
606
607	return (0);
608}
609
610/*
611 * Create any block allocator specific components. The current allocators
612 * rely on using both a size-ordered AVL tree and an array of uint64_t's.
613 */
614static void
615metaslab_rt_create(range_tree_t *rt, void *arg)
616{
617	metaslab_t *msp = arg;
618
619	ASSERT3P(rt->rt_arg, ==, msp);
620	ASSERT(msp->ms_tree == NULL);
621
622	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
623	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
624}
625
626/*
627 * Destroy the block allocator specific components.
628 */
629static void
630metaslab_rt_destroy(range_tree_t *rt, void *arg)
631{
632	metaslab_t *msp = arg;
633
634	ASSERT3P(rt->rt_arg, ==, msp);
635	ASSERT3P(msp->ms_tree, ==, rt);
636	ASSERT0(avl_numnodes(&msp->ms_size_tree));
637
638	avl_destroy(&msp->ms_size_tree);
639}
640
641static void
642metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
643{
644	metaslab_t *msp = arg;
645
646	ASSERT3P(rt->rt_arg, ==, msp);
647	ASSERT3P(msp->ms_tree, ==, rt);
648	VERIFY(!msp->ms_condensing);
649	avl_add(&msp->ms_size_tree, rs);
650}
651
652static void
653metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
654{
655	metaslab_t *msp = arg;
656
657	ASSERT3P(rt->rt_arg, ==, msp);
658	ASSERT3P(msp->ms_tree, ==, rt);
659	VERIFY(!msp->ms_condensing);
660	avl_remove(&msp->ms_size_tree, rs);
661}
662
663static void
664metaslab_rt_vacate(range_tree_t *rt, void *arg)
665{
666	metaslab_t *msp = arg;
667
668	ASSERT3P(rt->rt_arg, ==, msp);
669	ASSERT3P(msp->ms_tree, ==, rt);
670
671	/*
672	 * Normally one would walk the tree freeing nodes along the way.
673	 * Since the nodes are shared with the range trees we can avoid
674	 * walking all nodes and just reinitialize the avl tree. The nodes
675	 * will be freed by the range tree, so we don't want to free them here.
676	 */
677	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
678	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
679}
680
681static range_tree_ops_t metaslab_rt_ops = {
682	metaslab_rt_create,
683	metaslab_rt_destroy,
684	metaslab_rt_add,
685	metaslab_rt_remove,
686	metaslab_rt_vacate
687};
688
689/*
690 * ==========================================================================
691 * Metaslab block operations
692 * ==========================================================================
693 */
694
695/*
696 * Return the maximum contiguous segment within the metaslab.
697 */
698uint64_t
699metaslab_block_maxsize(metaslab_t *msp)
700{
701	avl_tree_t *t = &msp->ms_size_tree;
702	range_seg_t *rs;
703
704	if (t == NULL || (rs = avl_last(t)) == NULL)
705		return (0ULL);
706
707	return (rs->rs_end - rs->rs_start);
708}
709
710uint64_t
711metaslab_block_alloc(metaslab_t *msp, uint64_t size)
712{
713	uint64_t start;
714	range_tree_t *rt = msp->ms_tree;
715
716	VERIFY(!msp->ms_condensing);
717
718	start = msp->ms_ops->msop_alloc(msp, size);
719	if (start != -1ULL) {
720		vdev_t *vd = msp->ms_group->mg_vd;
721
722		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
723		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
724		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
725		range_tree_remove(rt, start, size);
726	}
727	return (start);
728}
729
730/*
731 * ==========================================================================
732 * Common allocator routines
733 * ==========================================================================
734 */
735
736/*
737 * This is a helper function that can be used by the allocator to find
738 * a suitable block to allocate. This will search the specified AVL
739 * tree looking for a block that matches the specified criteria.
740 */
741static uint64_t
742metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
743    uint64_t align)
744{
745	range_seg_t *rs, rsearch;
746	avl_index_t where;
747
748	rsearch.rs_start = *cursor;
749	rsearch.rs_end = *cursor + size;
750
751	rs = avl_find(t, &rsearch, &where);
752	if (rs == NULL)
753		rs = avl_nearest(t, where, AVL_AFTER);
754
755	while (rs != NULL) {
756		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
757
758		if (offset + size <= rs->rs_end) {
759			*cursor = offset + size;
760			return (offset);
761		}
762		rs = AVL_NEXT(t, rs);
763	}
764
765	/*
766	 * If we know we've searched the whole map (*cursor == 0), give up.
767	 * Otherwise, reset the cursor to the beginning and try again.
768	 */
769	if (*cursor == 0)
770		return (-1ULL);
771
772	*cursor = 0;
773	return (metaslab_block_picker(t, cursor, size, align));
774}
775
776/*
777 * ==========================================================================
778 * The first-fit block allocator
779 * ==========================================================================
780 */
781static uint64_t
782metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
783{
784	/*
785	 * Find the largest power of 2 block size that evenly divides the
786	 * requested size. This is used to try to allocate blocks with similar
787	 * alignment from the same area of the metaslab (i.e. same cursor
788	 * bucket), but it does not guarantee that allocations of other sizes
789	 * will not be made in the same region.
790	 */
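	/*
	 * For example (illustrative): a 12K (0x3000) request yields
	 * align = 0x3000 & -0x3000 = 0x1000 (4K), so the search resumes from
	 * the cursor kept in ms_lbas[highbit(0x1000) - 1], i.e. ms_lbas[12].
	 */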
791	uint64_t align = size & -size;
792	uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
793	avl_tree_t *t = &msp->ms_tree->rt_root;
794
795	return (metaslab_block_picker(t, cursor, size, align));
796}
797
798/* ARGSUSED */
799static boolean_t
800metaslab_ff_fragmented(metaslab_t *msp)
801{
802	return (B_TRUE);
803}
804
805static metaslab_ops_t metaslab_ff_ops = {
806	metaslab_ff_alloc,
807	metaslab_ff_fragmented
808};
809
810/*
811 * ==========================================================================
812 * Dynamic block allocator -
813 * Uses the first-fit allocation scheme until space gets low and then
814 * adjusts to a best-fit allocation method. Uses metaslab_df_alloc_threshold
815 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
816 * ==========================================================================
817 */
818static uint64_t
819metaslab_df_alloc(metaslab_t *msp, uint64_t size)
820{
821	/*
822	 * Find the largest power of 2 block size that evenly divides the
823	 * requested size. This is used to try to allocate blocks with similar
824	 * alignment from the same area of the metaslab (i.e. same cursor
825	 * bucket), but it does not guarantee that allocations of other sizes
826	 * will not be made in the same region.
827	 */
828	uint64_t align = size & -size;
829	uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
830	range_tree_t *rt = msp->ms_tree;
831	avl_tree_t *t = &rt->rt_root;
832	uint64_t max_size = metaslab_block_maxsize(msp);
833	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
834
835	ASSERT(MUTEX_HELD(&msp->ms_lock));
836	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
837
838	if (max_size < size)
839		return (-1ULL);
840
841	/*
842	 * If we're running low on space switch to using the size
843	 * sorted AVL tree (best-fit).
844	 */
845	if (max_size < metaslab_df_alloc_threshold ||
846	    free_pct < metaslab_df_free_pct) {
847		t = &msp->ms_size_tree;
848		*cursor = 0;
849	}
850
851	return (metaslab_block_picker(t, cursor, size, 1ULL));
852}
853
854static boolean_t
855metaslab_df_fragmented(metaslab_t *msp)
856{
857	range_tree_t *rt = msp->ms_tree;
858	uint64_t max_size = metaslab_block_maxsize(msp);
859	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
860
861	if (max_size >= metaslab_df_alloc_threshold &&
862	    free_pct >= metaslab_df_free_pct)
863		return (B_FALSE);
864
865	return (B_TRUE);
866}
867
868static metaslab_ops_t metaslab_df_ops = {
869	metaslab_df_alloc,
870	metaslab_df_fragmented
871};
872
873/*
874 * ==========================================================================
875 * Cursor fit block allocator -
876 * Select the largest region in the metaslab, set the cursor to the beginning
877 * of the range and the cursor_end to the end of the range. As allocations
878 * are made, advance the cursor. Continue allocating from the cursor until
879 * the range is exhausted and then find a new range.
880 * ==========================================================================
881 */
882static uint64_t
883metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
884{
885	range_tree_t *rt = msp->ms_tree;
886	avl_tree_t *t = &msp->ms_size_tree;
887	uint64_t *cursor = &msp->ms_lbas[0];
888	uint64_t *cursor_end = &msp->ms_lbas[1];
889	uint64_t offset = 0;
890
891	ASSERT(MUTEX_HELD(&msp->ms_lock));
892	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
893
894	ASSERT3U(*cursor_end, >=, *cursor);
895
896	if ((*cursor + size) > *cursor_end) {
897		range_seg_t *rs;
898
899		rs = avl_last(&msp->ms_size_tree);
900		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
901			return (-1ULL);
902
903		*cursor = rs->rs_start;
904		*cursor_end = rs->rs_end;
905	}
906
907	offset = *cursor;
908	*cursor += size;
909
910	return (offset);
911}
912
913static boolean_t
914metaslab_cf_fragmented(metaslab_t *msp)
915{
916	return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
917}
918
919static metaslab_ops_t metaslab_cf_ops = {
920	metaslab_cf_alloc,
921	metaslab_cf_fragmented
922};
923
924/*
925 * ==========================================================================
926 * New dynamic fit allocator -
927 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
928 * contiguous blocks. If no region is found then just use the largest segment
929 * that remains.
930 * ==========================================================================
931 */
932
933/*
934 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
935 * to request from the allocator.
936 */
937uint64_t metaslab_ndf_clump_shift = 4;
938
939static uint64_t
940metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
941{
942	avl_tree_t *t = &msp->ms_tree->rt_root;
943	avl_index_t where;
944	range_seg_t *rs, rsearch;
945	uint64_t hbit = highbit(size);
946	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
947	uint64_t max_size = metaslab_block_maxsize(msp);
948
949	ASSERT(MUTEX_HELD(&msp->ms_lock));
950	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
951
952	if (max_size < size)
953		return (-1ULL);
954
955	rsearch.rs_start = *cursor;
956	rsearch.rs_end = *cursor + size;
957
958	rs = avl_find(t, &rsearch, &where);
959	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
960		t = &msp->ms_size_tree;
961
962		rsearch.rs_start = 0;
963		rsearch.rs_end = MIN(max_size,
964		    1ULL << (hbit + metaslab_ndf_clump_shift));
965		rs = avl_find(t, &rsearch, &where);
966		if (rs == NULL)
967			rs = avl_nearest(t, where, AVL_AFTER);
968		ASSERT(rs != NULL);
969	}
970
971	if ((rs->rs_end - rs->rs_start) >= size) {
972		*cursor = rs->rs_start + size;
973		return (rs->rs_start);
974	}
975	return (-1ULL);
976}
977
978static boolean_t
979metaslab_ndf_fragmented(metaslab_t *msp)
980{
981	return (metaslab_block_maxsize(msp) <=
982	    (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
983}
984
985static metaslab_ops_t metaslab_ndf_ops = {
986	metaslab_ndf_alloc,
987	metaslab_ndf_fragmented
988};
989
990metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
991
992/*
993 * ==========================================================================
994 * Metaslabs
995 * ==========================================================================
996 */
997
998/*
999 * Wait for any in-progress metaslab loads to complete.
1000 */
1001void
1002metaslab_load_wait(metaslab_t *msp)
1003{
1004	ASSERT(MUTEX_HELD(&msp->ms_lock));
1005
1006	while (msp->ms_loading) {
1007		ASSERT(!msp->ms_loaded);
1008		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1009	}
1010}
1011
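/*
 * Load the metaslab's on-disk free space into its in-core ms_tree. If no
 * space map object has been allocated yet, the entire metaslab is treated
 * as free. Any frees still held in the defer trees are then removed so the
 * loaded tree only describes space that is actually allocatable.
 */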
1012int
1013metaslab_load(metaslab_t *msp)
1014{
1015	int error = 0;
1016
1017	ASSERT(MUTEX_HELD(&msp->ms_lock));
1018	ASSERT(!msp->ms_loaded);
1019	ASSERT(!msp->ms_loading);
1020
1021	msp->ms_loading = B_TRUE;
1022
1023	/*
1024	 * If the space map has not been allocated yet, then treat
1025	 * all the space in the metaslab as free and add it to the
1026	 * ms_tree.
1027	 */
1028	if (msp->ms_sm != NULL)
1029		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
1030	else
1031		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
1032
1033	msp->ms_loaded = (error == 0);
1034	msp->ms_loading = B_FALSE;
1035
1036	if (msp->ms_loaded) {
1037		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1038			range_tree_walk(msp->ms_defertree[t],
1039			    range_tree_remove, msp->ms_tree);
1040		}
1041	}
1042	cv_broadcast(&msp->ms_load_cv);
1043	return (error);
1044}
1045
1046void
1047metaslab_unload(metaslab_t *msp)
1048{
1049	ASSERT(MUTEX_HELD(&msp->ms_lock));
1050	range_tree_vacate(msp->ms_tree, NULL, NULL);
1051	msp->ms_loaded = B_FALSE;
1052	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1053}
1054
1055metaslab_t *
1056metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
1057{
1058	vdev_t *vd = mg->mg_vd;
1059	objset_t *mos = vd->vdev_spa->spa_meta_objset;
1060	metaslab_t *msp;
1061
1062	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1063	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1064	cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
1065	msp->ms_id = id;
1066	msp->ms_start = id << vd->vdev_ms_shift;
1067	msp->ms_size = 1ULL << vd->vdev_ms_shift;
1068
1069	/*
1070	 * We only open space map objects that already exist. All others
1071	 * will be opened when we finally allocate an object for it.
1072	 */
1073	if (object != 0) {
1074		VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
1075		    msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
1076		ASSERT(msp->ms_sm != NULL);
1077	}
1078
1079	/*
1080	 * We create the main range tree here, but we don't create the
1081	 * alloctree and freetree until metaslab_sync_done().  This serves
1082	 * two purposes: it allows metaslab_sync_done() to detect the
1083	 * addition of new space; and for debugging, it ensures that we'd
1084	 * data fault on any attempt to use this metaslab before it's ready.
1085	 */
1086	msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
1087	metaslab_group_add(mg, msp);
1088
1089	msp->ms_ops = mg->mg_class->mc_ops;
1090
1091	/*
1092	 * If we're opening an existing pool (txg == 0) or creating
1093	 * a new one (txg == TXG_INITIAL), all space is available now.
1094	 * If we're adding space to an existing pool, the new space
1095	 * does not become available until after this txg has synced.
1096	 */
1097	if (txg <= TXG_INITIAL)
1098		metaslab_sync_done(msp, 0);
1099
1100	/*
1101	 * If metaslab_debug_load is set and we're initializing a metaslab
1102	 * that has an allocated space_map object, then load its space
1103	 * map so that we can verify frees.
1104	 */
1105	if (metaslab_debug_load && msp->ms_sm != NULL) {
1106		mutex_enter(&msp->ms_lock);
1107		VERIFY0(metaslab_load(msp));
1108		mutex_exit(&msp->ms_lock);
1109	}
1110
1111	if (txg != 0) {
1112		vdev_dirty(vd, 0, NULL, txg);
1113		vdev_dirty(vd, VDD_METASLAB, msp, txg);
1114	}
1115
1116	return (msp);
1117}
1118
1119void
1120metaslab_fini(metaslab_t *msp)
1121{
1122	metaslab_group_t *mg = msp->ms_group;
1123
1124	metaslab_group_remove(mg, msp);
1125
1126	mutex_enter(&msp->ms_lock);
1127
1128	VERIFY(msp->ms_group == NULL);
1129	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1130	    0, -msp->ms_size);
1131	space_map_close(msp->ms_sm);
1132
1133	metaslab_unload(msp);
1134	range_tree_destroy(msp->ms_tree);
1135
1136	for (int t = 0; t < TXG_SIZE; t++) {
1137		range_tree_destroy(msp->ms_alloctree[t]);
1138		range_tree_destroy(msp->ms_freetree[t]);
1139	}
1140
1141	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1142		range_tree_destroy(msp->ms_defertree[t]);
1143	}
1144
1145	ASSERT0(msp->ms_deferspace);
1146
1147	mutex_exit(&msp->ms_lock);
1148	cv_destroy(&msp->ms_load_cv);
1149	mutex_destroy(&msp->ms_lock);
1150
1151	kmem_free(msp, sizeof (metaslab_t));
1152}
1153
1154/*
1155 * Apply a weighting factor based on the histogram information for this
1156 * metaslab. The current weighting factor is somewhat arbitrary and requires
1157 * additional investigation. The implementation provides a measure of
1158 * "weighted" free space and gives a higher weighting for larger contiguous
1159 * regions. The weighting factor is determined by counting the number of
1160 * sm_shift sectors that exist in each region represented by the histogram.
1161 * That value is then multiplied by the power of 2 exponent and the sm_shift
1162 * value.
1163 *
1164 * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
1165 * metaslab has an sm_shift value of 9 (512B):
1166 *
1167 * 1) calculate the number of sm_shift sectors in the region:
1168 *	2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1169 * 2) multiply by the power of 2 exponent and the sm_shift value:
1170 *	16384 * 21 * 9 = 3096576
1171 * This value will be added to the weighting of the metaslab.
1172 */
1173static uint64_t
1174metaslab_weight_factor(metaslab_t *msp)
1175{
1176	uint64_t factor = 0;
1177	uint64_t sectors;
1178	int i;
1179
1180	/*
1181	 * A null space map means that the entire metaslab is free,
1182	 * calculate a weight factor that spans the entire size of the
1183	 * metaslab.
1184	 */
1185	if (msp->ms_sm == NULL) {
1186		vdev_t *vd = msp->ms_group->mg_vd;
1187
1188		i = highbit(msp->ms_size) - 1;
1189		sectors = msp->ms_size >> vd->vdev_ashift;
1190		return (sectors * i * vd->vdev_ashift);
1191	}
1192
1193	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1194		return (0);
1195
1196	for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1197		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1198			continue;
1199
1200		/*
1201		 * Determine the number of sm_shift sectors in the region
1202		 * indicated by the histogram. For example, given an
1203		 * sm_shift value of 9 (512 bytes) and i = 4 then we know
1204		 * that we're looking at an 8K region in the histogram
1205		 * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1206		 * number of sm_shift sectors (512 bytes in this example),
1207		 * we would take 8192 / 512 = 16. Since the histogram
1208		 * is offset by sm_shift we can simply use the value
1209		 * of i to calculate this (i.e. 2^i = 16 where i = 4).
1210		 */
1211		sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i;
1212		factor += (i + msp->ms_sm->sm_shift) * sectors;
1213	}
1214	return (factor * msp->ms_sm->sm_shift);
1215}
1216
1217static uint64_t
1218metaslab_weight(metaslab_t *msp)
1219{
1220	metaslab_group_t *mg = msp->ms_group;
1221	vdev_t *vd = mg->mg_vd;
1222	uint64_t weight, space;
1223
1224	ASSERT(MUTEX_HELD(&msp->ms_lock));
1225
1226	/*
1227	 * This vdev is in the process of being removed so there is nothing
1228	 * for us to do here.
1229	 */
1230	if (vd->vdev_removing) {
1231		ASSERT0(space_map_allocated(msp->ms_sm));
1232		ASSERT0(vd->vdev_ms_shift);
1233		return (0);
1234	}
1235
1236	/*
1237	 * The baseline weight is the metaslab's free space.
1238	 */
1239	space = msp->ms_size - space_map_allocated(msp->ms_sm);
1240	weight = space;
1241
1242	/*
1243	 * Modern disks have uniform bit density and constant angular velocity.
1244	 * Therefore, the outer recording zones are faster (higher bandwidth)
1245	 * than the inner zones by the ratio of outer to inner track diameter,
1246	 * which is typically around 2:1.  We account for this by assigning
1247	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1248	 * In effect, this means that we'll select the metaslab with the most
1249	 * free bandwidth rather than simply the one with the most free space.
1250	 */
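	/*
	 * Illustration: the first metaslab (ms_id == 0) keeps the full 2x
	 * multiplier, while the last metaslab on the vdev ends up just
	 * above 1x.
	 */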
1251	weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1252	ASSERT(weight >= space && weight <= 2 * space);
1253
1254	msp->ms_factor = metaslab_weight_factor(msp);
1255	if (metaslab_weight_factor_enable)
1256		weight += msp->ms_factor;
1257
1258	if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) {
1259		/*
1260		 * If this metaslab is one we're actively using, adjust its
1261		 * weight to make it preferable to any inactive metaslab so
1262		 * we'll polish it off.
1263		 */
1264		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1265	}
1266
1267	return (weight);
1268}
1269
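/*
 * Activate a metaslab for allocation: load it if necessary and tag its
 * weight with the requested primary/secondary bit so that it sorts ahead
 * of inactive metaslabs in the group tree.
 */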
1270static int
1271metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1272{
1273	ASSERT(MUTEX_HELD(&msp->ms_lock));
1274
1275	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1276		metaslab_load_wait(msp);
1277		if (!msp->ms_loaded) {
1278			int error = metaslab_load(msp);
1279			if (error) {
1280				metaslab_group_sort(msp->ms_group, msp, 0);
1281				return (error);
1282			}
1283		}
1284
1285		metaslab_group_sort(msp->ms_group, msp,
1286		    msp->ms_weight | activation_weight);
1287	}
1288	ASSERT(msp->ms_loaded);
1289	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1290
1291	return (0);
1292}
1293
1294static void
1295metaslab_passivate(metaslab_t *msp, uint64_t size)
1296{
1297	/*
1298	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1299	 * this metaslab again.  In that case, it had better be empty,
1300	 * or we would be leaving space on the table.
1301	 */
1302	ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1303	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1304	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1305}
1306
1307static void
1308metaslab_preload(void *arg)
1309{
1310	metaslab_t *msp = arg;
1311	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1312
1313	mutex_enter(&msp->ms_lock);
1314	metaslab_load_wait(msp);
1315	if (!msp->ms_loaded)
1316		(void) metaslab_load(msp);
1317
1318	/*
1319	 * Set the ms_access_txg value so that we don't unload it right away.
1320	 */
1321	msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
1322	mutex_exit(&msp->ms_lock);
1323}
1324
1325static void
1326metaslab_group_preload(metaslab_group_t *mg)
1327{
1328	spa_t *spa = mg->mg_vd->vdev_spa;
1329	metaslab_t *msp;
1330	avl_tree_t *t = &mg->mg_metaslab_tree;
1331	int m = 0;
1332
1333	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1334		taskq_wait(mg->mg_taskq);
1335		return;
1336	}
1337	mutex_enter(&mg->mg_lock);
1338
1339	/*
1340	 * Prefetch the next potential metaslabs
1341	 */
1342	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
1343
1344		/* If we have reached our preload limit then we're done */
1345		if (++m > metaslab_preload_limit)
1346			break;
1347
1348		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1349		    msp, TQ_SLEEP) != 0);
1350	}
1351	mutex_exit(&mg->mg_lock);
1352}
1353
1354/*
1355 * Determine if the space map's on-disk footprint is past our tolerance
1356 * for inefficiency. We would like to use the following criteria to make
1357 * our decision:
1358 *
1359 * 1. The size of the space map object should not dramatically increase as a
1360 * result of writing out the free space range tree.
1361 *
1362 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
1363 * times the size of the free space range tree representation
1364 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
1365 *
1366 * Checking the first condition is tricky since we don't want to walk
1367 * the entire AVL tree calculating the estimated on-disk size. Instead we
1368 * use the size-ordered range tree in the metaslab and calculate the
1369 * size required to write out the largest segment in our free tree. If the
1370 * size required to represent that segment on disk is larger than the space
1371 * map object then we avoid condensing this map.
1372 *
1373 * To determine the second criterion we use a best-case estimate and assume
1374 * each segment can be represented on-disk as a single 64-bit entry. We refer
1375 * to this best-case estimate as the space map's minimal form.
1376 */
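/*
 * Illustrative numbers: a free tree holding 10,000 segments has a minimal
 * form of 10,000 * 8 bytes = 80,000 bytes, so with the default
 * zfs_condense_pct of 200 we only condense once the on-disk space map has
 * grown to at least 160,000 bytes (provided the largest-segment check
 * described above also passes).
 */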
1377static boolean_t
1378metaslab_should_condense(metaslab_t *msp)
1379{
1380	space_map_t *sm = msp->ms_sm;
1381	range_seg_t *rs;
1382	uint64_t size, entries, segsz;
1383
1384	ASSERT(MUTEX_HELD(&msp->ms_lock));
1385	ASSERT(msp->ms_loaded);
1386
1387	/*
1388	 * Use ms_size_tree, the size-ordered AVL tree, to obtain the largest
1389	 * segment in the free tree. If the tree is empty
1390	 * then we should condense the map.
1391	 */
1392	rs = avl_last(&msp->ms_size_tree);
1393	if (rs == NULL)
1394		return (B_TRUE);
1395
1396	/*
1397	 * Calculate the number of 64-bit entries this segment would
1398	 * require when written to disk. If this single segment would be
1399	 * larger on-disk than the entire current on-disk structure, then
1400	 * clearly condensing will increase the on-disk structure size.
1401	 */
1402	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1403	entries = size / (MIN(size, SM_RUN_MAX));
1404	segsz = entries * sizeof (uint64_t);
1405
1406	return (segsz <= space_map_length(msp->ms_sm) &&
1407	    space_map_length(msp->ms_sm) >= (zfs_condense_pct *
1408	    sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
1409}
1410
1411/*
1412 * Condense the on-disk space map representation to its minimized form.
1413 * The minimized form consists of a small number of allocations followed by
1414 * the entries of the free range tree.
1415 */
1416static void
1417metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1418{
1419	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1420	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1421	range_tree_t *condense_tree;
1422	space_map_t *sm = msp->ms_sm;
1423
1424	ASSERT(MUTEX_HELD(&msp->ms_lock));
1425	ASSERT3U(spa_sync_pass(spa), ==, 1);
1426	ASSERT(msp->ms_loaded);
1427
1428	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1429	    "smp size %llu, segments %lu", txg, msp->ms_id, msp,
1430	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root));
1431
1432	/*
1433	 * Create a range tree that is 100% allocated. We remove segments
1434	 * that have been freed in this txg, any deferred frees that exist,
1435	 * and any allocation in the future. Removing segments should be
1436	 * a relatively inexpensive operation since we expect these trees to
1437	 * have a small number of nodes.
1438	 */
1439	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1440	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1441
1442	/*
1443	 * Remove what's been freed in this txg from the condense_tree.
1444	 * Since we're in sync_pass 1, we know that all the frees from
1445	 * this txg are in the freetree.
1446	 */
1447	range_tree_walk(freetree, range_tree_remove, condense_tree);
1448
1449	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1450		range_tree_walk(msp->ms_defertree[t],
1451		    range_tree_remove, condense_tree);
1452	}
1453
1454	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1455		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1456		    range_tree_remove, condense_tree);
1457	}
1458
1459	/*
1460	 * We're about to drop the metaslab's lock thus allowing
1461	 * other consumers to change its content. Set the
1462	 * metaslab's ms_condensing flag to ensure that
1463	 * allocations on this metaslab do not occur while we're
1464	 * in the middle of committing it to disk. This is only critical
1465	 * for the ms_tree as all other range trees use per txg
1466	 * views of their content.
1467	 */
1468	msp->ms_condensing = B_TRUE;
1469
1470	mutex_exit(&msp->ms_lock);
1471	space_map_truncate(sm, tx);
1472	mutex_enter(&msp->ms_lock);
1473
1474	/*
1475	 * While we would ideally like to create a space_map representation
1476	 * that consists only of allocation records, doing so can be
1477	 * prohibitively expensive because the in-core free tree can be
1478	 * large, and therefore computationally expensive to subtract
1479	 * from the condense_tree. Instead we sync out two trees, a cheap
1480	 * allocation only tree followed by the in-core free tree. While not
1481	 * optimal, this is typically close to optimal, and much cheaper to
1482	 * compute.
1483	 */
1484	space_map_write(sm, condense_tree, SM_ALLOC, tx);
1485	range_tree_vacate(condense_tree, NULL, NULL);
1486	range_tree_destroy(condense_tree);
1487
1488	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1489	msp->ms_condensing = B_FALSE;
1490}
1491
1492/*
1493 * Write a metaslab to disk in the context of the specified transaction group.
1494 */
1495void
1496metaslab_sync(metaslab_t *msp, uint64_t txg)
1497{
1498	metaslab_group_t *mg = msp->ms_group;
1499	vdev_t *vd = mg->mg_vd;
1500	spa_t *spa = vd->vdev_spa;
1501	objset_t *mos = spa_meta_objset(spa);
1502	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
1503	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
1504	range_tree_t **freed_tree =
1505	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1506	dmu_tx_t *tx;
1507	uint64_t object = space_map_object(msp->ms_sm);
1508
1509	ASSERT(!vd->vdev_ishole);
1510
1511	/*
1512	 * This metaslab has just been added so there's no work to do now.
1513	 */
1514	if (*freetree == NULL) {
1515		ASSERT3P(alloctree, ==, NULL);
1516		return;
1517	}
1518
1519	ASSERT3P(alloctree, !=, NULL);
1520	ASSERT3P(*freetree, !=, NULL);
1521	ASSERT3P(*freed_tree, !=, NULL);
1522
1523	if (range_tree_space(alloctree) == 0 &&
1524	    range_tree_space(*freetree) == 0)
1525		return;
1526
1527	/*
1528	 * The only state that can actually be changing concurrently with
1529	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
1530	 * be modifying this txg's alloctree, freetree, freed_tree, or
1531	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
1532	 * space_map ASSERTs. We drop it whenever we call into the DMU,
1533	 * because the DMU can call down to us (e.g. via zio_free()) at
1534	 * any time.
1535	 */
1536
1537	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1538
1539	if (msp->ms_sm == NULL) {
1540		uint64_t new_object;
1541
1542		new_object = space_map_alloc(mos, tx);
1543		VERIFY3U(new_object, !=, 0);
1544
1545		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
1546		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
1547		    &msp->ms_lock));
1548		ASSERT(msp->ms_sm != NULL);
1549	}
1550
1551	mutex_enter(&msp->ms_lock);
1552
1553	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1554	    metaslab_should_condense(msp)) {
1555		metaslab_condense(msp, txg, tx);
1556	} else {
1557		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1558		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1559	}
1560
1561	range_tree_vacate(alloctree, NULL, NULL);
1562
1563	if (msp->ms_loaded) {
1564		/*
1565		 * When the space map is loaded, we have an accurate
1566		 * histogram in the range tree. This gives us an opportunity
1567		 * to bring the space map's histogram up-to-date so we clear
1568		 * it first before updating it.
1569		 */
1570		space_map_histogram_clear(msp->ms_sm);
1571		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1572	} else {
1573		/*
1574		 * Since the space map is not loaded we simply update the
1575		 * existing histogram with what was freed in this txg. This
1576		 * means that the on-disk histogram may not have an accurate
1577		 * view of the free space but it's close enough to allow
1578		 * us to make allocation decisions.
1579		 */
1580		space_map_histogram_add(msp->ms_sm, *freetree, tx);
1581	}
1582
1583	/*
1584	 * For sync pass 1, we avoid traversing this txg's free range tree
1585	 * and instead will just swap the pointers for freetree and
1586	 * freed_tree. We can safely do this since the freed_tree is
1587	 * guaranteed to be empty on the initial pass.
1588	 */
1589	if (spa_sync_pass(spa) == 1) {
1590		range_tree_swap(freetree, freed_tree);
1591	} else {
1592		range_tree_vacate(*freetree, range_tree_add, *freed_tree);
1593	}
1594
1595	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1596	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1597
1598	mutex_exit(&msp->ms_lock);
1599
1600	if (object != space_map_object(msp->ms_sm)) {
1601		object = space_map_object(msp->ms_sm);
1602		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
1603		    msp->ms_id, sizeof (uint64_t), &object, tx);
1604	}
1605	dmu_tx_commit(tx);
1606}
1607
1608/*
1609 * Called after a transaction group has completely synced to mark
1610 * all of the metaslab's free space as usable.
1611 */
1612void
1613metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1614{
1615	metaslab_group_t *mg = msp->ms_group;
1616	vdev_t *vd = mg->mg_vd;
1617	range_tree_t **freed_tree;
1618	range_tree_t **defer_tree;
1619	int64_t alloc_delta, defer_delta;
1620
1621	ASSERT(!vd->vdev_ishole);
1622
1623	mutex_enter(&msp->ms_lock);
1624
1625	/*
1626	 * If this metaslab is just becoming available, initialize its
1627	 * alloctrees, freetrees, and defertree and add its capacity to
1628	 * the vdev.
1629	 */
1630	if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
1631		for (int t = 0; t < TXG_SIZE; t++) {
1632			ASSERT(msp->ms_alloctree[t] == NULL);
1633			ASSERT(msp->ms_freetree[t] == NULL);
1634
1635			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
1636			    &msp->ms_lock);
1637			msp->ms_freetree[t] = range_tree_create(NULL, msp,
1638			    &msp->ms_lock);
1639		}
1640
1641		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1642			ASSERT(msp->ms_defertree[t] == NULL);
1643
1644			msp->ms_defertree[t] = range_tree_create(NULL, msp,
1645			    &msp->ms_lock);
1646		}
1647
1648		vdev_space_update(vd, 0, 0, msp->ms_size);
1649	}
1650
1651	freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1652	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
1653
1654	alloc_delta = space_map_alloc_delta(msp->ms_sm);
1655	defer_delta = range_tree_space(*freed_tree) -
1656	    range_tree_space(*defer_tree);
1657
1658	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1659
1660	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1661	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1662
1663	/*
1664	 * If there's a metaslab_load() in progress, wait for it to complete
1665	 * so that we have a consistent view of the in-core space map.
1666	 */
1667	metaslab_load_wait(msp);
1668
1669	/*
1670	 * Move the frees from the defer_tree back to the free
1671	 * range tree (if it's loaded). Swap the freed_tree and the
1672	 * defer_tree -- this is safe to do because we've just emptied out
1673	 * the defer_tree.
1674	 */
1675	range_tree_vacate(*defer_tree,
1676	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
1677	range_tree_swap(freed_tree, defer_tree);
1678
1679	space_map_update(msp->ms_sm);
1680
1681	msp->ms_deferspace += defer_delta;
1682	ASSERT3S(msp->ms_deferspace, >=, 0);
1683	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
1684	if (msp->ms_deferspace != 0) {
1685		/*
1686		 * Keep syncing this metaslab until all deferred frees
1687		 * are back in circulation.
1688		 */
1689		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1690	}
1691
1692	if (msp->ms_loaded && msp->ms_access_txg < txg) {
1693		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1694			VERIFY0(range_tree_space(
1695			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
1696		}
1697
1698		if (!metaslab_debug_unload)
1699			metaslab_unload(msp);
1700	}
1701
1702	metaslab_group_sort(mg, msp, metaslab_weight(msp));
1703	mutex_exit(&msp->ms_lock);
1704
1705}
1706
1707void
1708metaslab_sync_reassess(metaslab_group_t *mg)
1709{
1710	int64_t failures = mg->mg_alloc_failures;
1711
1712	metaslab_group_alloc_update(mg);
1713	atomic_add_64(&mg->mg_alloc_failures, -failures);
1714
1715	/*
1716	 * Preload the next potential metaslabs
1717	 */
1718	metaslab_group_preload(mg);
1719}
1720
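/*
 * Distance, in bytes, between this metaslab and an existing DVA. DVAs on
 * a different vdev are treated as maximally distant (1ULL << 63).
 */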
1721static uint64_t
1722metaslab_distance(metaslab_t *msp, dva_t *dva)
1723{
1724	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1725	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1726	uint64_t start = msp->ms_id;
1727
1728	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1729		return (1ULL << 63);
1730
1731	if (offset < start)
1732		return ((start - offset) << ms_shift);
1733	if (offset > start)
1734		return ((offset - start) << ms_shift);
1735	return (0);
1736}
1737
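/*
 * Try to allocate asize bytes from this metaslab group: walk the metaslabs
 * in weight order, skip those that are condensing or, for secondary DVAs,
 * too close to copies already allocated, and return the chosen offset or
 * -1ULL on failure.
 */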
1738static uint64_t
1739metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1740    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
1741{
1742	spa_t *spa = mg->mg_vd->vdev_spa;
1743	metaslab_t *msp = NULL;
1744	uint64_t offset = -1ULL;
1745	avl_tree_t *t = &mg->mg_metaslab_tree;
1746	uint64_t activation_weight;
1747	uint64_t target_distance;
1748	int i;
1749
1750	activation_weight = METASLAB_WEIGHT_PRIMARY;
1751	for (i = 0; i < d; i++) {
1752		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1753			activation_weight = METASLAB_WEIGHT_SECONDARY;
1754			break;
1755		}
1756	}
1757
1758	for (;;) {
1759		boolean_t was_active;
1760
1761		mutex_enter(&mg->mg_lock);
1762		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1763			if (msp->ms_weight < asize) {
1764				spa_dbgmsg(spa, "%s: failed to meet weight "
1765				    "requirement: vdev %llu, txg %llu, mg %p, "
1766				    "msp %p, psize %llu, asize %llu, "
1767				    "failures %llu, weight %llu",
1768				    spa_name(spa), mg->mg_vd->vdev_id, txg,
1769				    mg, msp, psize, asize,
1770				    mg->mg_alloc_failures, msp->ms_weight);
1771				mutex_exit(&mg->mg_lock);
1772				return (-1ULL);
1773			}
1774
1775			/*
1776			 * If the selected metaslab is condensing, skip it.
1777			 */
1778			if (msp->ms_condensing)
1779				continue;
1780
1781			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1782			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1783				break;
1784
1785			target_distance = min_distance +
1786			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
1787			    min_distance >> 1);
1788
1789			for (i = 0; i < d; i++)
1790				if (metaslab_distance(msp, &dva[i]) <
1791				    target_distance)
1792					break;
1793			if (i == d)
1794				break;
1795		}
1796		mutex_exit(&mg->mg_lock);
1797		if (msp == NULL)
1798			return (-1ULL);
1799
1800		mutex_enter(&msp->ms_lock);
1801
1802		/*
1803		 * If we've already reached the allowable number of failed
1804		 * allocation attempts on this metaslab group then we
1805		 * consider skipping it. We skip it only if we're allowed
1806		 * to "fast" gang, the physical size is larger than
1807		 * a gang block, and we're attempting to allocate from
1808		 * the primary metaslab.
1809		 */
1810		if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1811		    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1812		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
1813			spa_dbgmsg(spa, "%s: skipping metaslab group: "
1814			    "vdev %llu, txg %llu, mg %p, msp[%llu] %p, "
1815			    "psize %llu, asize %llu, failures %llu",
1816			    spa_name(spa), mg->mg_vd->vdev_id, txg, mg,
1817			    msp->ms_id, msp, psize, asize,
1818			    mg->mg_alloc_failures);
1819			mutex_exit(&msp->ms_lock);
1820			return (-1ULL);
1821		}
1822
1823		/*
1824		 * Ensure that the metaslab we have selected is still
1825		 * capable of handling our request. It's possible that
1826		 * another thread may have changed the weight while we
1827		 * were blocked on the metaslab lock.
1828		 */
1829		if (msp->ms_weight < asize || (was_active &&
1830		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1831		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1832			mutex_exit(&msp->ms_lock);
1833			continue;
1834		}
1835
1836		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1837		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
1838			metaslab_passivate(msp,
1839			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1840			mutex_exit(&msp->ms_lock);
1841			continue;
1842		}
1843
1844		if (metaslab_activate(msp, activation_weight) != 0) {
1845			mutex_exit(&msp->ms_lock);
1846			continue;
1847		}
1848
1849		/*
1850		 * If this metaslab is currently condensing then pick again as
1851		 * we can't manipulate this metaslab until it's committed
1852		 * to disk.
1853		 */
1854		if (msp->ms_condensing) {
1855			mutex_exit(&msp->ms_lock);
1856			continue;
1857		}
1858
1859		if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
1860			break;
1861
1862		atomic_inc_64(&mg->mg_alloc_failures);
1863
1864		metaslab_passivate(msp, metaslab_block_maxsize(msp));
1865		mutex_exit(&msp->ms_lock);
1866	}
1867
1868	if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
1869		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1870
1871	range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
1872	msp->ms_access_txg = txg + metaslab_unload_delay;
1873
1874	mutex_exit(&msp->ms_lock);
1875
1876	return (offset);
1877}
1878
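/*
 * Illustrative restatement (hypothetical helper, not part of the
 * original source) of the group-skip test applied inside the loop
 * above: a group is abandoned only after it has exceeded its allowed
 * allocation failures, and only for requests that may fall back to
 * "fast" ganging from a primary metaslab.
 */
#if 0	/* compiled out; illustrative only */
static boolean_t
metaslab_group_skip_sketch(metaslab_group_t *mg, uint64_t psize,
    uint64_t activation_weight, int flags)
{
	/* Mirrors the test in metaslab_group_alloc() above. */
	return (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
	    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
	    activation_weight == METASLAB_WEIGHT_PRIMARY);
}
#endif
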
1879/*
1880 * Allocate a block for the specified i/o.
1881 */
1882static int
1883metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1884    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1885{
1886	metaslab_group_t *mg, *rotor;
1887	vdev_t *vd;
1888	int dshift = 3;
1889	int all_zero;
1890	int zio_lock = B_FALSE;
1891	boolean_t allocatable;
1892	uint64_t offset = -1ULL;
1893	uint64_t asize;
1894	uint64_t distance;
1895
1896	ASSERT(!DVA_IS_VALID(&dva[d]));
1897
1898	/*
1899	 * For testing, make some blocks above a certain size be gang blocks.
1900	 */
1901	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1902		return (SET_ERROR(ENOSPC));
1903
1904	/*
1905	 * Start at the rotor and loop through all mgs until we find something.
1906	 * Note that there's no locking on mc_rotor or mc_aliquot because
1907	 * nothing actually breaks if we miss a few updates -- we just won't
1908	 * allocate quite as evenly.  It all balances out over time.
1909	 *
1910	 * If we are doing ditto or log blocks, try to spread them across
1911	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
1912	 * allocated all of our ditto blocks, then try to spread them out on
1913	 * that vdev as much as possible.  If it turns out to not be possible,
1914	 * gradually lower our standards until anything becomes acceptable.
1915	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1916	 * gives us hope of containing our fault domains to something we're
1917	 * able to reason about.  Otherwise, any two top-level vdev failures
1918	 * will guarantee the loss of data.  With consecutive allocation,
1919	 * only two adjacent top-level vdev failures will result in data loss.
1920	 *
1921	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
1922	 * ourselves on the same vdev as our gang block header.  That
1923	 * way, we can hope for locality in vdev_cache, plus it makes our
1924	 * fault domains something tractable.
1925	 */
1926	if (hintdva) {
1927		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1928
1929		/*
1930		 * It's possible the vdev we're using as the hint no
1931		 * longer exists (i.e. removed). Consult the rotor when
1932		 * all else fails.
1933		 */
1934		if (vd != NULL) {
1935			mg = vd->vdev_mg;
1936
1937			if (flags & METASLAB_HINTBP_AVOID &&
1938			    mg->mg_next != NULL)
1939				mg = mg->mg_next;
1940		} else {
1941			mg = mc->mc_rotor;
1942		}
1943	} else if (d != 0) {
1944		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1945		mg = vd->vdev_mg->mg_next;
1946	} else {
1947		mg = mc->mc_rotor;
1948	}
1949
1950	/*
1951	 * If the hint put us into the wrong metaslab class, or into a
1952	 * metaslab group that has been passivated, just follow the rotor.
1953	 */
1954	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1955		mg = mc->mc_rotor;
1956
1957	rotor = mg;
1958top:
1959	all_zero = B_TRUE;
1960	do {
1961		ASSERT(mg->mg_activation_count == 1);
1962
1963		vd = mg->mg_vd;
1964
1965		/*
1966		 * Don't allocate from faulted devices.
1967		 */
1968		if (zio_lock) {
1969			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1970			allocatable = vdev_allocatable(vd);
1971			spa_config_exit(spa, SCL_ZIO, FTAG);
1972		} else {
1973			allocatable = vdev_allocatable(vd);
1974		}
1975
1976		/*
1977		 * Determine if the selected metaslab group is eligible
1978		 * for allocations. If we're ganging or have requested
1979		 * an allocation for the smallest gang block size
1980		 * then we don't want to avoid allocating to this
1981		 * metaslab group. If we're in this condition we should
1982		 * try to allocate from any device possible so that we
1983		 * don't inadvertently return ENOSPC and suspend the pool
1984		 * even though space is still available.
1985		 */
1986		if (allocatable && CAN_FASTGANG(flags) &&
1987		    psize > SPA_GANGBLOCKSIZE)
1988			allocatable = metaslab_group_allocatable(mg);
1989
1990		if (!allocatable)
1991			goto next;
1992
1993		/*
1994		 * Avoid writing single-copy data to a failing vdev
1995		 * unless the user instructs us that it is okay.
1996		 */
1997		if ((vd->vdev_stat.vs_write_errors > 0 ||
1998		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
1999		    d == 0 && dshift == 3 &&
2000		    !(zfs_write_to_degraded && vd->vdev_state ==
2001		    VDEV_STATE_DEGRADED)) {
2002			all_zero = B_FALSE;
2003			goto next;
2004		}
2005
2006		ASSERT(mg->mg_class == mc);
2007
2008		distance = vd->vdev_asize >> dshift;
2009		if (distance <= (1ULL << vd->vdev_ms_shift))
2010			distance = 0;
2011		else
2012			all_zero = B_FALSE;
2013
2014		asize = vdev_psize_to_asize(vd, psize);
2015		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
2016
2017		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
2018		    dva, d, flags);
2019		if (offset != -1ULL) {
2020			/*
2021			 * If we've just selected this metaslab group,
2022			 * figure out whether the corresponding vdev is
2023			 * over- or under-used relative to the pool,
2024			 * and set an allocation bias to even it out.
2025			 */
2026			if (mc->mc_aliquot == 0) {
2027				vdev_stat_t *vs = &vd->vdev_stat;
2028				int64_t vu, cu;
2029
2030				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
2031				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
2032
2033				/*
2034				 * Calculate how much more or less we should
2035				 * try to allocate from this device during
2036				 * this iteration around the rotor.
2037				 * For example, if a device is 80% full
2038				 * and the pool is 20% full then we should
2039				 * reduce allocations by 60% on this device.
2040				 *
2041				 * mg_bias = (20 - 80) * 512K / 100 = -307K
2042				 *
2043				 * This reduces allocations by 307K for this
2044				 * iteration.
2045				 */
2046				mg->mg_bias = ((cu - vu) *
2047				    (int64_t)mg->mg_aliquot) / 100;
2048			}
2049
2050			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
2051			    mg->mg_aliquot + mg->mg_bias) {
2052				mc->mc_rotor = mg->mg_next;
2053				mc->mc_aliquot = 0;
2054			}
2055
2056			DVA_SET_VDEV(&dva[d], vd->vdev_id);
2057			DVA_SET_OFFSET(&dva[d], offset);
2058			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
2059			DVA_SET_ASIZE(&dva[d], asize);
2060
2061			return (0);
2062		}
2063next:
2064		mc->mc_rotor = mg->mg_next;
2065		mc->mc_aliquot = 0;
2066	} while ((mg = mg->mg_next) != rotor);
2067
2068	if (!all_zero) {
2069		dshift++;
2070		ASSERT(dshift < 64);
2071		goto top;
2072	}
2073
2074	if (!allocatable && !zio_lock) {
2075		dshift = 3;
2076		zio_lock = B_TRUE;
2077		goto top;
2078	}
2079
2080	bzero(&dva[d], sizeof (dva_t));
2081
2082	return (SET_ERROR(ENOSPC));
2083}
2084
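/*
 * Illustrative sketch (hypothetical helper, not part of the original
 * source) of the rotor bias computed above: vu and cu are the vdev and
 * class utilization in percent.  With vu = 80, cu = 20 and a 512K
 * aliquot this yields (20 - 80) * 512K / 100 = -307K, matching the
 * worked example in the comment.
 */
#if 0	/* compiled out; illustrative only */
static int64_t
metaslab_bias_sketch(int64_t vu, int64_t cu, int64_t mg_aliquot)
{
	/* Same formula used for mg->mg_bias above. */
	return ((cu - vu) * mg_aliquot / 100);
}
#endif
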
2085/*
2086 * Free the block represented by DVA in the context of the specified
2087 * transaction group.
2088 */
2089static void
2090metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
2091{
2092	uint64_t vdev = DVA_GET_VDEV(dva);
2093	uint64_t offset = DVA_GET_OFFSET(dva);
2094	uint64_t size = DVA_GET_ASIZE(dva);
2095	vdev_t *vd;
2096	metaslab_t *msp;
2097
2098	ASSERT(DVA_IS_VALID(dva));
2099
2100	if (txg > spa_freeze_txg(spa))
2101		return;
2102
2103	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2104	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
2105		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
2106		    (u_longlong_t)vdev, (u_longlong_t)offset);
2107		ASSERT(0);
2108		return;
2109	}
2110
2111	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2112
2113	if (DVA_GET_GANG(dva))
2114		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2115
2116	mutex_enter(&msp->ms_lock);
2117
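	/*
	 * 'now' means the block was allocated earlier in this still-open
	 * txg: undo that allocation and return the space directly to the
	 * in-core free tree.  Otherwise record the free in this txg's
	 * freetree so it is handled (and possibly deferred) at sync time.
	 */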
2118	if (now) {
2119		range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
2120		    offset, size);
2121
2122		VERIFY(!msp->ms_condensing);
2123		VERIFY3U(offset, >=, msp->ms_start);
2124		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
2125		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
2126		    msp->ms_size);
2127		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2128		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2129		range_tree_add(msp->ms_tree, offset, size);
2130	} else {
2131		if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
2132			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2133		range_tree_add(msp->ms_freetree[txg & TXG_MASK],
2134		    offset, size);
2135	}
2136
2137	mutex_exit(&msp->ms_lock);
2138}
2139
2140/*
2141 * Intent log support: upon opening the pool after a crash, notify the SPA
2142 * of blocks that the intent log has allocated for immediate write, but
2143 * which are still considered free by the SPA because the last transaction
2144 * group didn't commit yet.
2145 */
2146static int
2147metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
2148{
2149	uint64_t vdev = DVA_GET_VDEV(dva);
2150	uint64_t offset = DVA_GET_OFFSET(dva);
2151	uint64_t size = DVA_GET_ASIZE(dva);
2152	vdev_t *vd;
2153	metaslab_t *msp;
2154	int error = 0;
2155
2156	ASSERT(DVA_IS_VALID(dva));
2157
2158	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2159	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
2160		return (SET_ERROR(ENXIO));
2161
2162	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2163
2164	if (DVA_GET_GANG(dva))
2165		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2166
2167	mutex_enter(&msp->ms_lock);
2168
2169	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
2170		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
2171
2172	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
2173		error = SET_ERROR(ENOENT);
2174
2175	if (error || txg == 0) {	/* txg == 0 indicates dry run */
2176		mutex_exit(&msp->ms_lock);
2177		return (error);
2178	}
2179
2180	VERIFY(!msp->ms_condensing);
2181	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2182	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2183	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
2184	range_tree_remove(msp->ms_tree, offset, size);
2185
2186	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
2187		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2188			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2189		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2190	}
2191
2192	mutex_exit(&msp->ms_lock);
2193
2194	return (0);
2195}
2196
2197int
2198metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2199    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
2200{
2201	dva_t *dva = bp->blk_dva;
2202	dva_t *hintdva = hintbp->blk_dva;
2203	int error = 0;
2204
2205	ASSERT(bp->blk_birth == 0);
2206	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2207
2208	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2209
2210	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
2211		spa_config_exit(spa, SCL_ALLOC, FTAG);
2212		return (SET_ERROR(ENOSPC));
2213	}
2214
2215	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
2216	ASSERT(BP_GET_NDVAS(bp) == 0);
2217	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2218
2219	for (int d = 0; d < ndvas; d++) {
2220		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2221		    txg, flags);
2222		if (error != 0) {
2223			for (d--; d >= 0; d--) {
2224				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2225				bzero(&dva[d], sizeof (dva_t));
2226			}
2227			spa_config_exit(spa, SCL_ALLOC, FTAG);
2228			return (error);
2229		}
2230	}
2231	ASSERT(error == 0);
2232	ASSERT(BP_GET_NDVAS(bp) == ndvas);
2233
2234	spa_config_exit(spa, SCL_ALLOC, FTAG);
2235
2236	BP_SET_BIRTH(bp, txg, txg);
2237
2238	return (0);
2239}
2240
2241void
2242metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
2243{
2244	const dva_t *dva = bp->blk_dva;
2245	int ndvas = BP_GET_NDVAS(bp);
2246
2247	ASSERT(!BP_IS_HOLE(bp));
2248	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
2249
2250	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
2251
2252	for (int d = 0; d < ndvas; d++)
2253		metaslab_free_dva(spa, &dva[d], txg, now);
2254
2255	spa_config_exit(spa, SCL_FREE, FTAG);
2256}
2257
2258int
2259metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
2260{
2261	const dva_t *dva = bp->blk_dva;
2262	int ndvas = BP_GET_NDVAS(bp);
2263	int error = 0;
2264
2265	ASSERT(!BP_IS_HOLE(bp));
2266
2267	if (txg != 0) {
2268		/*
2269		 * First do a dry run to make sure all DVAs are claimable,
2270		 * so we don't have to unwind from partial failures below.
2271		 */
2272		if ((error = metaslab_claim(spa, bp, 0)) != 0)
2273			return (error);
2274	}
2275
2276	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2277
2278	for (int d = 0; d < ndvas; d++)
2279		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
2280			break;
2281
2282	spa_config_exit(spa, SCL_ALLOC, FTAG);
2283
2284	ASSERT(error == 0 || txg == 0);
2285
2286	return (error);
2287}
2288
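/*
 * Debug aid (enabled by ZFS_DEBUG_ZIO_FREE): verify that none of the
 * block's DVAs overlap space already accounted as free in the owning
 * metaslab's in-core tree, per-txg freetrees, or defer trees, which
 * would indicate a double free.
 */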
2289void
2290metaslab_check_free(spa_t *spa, const blkptr_t *bp)
2291{
2292	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
2293		return;
2294
2295	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2296	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
2297		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
2298		vdev_t *vd = vdev_lookup_top(spa, vdev);
2299		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
2300		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2301		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2302
2303		if (msp->ms_loaded)
2304			range_tree_verify(msp->ms_tree, offset, size);
2305
2306		for (int j = 0; j < TXG_SIZE; j++)
2307			range_tree_verify(msp->ms_freetree[j], offset, size);
2308		for (int j = 0; j < TXG_DEFER_SIZE; j++)
2309			range_tree_verify(msp->ms_defertree[j], offset, size);
2310	}
2311	spa_config_exit(spa, SCL_VDEV, FTAG);
2312}
2313