/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree controls access to pieces of the bitmap, and includes
 *	auxiliary information at each interior node about the availability of
 *	contiguous free blocks in the subtree rooted at that node.  Two radix
 *	constants are involved: one for the size of the bitmaps contained in the
 *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
 *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
 *	associated with a range of blocks.  The root of any subtree stores a
 *	hint field that defines an upper bound on the size of the largest
 *	allocation that can begin in the associated block range.  A hint is an
 *	upper bound on a potential allocation, but not necessarily a tight upper
 *	bound.
 *
 *	The radix tree also implements two collapsed states for meta nodes:
 *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
 *	in either of these two states, all information contained underneath
 *	the node is considered stale.  These states are used to optimize
 *	allocation and freeing operations.
 *
 * 	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking features of the blist code are used in the swap code
 *	(vm/swap_pager.c).
 *
 *	LAYOUT: The radix tree is laid out recursively using a
 *	linear array.  Each meta node is immediately followed (laid out
 *	sequentially in memory) by BLIST_META_RADIX lower-level nodes.  This
 *	is a recursive structure but one that can be easily scanned through
 *	a very simple 'skip' calculation.  In order to support large radixes,
 *	portions of the tree may reside outside our memory allocation.  We
 *	handle this with an early-termination optimization (when bighint is
 *	set to -1) on the scan.  The memory allocation is only large enough
 *	to cover the number of blocks requested at creation time, even if it
 *	must be encompassed by a larger root-node radix.
 *
 *	NOTE: the allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
 */
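
/*
 *	EXAMPLE (an editor's sketch, not part of the original sources):
 *	typical use of the public interface.  A freshly created blist has
 *	every block marked allocated, so callers free the managed range
 *	before drawing allocations from it:
 *
 *		bl = blist_create(2048, M_WAITOK);   create a 2048-block map
 *		blist_free(bl, 0, 2048);             mark every block free
 *		blk = blist_alloc(bl, 16);           grab 16 contiguous blocks
 *		if (blk != SWAPBLK_NONE)
 *			blist_free(bl, blk, 16);     and release them again
 *		blist_destroy(bl);
 */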

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/kern/subr_blist.c 330897 2018-03-14 03:19:51Z eadler $");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define malloc(a,b,c)	calloc(a, 1)
#define free(a,b)	free(a)
static __inline int imax(int a, int b) { return (a > b ? a : b); }

#include <sys/blist.h>

void panic(const char *ctl, ...);

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
		    u_daddr_t radix);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
#define	BLIST_META_MASK	(BLIST_META_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
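 *
 * For example (an editor's illustration, assuming the typical constants
 * BLIST_BMAP_RADIX = 64 and BLIST_META_RADIX = 16), a subtree with
 * radix = 1024 has 16 leaves under a single meta node, and
 * skip = 1024 / ((64 / 16) * 15) = 17 = 1 + 16 nodes.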
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}

/*
 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
 * Assumes that the argument has only one bit set.
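 * For example, bitpos(0x20) returns 5.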
 */
static inline int
bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
	default:
		lo = 0;
		hi = BLIST_BMAP_RADIX;
		while (lo + 1 < hi) {
			mid = (lo + hi) >> 1;
			if ((mask >> mid) != 0)
				lo = mid;
			else
				hi = mid;
		}
		return (lo);
	}
}

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 * 	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	daddr_t nodes, radix;

	/*
	 * Calculate the radix field used for scanning.
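	 * For example (an editor's illustration, assuming the typical
	 * constants 64 and 16), blocks = 2048 grows radix from 64 to
	 * 1024 to 16384.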
	 */
	radix = BLIST_BMAP_RADIX;
	while (radix < blocks) {
		radix *= BLIST_META_RADIX;
	}
	nodes = 1 + blst_radix_init(NULL, radix, blocks);

	bl = malloc(sizeof(struct blist), M_SWAP, flags);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;
	bl->bl_cursor = 0;
	bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags);
	if (bl->bl_root == NULL) {
		free(bl, M_SWAP);
		return (NULL);
	}
	blst_radix_init(bl->bl_root, radix, blocks);

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}

void
blist_destroy(blist_t bl)
{
	free(bl->bl_root, M_SWAP);
	free(bl, M_SWAP);
}

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
	daddr_t blk;

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * reduce the hint, stopping further iterations.
	 */
	while (count <= bl->bl_root->bm_bighint) {
		blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_cursor = blk + count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		} else if (bl->bl_cursor != 0)
			bl->bl_cursor = 0;
	}
	return (SWAPBLK_NONE);
}

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	if (bl->bl_radix == BLIST_BMAP_RADIX)
		return (bitcount64(bl->bl_root->u.bmu_bitmap));
	else
		return (bl->bl_root->u.bmu_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.  Panic if an
 *			inconsistency is found.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{

	return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix));
}

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor);
	blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};
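
/*
 * Histogram buckets for gap sizes: bucket i counts maximal free ranges with
 * a size in [fib[i], fib[i + 1]), and the last bucket counts every range of
 * at least fib[nitems(fib) - 1] blocks.
 */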

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits'?
 *                           or 'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats()    - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the Fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
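	/*
	 * Update the running average in integer arithmetic, maintaining the
	 * invariant that the sum of all gap sizes equals num * avg + err.
	 */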
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}

/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t bit, diff, mask;

	init_gap_stats(stats);
	nodes = 0;
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].u.bmu_avail == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}
			if (bl->bl_root[nodes].u.bmu_avail == radix) {
				if (!gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.
			 */
			mask = bl->bl_root[nodes].u.bmu_bitmap;
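			/*
			 * Each 1 bit in diff marks a position where the
			 * bitmap changes value from the bit below it, i.e.,
			 * a boundary of a maximal free range within this
			 * leaf.  Flip bit 0 if a gap is already open, so
			 * that only true state changes are recorded.
			 */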
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				bit = diff & -diff;
				update_gap_stats(stats, i + bitpos(bit));
				diff ^= bit;
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * blst_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the
 *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
 *	time is proportional to log2(count) + bitpos time.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;
	int count1, hi, lo, num_shifts, range1, range_ext;

	range1 = 0;
	count1 = count - 1;
	num_shifts = fls(count1);
	mask = scan->u.bmu_bitmap;
	while ((-mask & ~mask) != 0 && num_shifts > 0) {
		/*
		 * If bit i is set in mask, then bits in [i, i+range1] are set
		 * in scan->u.bmu_bitmap.  The value of range1 is equal to
		 * count1 >> num_shifts.  Grow range and reduce num_shifts to 0,
		 * while preserving these invariants.  The updates to mask leave
		 * fewer bits set, but each bit that remains set represents a
		 * longer string of consecutive bits set in scan->u.bmu_bitmap.
		 * If more updates to mask cannot clear more bits, because mask
		 * is partitioned with all 0 bits preceding all 1 bits, the loop
		 * terminates immediately.
		 */
		num_shifts--;
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		/*
		 * mask is a signed quantity for the shift because when it is
		 * shifted right, the sign bit should be copied; when the last
		 * block of the leaf is free, pretend, for a while, that all the
		 * blocks that follow it are also free.
		 */
		mask &= (daddr_t)mask >> range_ext;
		range1 += range_ext;
	}
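	/*
	 * Worked example (editor's note): for count = 4, count1 = 3 and
	 * num_shifts = 2, so the loop ANDs mask with copies of itself
	 * shifted right by 1 and then by 2.  Afterward, bit i is set in
	 * mask only if bits i..i+3 were all set in the original bitmap,
	 * i.e., only where a run of 4 free blocks begins.
	 */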
	if (mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * starting in this leaf.
		 */
		scan->bm_bighint = range1;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size.  Clear all the bits but that one,
	 * and then find its position.
	 */
	mask &= -mask;
	lo = bitpos(mask);

	hi = lo + count;
	if (hi > BLIST_BMAP_RADIX) {
		/*
		 * An allocation within this leaf is impossible, so a successful
		 * allocation depends on the next leaf providing some of the blocks.
		 */
		if (((blk / BLIST_BMAP_RADIX + 1) & BLIST_META_MASK) == 0) {
			/*
			 * The next leaf has a different meta-node parent, so it
			 * is not necessarily initialized.  Update bighint,
			 * comparing the range found at the end of mask to the
			 * largest earlier range that could have been made to
			 * vanish in the initial processing of mask.
			 */
			scan->bm_bighint = imax(BLIST_BMAP_RADIX - lo, range1);
			return (SWAPBLK_NONE);
		}
		hi -= BLIST_BMAP_RADIX;
		if (((scan[1].u.bmu_bitmap + 1) & ~((u_daddr_t)-1 << hi)) != 0) {
			/*
			 * The next leaf doesn't have enough free blocks at the
			 * beginning to complete the spanning allocation.  The
			 * hint cannot be updated, because the same allocation
			 * request could be satisfied later, by this leaf, if
			 * the state of the next leaf changes, and without any
			 * changes to this leaf.
			 */
			return (SWAPBLK_NONE);
		}
		/* Clear the first 'hi' bits in the next leaf, allocating them. */
		scan[1].u.bmu_bitmap &= (u_daddr_t)-1 << hi;
		hi = BLIST_BMAP_RADIX;
	}

	/* Set the bits of mask at position 'lo' and higher. */
	mask = -mask;
	if (hi == BLIST_BMAP_RADIX) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * available in this leaf after this allocation completes.
		 */
		scan->bm_bighint = range1;
	} else {
		/* Clear the bits of mask at position 'hi' and higher. */
		mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
		/* If this allocation uses all the bits, clear the hint. */
		if (mask == scan->u.bmu_bitmap)
			scan->bm_bighint = 0;
	}
	/* Clear the allocated bits from this leaf. */
	scan->u.bmu_bitmap &= ~mask;
	return ((blk & ~BLIST_BMAP_MASK) + lo);
}

/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, r, skip;
	int child;
	bool scan_from_start;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count));
	if (scan->u.bmu_avail < count) {
		/*
		 * The meta node's hint must be too large if the allocation
		 * exceeds the number of free blocks.  Reduce the hint, and
		 * return failure.
		 */
		scan->bm_bighint = scan->u.bmu_avail;
		return (SWAPBLK_NONE);
	}
	blk = cursor & -radix;
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by a subtree of this meta node.
		 */
		panic("allocation too large");
	}
	scan_from_start = cursor == blk;
	child = (cursor - blk) / radix;
	blk += child * radix;
	for (i = 1 + child * next_skip; i < skip; i += next_skip) {
		if (count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th subtree.
			 */
			r = blst_meta_alloc(&scan[i],
			    cursor > blk ? cursor : blk, count, radix);
			if (r != SWAPBLK_NONE) {
				scan->u.bmu_avail -= count;
				return (r);
			}
		} else if (scan[i].bm_bighint == (daddr_t)-1) {
			/*
			 * Terminator
			 */
			break;
		}
		blk += radix;
	}

	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 */
	if (scan_from_start && scan->bm_bighint >= count)
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 *
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;
	int n;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	n = blk & BLIST_BMAP_MASK;
	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
	if (scan->u.bmu_bitmap & mask)
		panic("freeing free block");
	scan->u.bmu_bitmap |= mask;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest contiguous block of
	 * data.  If we just shoehorn it, a little extra overhead will
	 * be incurred on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_BMAP_RADIX;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("freeing invalid range");
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	if (scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case, with possible
		 * shortcut to ALL-FREE special case.
		 */
		scan->u.bmu_avail = count;
		scan->bm_bighint = count;

		if (count != radix) {
			for (i = 1; i < skip; i += next_skip) {
				if (scan[i].bm_bighint == (daddr_t)-1)
					break;
				scan[i].bm_bighint = 0;
				if (next_skip == 1) {
					scan[i].u.bmu_bitmap = 0;
				} else {
					scan[i].u.bmu_avail = 0;
				}
			}
			/* fall through */
		}
	} else {
		scan->u.bmu_avail += count;
		/* scan->bm_bighint = radix; */
	}

	/*
	 * ALL-FREE special case.
	 */

	if (scan->u.bmu_avail == radix)
		return;
	if (scan->u.bmu_avail > radix)
		panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
		    (long long)count, (long long)scan->u.bmu_avail,
		    (long long)radix);

	/*
	 * Break the free down into its components
	 */

	blk = freeBlk & -radix;
	radix /= BLIST_META_RADIX;

	child = (freeBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < freeBlk + count) {
		v = blk + radix - freeBlk;
		if (v > count)
			v = count;
		blst_meta_free(&scan[i], freeBlk, v, radix);
		if (scan->bm_bighint < scan[i].bm_bighint)
			scan->bm_bighint = scan[i].bm_bighint;
		count -= v;
		freeBlk += v;
		blk += radix;
		i += next_skip;
	}
}

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space must not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t i, next_skip, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->u.bmu_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->u.bmu_avail == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}
	if (scan->u.bmu_avail == radix) {
		/*
		 * Source all free, free entire dest
		 */
		if (count < radix)
			blist_free(dest, blk, count);
		else
			blist_free(dest, blk, radix);
		return;
	}

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;

	for (i = 1; count && i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1)
			break;

		if (count >= radix) {
			blst_copy(&scan[i], blk, radix, dest, radix);
			count -= radix;
		} else {
			if (count) {
				blst_copy(&scan[i], blk, radix, dest, count);
			}
			count = 0;
		}
		blk += radix;
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;
	int n;

	n = blk & BLIST_BMAP_MASK;
	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->u.bmu_bitmap & mask);

	scan->u.bmu_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, nblks, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("filling invalid range");
	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by this node.
		 */
		panic("fill too large");
	}
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));
	if (count == radix || scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case
		 */
		nblks = scan->u.bmu_avail;
		scan->u.bmu_avail = 0;
		scan->bm_bighint = 0;
		return (nblks);
	}
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	blk = allocBlk & -radix;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	nblks = 0;
	child = (allocBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < allocBlk + count) {
		v = blk + radix - allocBlk;
		if (v > count)
			v = count;
		nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
		count -= v;
		allocBlk += v;
		blk += radix;
		i += next_skip;
	}
	scan->u.bmu_avail -= nblks;
	return (nblks);
}

/*
 * BLST_RADIX_INIT() - initialize radix tree
 *
 *	Initialize our meta structures and bitmaps and calculate the exact
 *	amount of space required to manage 'count' blocks - this space may
 *	be considerably less than the calculated radix due to the large
 *	RADIX values we use.
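 *
 *	For example (an editor's illustration, assuming the typical radix
 *	constants 64 and 16): count = 100 forces radix = 1024, whose full
 *	subtree would span 17 nodes, but only 4 are needed: the root, two
 *	leaves, and a terminator.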
 */
static daddr_t
blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count)
{
	daddr_t i, memindex, next_skip, skip;

	memindex = 0;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		if (scan) {
			scan->bm_bighint = 0;
			scan->u.bmu_bitmap = 0;
		}
		return (memindex);
	}

	/*
	 * Meta node.  If allocating the entire object we can special
	 * case it.  However, we need to figure out how much memory
	 * is required to manage 'count' blocks, so we continue on anyway.
	 */

	if (scan) {
		scan->bm_bighint = 0;
		scan->u.bmu_avail = 0;
	}

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;

	for (i = 1; i < skip; i += next_skip) {
		if (count >= radix) {
			/*
			 * Allocate the entire object
			 */
			memindex = i +
			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
			    radix);
			count -= radix;
		} else if (count > 0) {
			/*
			 * Allocate a partial object
			 */
			memindex = i +
			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
			    count);
			count = 0;
		} else {
			/*
			 * Add terminator and break out.  Make terminator bitmap
			 * zero to avoid a spanning leaf allocation that
			 * includes the terminator.
			 */
			if (scan) {
				scan[i].bm_bighint = (daddr_t)-1;
				scan[i].u.bmu_bitmap = 0;
			}
			break;
		}
	}
	if (memindex < i)
		memindex = i;
	return (memindex);
}

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t i, next_skip, skip;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    (long long)scan->u.bmu_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	if (scan->u.bmu_avail == 0) {
		printf(
		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}
	if (scan->u.bmu_avail == radix) {
		printf(
		    "%*.*s(%08llx,%lld) ALL FREE\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}

	printf(
	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)scan->u.bmu_avail,
	    (long long)radix,
	    (long long)scan->bm_bighint
	);

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;
	tab += 4;

	for (i = 1; i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1) {
			printf(
			    "%*.*s(%08llx,%lld): Terminator\n",
			    tab, tab, "",
			    (long long)blk, (long long)radix
			);
			break;
		}
		blst_radix_print(&scan[i], blk, radix, tab);
		blk += radix;
	}
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = 1024;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		long long count = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch (buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the tree after resizing */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf("    R=%08llx\n", (long long)blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d       -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help"
			);
			break;
		default:
			printf("?\n");
			break;
		}
	}
	return (0);
}

void
panic(const char *ctl, ...)
{
	va_list va;

	va_start(va, ctl);
	vfprintf(stderr, ctl, va);
	fprintf(stderr, "\n");
	va_end(va);
	exit(1);
}

#endif