/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree controls access to pieces of the bitmap, and includes
 *	auxiliary information at each interior node about the availability of
 *	contiguous free blocks in the subtree rooted at that node.  A radix
 *	constant defines the size of the bitmaps contained in a leaf node
 *	and the number of descendants of each of the meta (interior) nodes.
 *	Each subtree is associated with a range of blocks.  The root of any
 *	subtree stores a hint field that defines an upper bound on the size
 *	of the largest allocation that can begin in the associated block
 *	range.  A hint is an upper bound on a potential allocation, but not
 *	necessarily a tight upper bound.
 *
 *	The bitmap field in each node directs the search for available blocks.
 *	For a leaf node, a bit is set if the corresponding block is free.  For a
 *	meta node, a bit is set if the corresponding subtree contains a free
 *	block somewhere within it.  The search at a meta node considers only
 *	children of that node that represent a range that includes a free block.
 *
 *	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking nature of allocations and frees is required by swap
 *	code (vm/swap_pager.c).
 *
 *	LAYOUT: The radix tree is laid out recursively using a linear array.
 *	Each meta node is immediately followed (laid out sequentially in
 *	memory) by BLIST_RADIX lower-level nodes.  This is a recursive
 *	structure but one that can be easily scanned through a very simple
 *	'skip' calculation.  The memory allocation is only large enough to
 *	cover the number of blocks requested at creation time.  Nodes that
 *	represent blocks beyond that limit, nodes that would never be read
 *	or written, are not allocated, so that the last of the
 *	BLIST_RADIX lower-level nodes of some meta nodes may not be allocated.
 *
 *	NOTE: the allocator cannot currently allocate more than
 *	BLIST_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
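 *
 *	A minimal stand-alone build sketch (the compiler flags, the output
 *	name, and the -lsbuf library are illustrative assumptions, not
 *	mandated by this file):
 *
 *		cc -O2 -o blist blist.c -lsbuf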
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define malloc(a,b,c)	calloc(a, 1)
#define free(a,b)	free(a)
#define ummin(a,b)	((a) < (b) ? (a) : (b))
#define imin(a,b)	((a) < (b) ? (a) : (b))
#define KASSERT(a,b)	assert(a)

#include <sys/blist.h>

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk,
    int *count, int maxcount);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
    int maxcount, u_daddr_t radix);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

#define	BLIST_MASK	(BLIST_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_RADIX.  If 'm'
 * is short for BLIST_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / m)) / (m - 1)
 * skip = radix / (m - 1)
 * so that simple integer division by a constant can safely be used for the
 * calculation.
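 *
 * For example, with BLIST_RADIX = 64, a subtree covering radix = 64 * 64 =
 * 4096 blocks has 64 leaves plus one meta node, and 4096 / 63 = 65 nodes.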
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix / BLIST_MASK);
}

/*
 * Provide a mask with count bits set, starting at position n.
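 * For example, with a 64-bit u_daddr_t, bitrange(2, 3) == 0x1c (bits 2-4).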
 */
static inline u_daddr_t
bitrange(int n, int count)
{

	return (((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_RADIX - (n + count))));
}

/*
 * Find the first bit set in a u_daddr_t.
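 * A binary search narrows [lo, hi) until lo indexes the lowest set bit;
 * the caller must pass a nonzero mask for the result to be meaningful.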
 */
static inline int
generic_bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	lo = 0;
	hi = BLIST_RADIX;
	while (lo + 1 < hi) {
		mid = (lo + hi) >> 1;
		if (mask & bitrange(0, mid))
			hi = mid;
		else
			lo = mid;
	}
	return (lo);
}

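/*
 * Find the first bit set, preferring an inline ffs variant when the size
 * of u_daddr_t matches one the platform provides at compile time.
 */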
static inline int
bitpos(u_daddr_t mask)
{

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
#ifdef HAVE_INLINE_FFS
	case sizeof(int):
		return (ffs(mask) - 1);
#endif
	default:
		return (generic_bitpos(mask));
	}
}

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 * 	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	u_daddr_t nodes, radix;

	KASSERT(blocks > 0, ("invalid block count"));

	/*
	 * Calculate the radix and node count used for scanning.
	 */
	nodes = 1;
	for (radix = 1; radix <= blocks / BLIST_RADIX; radix *= BLIST_RADIX)
		nodes += 1 + (blocks - 1) / radix / BLIST_RADIX;
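	/*
	 * For example, with BLIST_RADIX = 64 and blocks = 100000, the loop
	 * counts 1563 leaves and 25 second-level meta nodes under the root,
	 * giving nodes = 1589, and leaves radix = 4096: each child of the
	 * root spans 4096 blocks.
	 */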

	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
	    M_ZERO);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}

void
blist_destroy(blist_t bl)
{

	free(bl, M_SWAP);
}

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
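 *
 *		     On success, *count is updated to the number of blocks
 *		     actually allocated, which is at least the requested
 *		     minimum in *count and at most maxcount.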
 */
daddr_t
blist_alloc(blist_t bl, int *count, int maxcount)
{
	daddr_t blk, cursor;

	KASSERT(*count <= maxcount,
	    ("invalid parameters %d > %d", *count, maxcount));
	KASSERT(*count <= BLIST_MAX_ALLOC,
	    ("minimum allocation too large: %d", *count));

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * stop further iterations.
	 */
	for (cursor = bl->bl_cursor;; cursor = 0) {
		blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_avail -= *count;
			bl->bl_cursor = blk + *count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		}
		if (cursor == 0)
			return (SWAPBLK_NONE);
	}
}

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	return (bl->bl_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail += count;
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{
	daddr_t filled;

	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
	    ("filling invalid range: blkno %jx, count %d, blocks %jd",
	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail -= filled;
	return (filled);
}

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST avail = %jd, cursor = %08jx {\n",
	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);

	if (bl->bl_root->bm_bitmap != 0)
		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

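/*
 * Gap sizes are bucketed by Fibonacci number: a gap of 'size' blocks is
 * counted in the largest bucket i with fib[i] <= size, so bucket widths
 * grow geometrically while the table stays small.
 */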
static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits'?
 *                           or 'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats()    - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the Fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
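	/*
	 * The doubling search above bracketed size: fib[lo] <= size, and
	 * either hi is one past the table or fib[hi] > size.  Binary search
	 * now narrows [lo, hi) to the single bucket containing size.
	 */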
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
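	/*
	 * Maintain the exact integer running average, with the invariant
	 * sum == num * avg + err; err carries the accumulated remainder.
	 */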
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}

/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
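	/*
	 * Print the last bucket outside the loop: its upper bound is the
	 * largest gap actually seen, not fib[i + 1].
	 */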
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t diff, mask;
	int digit;

	init_gap_stats(stats);
	nodes = 0;
	radix = bl->bl_radix;
	for (i = 0; i < bl->bl_blocks; ) {
		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix != 1) {
			if (bl->bl_root[nodes].bm_bitmap == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_RADIX;
		}
		if (radix == 1) {
			/*
			 * Scan leaf.
			 */
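			/*
			 * Each bit set in mask ^ (mask << 1) marks a
			 * boundary between a free range and an allocated
			 * range; flip bit 0 if a gap is already open so
			 * that the boundaries stay paired.
			 */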
			mask = bl->bl_root[nodes].bm_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				digit = bitpos(diff);
				update_gap_stats(stats, i + digit);
				diff ^= bitrange(digit, 1);
			}
		}
		nodes += radix_to_skip(radix * BLIST_RADIX);
		i += radix * BLIST_RADIX;

		/*
		 * Find max size subtree starting at i.
		 */
		for (radix = 1;
		    ((i / BLIST_RADIX / radix) & BLIST_MASK) == 0;
		    radix *= BLIST_RADIX)
			;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf.
 *
 *	'scan' is a leaf node, and its first block is at address 'start'.  The
 *	next leaf node could be adjacent, or several nodes away if the least
 *	common ancestor of 'scan' and its neighbor is several levels up.  Use
 *	addresses to determine how many meta-nodes lie between the leaves.  If
 *	the sequence of leaves starting with the next one has enough initial bits
 *	set, clear them and clear the bits in the meta nodes on the path up to
 *	the least common ancestor to mark any subtrees made completely empty.
 */
static int
blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount)
{
	u_daddr_t radix;
	daddr_t blk;
	int avail, digit;

	start += BLIST_RADIX;
	for (blk = start; blk - start < maxcount; blk += BLIST_RADIX) {
		/* Skip meta-nodes, as long as they promise more free blocks. */
		radix = BLIST_RADIX;
		while (((++scan)->bm_bitmap & 1) == 1 &&
		    ((blk / radix) & BLIST_MASK) == 0)
			radix *= BLIST_RADIX;
		if (~scan->bm_bitmap != 0) {
			/*
			 * Either there is no next leaf with any free blocks,
			 * or we've reached the next leaf and found that some
			 * of its blocks are not free.  In the first case,
			 * bitpos() returns zero here.
			 */
			avail = blk - start + bitpos(~scan->bm_bitmap);
			if (avail < count || avail == 0) {
				/*
				 * There isn't a next leaf with enough free
				 * blocks at its beginning to bother
				 * allocating.
				 */
				return (avail);
			}
			maxcount = imin(avail, maxcount);
			if (maxcount % BLIST_RADIX == 0) {
				/*
				 * There was no next leaf.  Back scan up to
				 * last leaf.
				 */
				do {
					radix /= BLIST_RADIX;
					--scan;
				} while (radix != 1);
				blk -= BLIST_RADIX;
			}
		}
	}

	/*
	 * 'scan' is the last leaf that provides blocks.  Clear from 1 to
	 * BLIST_RADIX bits to represent the allocation of those last blocks.
	 */
	if (maxcount % BLIST_RADIX != 0)
		scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_RADIX);
	else
		scan->bm_bitmap = 0;

	for (;;) {
		/* Back up over meta-nodes, clearing bits if necessary. */
		blk -= BLIST_RADIX;
		for (radix = BLIST_RADIX;
		    (digit = ((blk / radix) & BLIST_MASK)) == 0;
		    radix *= BLIST_RADIX) {
			if ((scan--)->bm_bitmap == 0)
				scan->bm_bitmap ^= 1;
		}
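		/*
		 * 'scan' now points at the digit'th child of its parent;
		 * after the decrement below, the parent lies digit *
		 * radix_to_skip(radix) nodes earlier in the array.
		 */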
		if ((scan--)->bm_bitmap == 0)
			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
			    (u_daddr_t)1 << digit;

		if (blk == start)
			break;
		/* Clear all the bits of this leaf. */
		scan->bm_bitmap = 0;
	}
	return (maxcount);
}

/*
 * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This function is the core of the allocator.  Its execution time is
 *	proportional to log(count), plus height of the tree if the allocation
 *	crosses a leaf boundary.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount)
{
	u_daddr_t mask;
	int bighint, count1, hi, lo, num_shifts;

	count1 = *count - 1;
	num_shifts = fls(count1);
	mask = ~scan->bm_bitmap;
	while ((mask & (mask + 1)) != 0 && num_shifts > 0) {
		/*
		 * If bit i is 0 in mask, then bits in [i, i + (count1 >>
		 * num_shifts)] are 1 in scan->bm_bitmap.  Reduce num_shifts to
		 * 0, while preserving this invariant.  The updates to mask
		 * leave fewer bits 0, but each bit that remains 0 represents a
		 * longer string of consecutive 1-bits in scan->bm_bitmap.  If
		 * more updates to mask cannot set more bits, because mask is
		 * partitioned with all 1 bits following all 0 bits, the loop
		 * terminates immediately.
		 */
		num_shifts--;
		mask |= mask >> ((count1 >> num_shifts) + 1) / 2;
	}
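	/*
	 * For example, for *count == 5: count1 == 4 and num_shifts == 3, so
	 * a full run of the loop ORs mask with copies of itself shifted by
	 * 1, 1, and 2 bits, and any bit still 0 at position i then certifies
	 * 5 consecutive 1 bits at positions i..i+4 of scan->bm_bitmap.
	 */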
	bighint = count1 >> num_shifts;
	if (~mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than
		 * count1 >> num_shifts starting in this leaf.
		 */
		scan->bm_bighint = bighint;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	if ((blk & BLIST_MASK) != 0) {
		if ((~mask & bitrange(0, blk & BLIST_MASK)) != 0) {
			/* Grow bighint in case all discarded bits are set. */
			bighint += blk & BLIST_MASK;
			mask |= bitrange(0, blk & BLIST_MASK);
			if (~mask == 0) {
				scan->bm_bighint = bighint;
				return (SWAPBLK_NONE);
			}
		}
		blk -= blk & BLIST_MASK;
	}

	/*
	 * The least significant clear bit in mask marks the start of the
	 * first available range of sufficient size.  Find its position.
	 */
	lo = bitpos(~mask);

	/*
	 * Find how much space is available starting at that position.
	 */
	if ((mask & (mask + 1)) != 0) {
		/* Count the 1 bits starting at position lo. */
		hi = bitpos(mask & (mask + 1)) + count1;
		if (maxcount < hi - lo)
			hi = lo + maxcount;
		*count = hi - lo;
		mask = ~bitrange(lo, *count);
	} else if (maxcount <= BLIST_RADIX - lo) {
		/* All the blocks we can use are available here. */
		hi = lo + maxcount;
		*count = maxcount;
		mask = ~bitrange(lo, *count);
		if (hi == BLIST_RADIX)
			scan->bm_bighint = bighint;
	} else {
		/* Check next leaf for some of the blocks we want or need. */
		count1 = *count - (BLIST_RADIX - lo);
		maxcount -= BLIST_RADIX - lo;
		hi = blst_next_leaf_alloc(scan, blk, count1, maxcount);
		if (hi < count1)
			/*
			 * The next leaf cannot supply enough blocks to reach
			 * the minimum required allocation.  The hint cannot be
			 * updated, because the same allocation request could
			 * be satisfied later, by this leaf, if the state of
			 * the next leaf changes, and without any changes to
			 * this leaf.
			 */
			return (SWAPBLK_NONE);
		*count = BLIST_RADIX - lo + hi;
		scan->bm_bighint = bighint;
	}

	/* Clear the allocated bits from this leaf. */
	scan->bm_bitmap &= mask;
	return (blk + lo);
}

/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
    int maxcount, u_daddr_t radix)
{
	daddr_t blk, i, r, skip;
	u_daddr_t mask;
	bool scan_from_start;
	int digit;

	if (radix == 1)
		return (blst_leaf_alloc(scan, cursor, count, maxcount));
	blk = cursor & -(radix * BLIST_RADIX);
	scan_from_start = (cursor == blk);
	skip = radix_to_skip(radix);
	mask = scan->bm_bitmap;

	/* Discard any candidates that appear before cursor. */
	digit = (cursor / radix) & BLIST_MASK;
	mask &= (u_daddr_t)-1 << digit;
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * If the first try is for a block that includes the cursor, pre-undo
	 * the digit * radix offset in the first call; otherwise, ignore the
	 * cursor entirely.
	 */
	if (((mask >> digit) & 1) == 1)
		cursor -= digit * radix;
	else
		cursor = blk;

	/*
	 * Examine the nonempty subtree associated with each bit set in mask.
	 */
	do {
		digit = bitpos(mask);
		i = 1 + digit * skip;
		if (*count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th subtree.
			 */
			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
			    count, maxcount, radix / BLIST_RADIX);
			if (r != SWAPBLK_NONE) {
				if (scan[i].bm_bitmap == 0)
					scan->bm_bitmap ^= bitrange(digit, 1);
				return (r);
			}
		}
		cursor = blk;
	} while ((mask ^= bitrange(digit, 1)) != 0);

	/*
	 * We couldn't allocate count in this subtree.  If the whole tree was
	 * scanned, and the last tree node is allocated, update bighint.
	 */
	if (scan_from_start && !(digit == BLIST_RADIX - 1 &&
	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
		scan->bm_bighint = *count - 1;

	return (SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 *
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	mask = bitrange(blk & BLIST_MASK, count);
	KASSERT((scan->bm_bitmap & mask) == 0,
	    ("freeing free block: %jx, size %d, mask %jx",
	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
	scan->bm_bitmap |= mask;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, skip;
	int digit, endDigit;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest allocable block of data.
	 * If we just shoehorn it, a little extra overhead will be incurred
	 * on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_MAX_ALLOC;

	if (radix == 1)
		return (blst_leaf_free(scan, freeBlk, count));

	endBlk = freeBlk + count;
	blk = (freeBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
	/*
	 * blk is first block past the end of the range of this meta node,
	 * or 0 in case of overflow.
	 */
	if (blk != 0)
		endBlk = ummin(endBlk, blk);
	skip = radix_to_skip(radix);
	blk = freeBlk & -radix;
	digit = (blk / radix) & BLIST_MASK;
	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_MASK);
	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
		blk += radix;
		count = ummin(blk, endBlk) - freeBlk;
		blst_meta_free(&scan[i], freeBlk, count, radix / BLIST_RADIX);
		freeBlk = blk;
	}
}

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t endBlk, i, skip;

	/*
	 * Leaf node
	 */

	if (radix == 1) {
		u_daddr_t v = scan->bm_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->bm_bitmap == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}

	endBlk = blk + count;
	skip = radix_to_skip(radix);
	for (i = 1; blk < endBlk; i += skip) {
		blk += radix;
		count = radix;
		if (blk >= endBlk)
			count -= blk - endBlk;
		blst_copy(&scan[i], blk - radix,
		    radix / BLIST_RADIX, dest, count);
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;

	mask = bitrange(blk & BLIST_MASK, count);

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->bm_bitmap & mask);

	scan->bm_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, nblks, skip;
	int digit;

	if (radix == 1)
		return (blst_leaf_fill(scan, allocBlk, count));

	endBlk = allocBlk + count;
	blk = (allocBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
	/*
	 * blk is first block past the end of the range of this meta node,
	 * or 0 in case of overflow.
	 */
	if (blk != 0)
		endBlk = ummin(endBlk, blk);
	skip = radix_to_skip(radix);
	blk = allocBlk & -radix;
	nblks = 0;
	while (blk < endBlk) {
		digit = (blk / radix) & BLIST_MASK;
		i = 1 + digit * skip;
		blk += radix;
		count = ummin(blk, endBlk) - allocBlk;
		nblks += blst_meta_fill(&scan[i], allocBlk, count,
		    radix / BLIST_RADIX);
		if (scan[i].bm_bitmap == 0)
			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
		allocBlk = blk;
	}
	return (nblks);
}

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t skip;
	u_daddr_t mask;
	int digit;

	if (radix == 1) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)BLIST_RADIX,
		    (int)(1 + (BLIST_RADIX - 1) / 4),
		    (long long)scan->bm_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	printf(
	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix * BLIST_RADIX,
	    (long long)radix * BLIST_RADIX,
	    (int)(1 + (BLIST_RADIX - 1) / 4),
	    (long long)scan->bm_bitmap,
	    (long long)scan->bm_bighint
	);

	skip = radix_to_skip(radix);
	tab += 4;

	mask = scan->bm_bitmap;
	/* Examine the nonempty subtree associated with each bit set in mask */
	do {
		digit = bitpos(mask);
		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
		    radix / BLIST_RADIX, tab);
	} while ((mask ^= bitrange(digit, 1)) != 0);
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	daddr_t size = BLIST_RADIX * BLIST_RADIX;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtoll(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	if (bl == NULL) {
		fprintf(stderr, "blist_create failed\n");
		exit(1);
	}
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		int count = 0, maxcount = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix * BLIST_RADIX);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch(buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%d", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
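			/* FALLTHROUGH into 'p' to print the resized tree. */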
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) {
				daddr_t blk = blist_alloc(bl, &count, maxcount);
				printf("    R=%08llx, c=%08d\n",
				    (long long)blk, count);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d %d    -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help\n"
			    "q          -quit"
			);
			break;
		case 'q':
			break;
		default:
			printf("?\n");
			break;
		}
		if (buf[0] == 'q')
			break;
	}
	return (0);
}

#endif