1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29/*
30 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
31 *
32 *	This module implements a general bitmap allocator/deallocator.  The
33 *	allocator eats around 2 bits per 'block'.  The module does not
34 *	try to interpret the meaning of a 'block' other than to return
35 *	SWAPBLK_NONE on an allocation failure.
36 *
37 *	A radix tree controls access to pieces of the bitmap, and includes
38 *	auxiliary information at each interior node about the availabilty of
39 *	contiguous free blocks in the subtree rooted at that node.  A radix
40 *	constant defines the size of the bitmaps contained in a leaf node
41 *	and the number of descendents of each of the meta (interior) nodes.
42 *	Each subtree is associated with a range of blocks.  The root of any
43 *	subtree stores a hint field that defines an upper bound on the size
44 *	of the largest allocation that can begin in the associated block
45 *	range.  A hint is an upper bound on a potential allocation, but not
46 *	necessarily a tight upper bound.
47 *
48 *	The bitmap field in each node directs the search for available blocks.
49 *	For a leaf node, a bit is set if the corresponding block is free.  For a
50 *	meta node, a bit is set if the corresponding subtree contains a free
51 *	block somewhere within it.  The search at a meta node considers only
52 *	children of that node that represent a range that includes a free block.
53 *
54 * 	The hinting greatly increases code efficiency for allocations while
55 *	the general radix structure optimizes both allocations and frees.  The
56 *	radix tree should be able to operate well no matter how much
57 *	fragmentation there is and no matter how large a bitmap is used.
58 *
59 *	The blist code wires all necessary memory at creation time.  Neither
60 *	allocations nor frees require interaction with the memory subsystem.
61 *	The non-blocking nature of allocations and frees is required by swap
62 *	code (vm/swap_pager.c).
63 *
64 *	LAYOUT: The radix tree is laid out recursively using a linear array.
65 *	Each meta node is immediately followed (laid out sequentially in
66 *	memory) by BLIST_RADIX lower-level nodes.  This is a recursive
67 *	structure but one that can be easily scanned through a very simple
68 *	'skip' calculation.  The memory allocation is only large enough to
69 *	cover the number of blocks requested at creation time.  Nodes that
70 *	represent blocks beyond that limit, nodes that would never be read
71 *	or written, are not allocated, so that the last of the
72 *	BLIST_RADIX lower-level nodes of a some nodes may not be allocated.
73 *
74 *	NOTE: the allocator cannot currently allocate more than
75 *	BLIST_RADIX blocks per call.  It will panic with 'allocation too
76 *	large' if you try.  This is an area that could use improvement.  The
77 *	radix is large enough that this restriction does not effect the swap
78 *	system, though.  Currently only the allocation code is affected by
79 *	this algorithmic unfeature.  The freeing code can handle arbitrary
80 *	ranges.
81 *
82 *	This code can be compiled stand-alone for debugging.
83 */
84
85#include <sys/cdefs.h>
86#ifdef _KERNEL
87
88#include <sys/param.h>
89#include <sys/systm.h>
90#include <sys/lock.h>
91#include <sys/kernel.h>
92#include <sys/blist.h>
93#include <sys/malloc.h>
94#include <sys/sbuf.h>
95#include <sys/proc.h>
96#include <sys/mutex.h>
97
98#else
99
100#ifndef BLIST_NO_DEBUG
101#define BLIST_DEBUG
102#endif
103
104#include <sys/errno.h>
105#include <sys/types.h>
106#include <sys/malloc.h>
107#include <sys/sbuf.h>
108#include <assert.h>
109#include <stdio.h>
110#include <string.h>
111#include <stddef.h>
112#include <stdlib.h>
113#include <stdarg.h>
114#include <stdbool.h>
115
116#define	bitcount64(x)	__bitcount64((uint64_t)(x))
117#define malloc(a,b,c)	calloc(a, 1)
118#define free(a,b)	free(a)
119#define ummin(a,b)	((a) < (b) ? (a) : (b))
120#define imin(a,b)	((a) < (b) ? (a) : (b))
121#define KASSERT(a,b)	assert(a)
122
123#include <sys/blist.h>
124
125#endif
126
127/*
128 * static support functions
129 */
130static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk,
131    int *count, int maxcount);
132static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
133    int maxcount, u_daddr_t radix);
134static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
135static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
136		    u_daddr_t radix);
137static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
138		    blist_t dest, daddr_t count);
139static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
140static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
141		    u_daddr_t radix);
142#ifndef _KERNEL
143static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
144		    int tab);
145#endif
146
147#ifdef _KERNEL
148static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
149#endif
150
151#define	BLIST_MASK	(BLIST_RADIX - 1)
152
153/*
154 * For a subtree that can represent the state of up to 'radix' blocks, the
155 * number of leaf nodes of the subtree is L=radix/BLIST_RADIX.  If 'm'
156 * is short for BLIST_RADIX, then for a tree of height h with L=m**h
157 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
158 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
159 * in the 'meta' functions that process subtrees.  Since integer division
160 * discards remainders, we can express this computation as
161 * skip = (m * m**h) / (m - 1)
162 * skip = (m * (radix / m)) / (m - 1)
163 * skip = radix / (m - 1)
164 * so that simple integer division by a constant can safely be used for the
165 * calculation.
166 */
167static inline daddr_t
168radix_to_skip(daddr_t radix)
169{
170
171	return (radix / BLIST_MASK);
172}
173
174/*
175 * Provide a mask with count bits set, starting as position n.
176 */
177static inline u_daddr_t
178bitrange(int n, int count)
179{
180
181	return (((u_daddr_t)-1 << n) &
182	    ((u_daddr_t)-1 >> (BLIST_RADIX - (n + count))));
183}
184
185static inline int
186bitpos(u_daddr_t mask)
187{
188
189	_Static_assert(sizeof(long long) >= sizeof(mask),
190	    "mask too big for ffsll()");
191	return (ffsll(mask) - 1);
192}
193
194/*
195 * blist_create() - create a blist capable of handling up to the specified
196 *		    number of blocks
197 *
198 *	blocks - must be greater than 0
199 * 	flags  - malloc flags
200 *
201 *	The smallest blist consists of a single leaf node capable of
202 *	managing BLIST_RADIX blocks.
203 */
204blist_t
205blist_create(daddr_t blocks, int flags)
206{
207	blist_t bl;
208	u_daddr_t nodes, radix;
209
210	KASSERT(blocks > 0, ("invalid block count"));
211
212	/*
213	 * Calculate the radix and node count used for scanning.
214	 */
215	nodes = 1;
216	for (radix = 1; (blocks - 1) / BLIST_RADIX / radix > 0;
217	    radix *= BLIST_RADIX)
218		nodes += 1 + (blocks - 1) / BLIST_RADIX / radix;
219
220	/*
221	 * Include a sentinel node to ensure that cross-leaf scans stay within
222	 * the bounds of the allocation.
223	 */
224	if (blocks % BLIST_RADIX == 0)
225		nodes++;
226
227	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
228	    M_ZERO);
229	if (bl == NULL)
230		return (NULL);
231
232	bl->bl_blocks = blocks;
233	bl->bl_radix = radix;
234
235#if defined(BLIST_DEBUG)
236	printf(
237		"BLIST representing %lld blocks (%lld MB of swap)"
238		", requiring %lldK of ram\n",
239		(long long)bl->bl_blocks,
240		(long long)bl->bl_blocks * 4 / 1024,
241		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
242	);
243	printf("BLIST raw radix tree contains %lld records\n",
244	    (long long)nodes);
245#endif
246
247	return (bl);
248}
249
250void
251blist_destroy(blist_t bl)
252{
253
254	free(bl, M_SWAP);
255}
256
257/*
258 * blist_alloc() -   reserve space in the block bitmap.  Return the base
259 *		     of a contiguous region or SWAPBLK_NONE if space could
260 *		     not be allocated.
261 */
262daddr_t
263blist_alloc(blist_t bl, int *count, int maxcount)
264{
265	daddr_t blk, cursor;
266
267	KASSERT(*count <= maxcount,
268	    ("invalid parameters %d > %d", *count, maxcount));
269	KASSERT(*count <= BLIST_MAX_ALLOC,
270	    ("minimum allocation too large: %d", *count));
271
272	/*
273	 * This loop iterates at most twice.  An allocation failure in the
274	 * first iteration leads to a second iteration only if the cursor was
275	 * non-zero.  When the cursor is zero, an allocation failure will
276	 * stop further iterations.
277	 */
278	for (cursor = bl->bl_cursor;; cursor = 0) {
279		blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
280		    bl->bl_radix);
281		if (blk != SWAPBLK_NONE) {
282			bl->bl_avail -= *count;
283			bl->bl_cursor = blk + *count;
284			if (bl->bl_cursor == bl->bl_blocks)
285				bl->bl_cursor = 0;
286			return (blk);
287		}
288		if (cursor == 0)
289			return (SWAPBLK_NONE);
290	}
291}
292
293/*
294 * blist_avail() -	return the number of free blocks.
295 */
296daddr_t
297blist_avail(blist_t bl)
298{
299
300	return (bl->bl_avail);
301}
302
303/*
304 * blist_free() -	free up space in the block bitmap.  Return the base
305 *		     	of a contiguous region.
306 */
307void
308blist_free(blist_t bl, daddr_t blkno, daddr_t count)
309{
310
311	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
312	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
313	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
314	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
315	bl->bl_avail += count;
316}
317
318/*
319 * blist_fill() -	mark a region in the block bitmap as off-limits
320 *			to the allocator (i.e. allocate it), ignoring any
321 *			existing allocations.  Return the number of blocks
322 *			actually filled that were free before the call.
323 */
324daddr_t
325blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
326{
327	daddr_t filled;
328
329	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
330	    ("filling invalid range: blkno %jx, count %d, blocks %jd",
331	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
332	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
333	bl->bl_avail -= filled;
334	return (filled);
335}
336
337/*
338 * blist_resize() -	resize an existing radix tree to handle the
339 *			specified number of blocks.  This will reallocate
340 *			the tree and transfer the previous bitmap to the new
341 *			one.  When extending the tree you can specify whether
342 *			the new blocks are to left allocated or freed.
343 */
344void
345blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
346{
347    blist_t newbl = blist_create(count, flags);
348    blist_t save = *pbl;
349
350    *pbl = newbl;
351    if (count > save->bl_blocks)
352	    count = save->bl_blocks;
353    blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
354
355    /*
356     * If resizing upwards, should we free the new space or not?
357     */
358    if (freenew && count < newbl->bl_blocks) {
359	    blist_free(newbl, count, newbl->bl_blocks - count);
360    }
361    blist_destroy(save);
362}
363
364#ifdef BLIST_DEBUG
365
366/*
367 * blist_print()    - dump radix tree
368 */
369void
370blist_print(blist_t bl)
371{
372	printf("BLIST avail = %jd, cursor = %08jx {\n",
373	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);
374
375	if (bl->bl_root->bm_bitmap != 0)
376		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
377	printf("}\n");
378}
379
380#endif
381
382static const u_daddr_t fib[] = {
383	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
384	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
385	514229, 832040, 1346269, 2178309, 3524578,
386};
387
388/*
389 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
390 */
391struct gap_stats {
392	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
393	daddr_t	num;		/* number of gaps observed */
394	daddr_t	max;		/* largest gap size */
395	daddr_t	avg;		/* average gap size */
396	daddr_t	err;		/* sum - num * avg */
397	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
398	int	max_bucket;	/* last histo elt with nonzero val */
399};
400
401/*
402 * gap_stats_counting()    - is the state 'counting 1 bits'?
403 *                           or 'skipping 0 bits'?
404 */
405static inline bool
406gap_stats_counting(const struct gap_stats *stats)
407{
408
409	return (stats->start != SWAPBLK_NONE);
410}
411
412/*
413 * init_gap_stats()    - initialize stats on gap sizes
414 */
415static inline void
416init_gap_stats(struct gap_stats *stats)
417{
418
419	bzero(stats, sizeof(*stats));
420	stats->start = SWAPBLK_NONE;
421}
422
423/*
424 * update_gap_stats()    - update stats on gap sizes
425 */
426static void
427update_gap_stats(struct gap_stats *stats, daddr_t posn)
428{
429	daddr_t size;
430	int hi, lo, mid;
431
432	if (!gap_stats_counting(stats)) {
433		stats->start = posn;
434		return;
435	}
436	size = posn - stats->start;
437	stats->start = SWAPBLK_NONE;
438	if (size > stats->max)
439		stats->max = size;
440
441	/*
442	 * Find the fibonacci range that contains size,
443	 * expecting to find it in an early range.
444	 */
445	lo = 0;
446	hi = 1;
447	while (hi < nitems(fib) && fib[hi] <= size) {
448		lo = hi;
449		hi *= 2;
450	}
451	if (hi >= nitems(fib))
452		hi = nitems(fib);
453	while (lo + 1 != hi) {
454		mid = (lo + hi) >> 1;
455		if (fib[mid] <= size)
456			lo = mid;
457		else
458			hi = mid;
459	}
460	stats->histo[lo]++;
461	if (lo > stats->max_bucket)
462		stats->max_bucket = lo;
463	stats->err += size - stats->avg;
464	stats->num++;
465	stats->avg += stats->err / stats->num;
466	stats->err %= stats->num;
467}
468
469/*
470 * dump_gap_stats()    - print stats on gap sizes
471 */
472static inline void
473dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
474{
475	int i;
476
477	sbuf_printf(s, "number of maximal free ranges: %jd\n",
478	    (intmax_t)stats->num);
479	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
480	sbuf_printf(s, "average maximal free range size: %jd\n",
481	    (intmax_t)stats->avg);
482	sbuf_cat(s, "number of maximal free ranges of different sizes:\n");
483	sbuf_cat(s, "               count  |  size range\n");
484	sbuf_cat(s, "               -----  |  ----------\n");
485	for (i = 0; i < stats->max_bucket; i++) {
486		if (stats->histo[i] != 0) {
487			sbuf_printf(s, "%20jd  |  ",
488			    (intmax_t)stats->histo[i]);
489			if (fib[i] != fib[i + 1] - 1)
490				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
491				    (intmax_t)fib[i + 1] - 1);
492			else
493				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
494		}
495	}
496	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
497	if (stats->histo[i] > 1)
498		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
499		    (intmax_t)stats->max);
500	else
501		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
502}
503
504/*
505 * blist_stats()    - dump radix tree stats
506 */
507void
508blist_stats(blist_t bl, struct sbuf *s)
509{
510	struct gap_stats gstats;
511	struct gap_stats *stats = &gstats;
512	daddr_t i, nodes, radix;
513	u_daddr_t diff, mask;
514	int digit;
515
516	init_gap_stats(stats);
517	nodes = 0;
518	radix = bl->bl_radix;
519	for (i = 0; i < bl->bl_blocks; ) {
520		/*
521		 * Check for skippable subtrees starting at i.
522		 */
523		while (radix != 1) {
524			if (bl->bl_root[nodes].bm_bitmap == 0) {
525				if (gap_stats_counting(stats))
526					update_gap_stats(stats, i);
527				break;
528			}
529
530			/*
531			 * Skip subtree root.
532			 */
533			nodes++;
534			radix /= BLIST_RADIX;
535		}
536		if (radix == 1) {
537			/*
538			 * Scan leaf.
539			 */
540			mask = bl->bl_root[nodes].bm_bitmap;
541			diff = mask ^ (mask << 1);
542			if (gap_stats_counting(stats))
543				diff ^= 1;
544			while (diff != 0) {
545				digit = bitpos(diff);
546				update_gap_stats(stats, i + digit);
547				diff ^= bitrange(digit, 1);
548			}
549		}
550		nodes += radix_to_skip(radix * BLIST_RADIX);
551		i += radix * BLIST_RADIX;
552
553		/*
554		 * Find max size subtree starting at i.
555		 */
556		for (radix = 1;
557		    ((i / BLIST_RADIX / radix) & BLIST_MASK) == 0;
558		    radix *= BLIST_RADIX)
559			;
560	}
561	update_gap_stats(stats, i);
562	dump_gap_stats(stats, s);
563}
564
565/************************************************************************
566 *			  ALLOCATION SUPPORT FUNCTIONS			*
567 ************************************************************************
568 *
569 *	These support functions do all the actual work.  They may seem
570 *	rather longish, but that's because I've commented them up.  The
571 *	actual code is straight forward.
572 *
573 */
574
575/*
576 * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf.
577 *
578 *	'scan' is a leaf node, and its first block is at address 'start'.  The
579 *	next leaf node could be adjacent, or several nodes away if the least
580 *	common ancestor of 'scan' and its neighbor is several levels up.  Use
581 *	addresses to determine how many meta-nodes lie between the leaves.  If
582 *	sequence of leaves starting with the next one has enough initial bits
583 *	set, clear them and clear the bits in the meta nodes on the path up to
584 *	the least common ancestor to mark any subtrees made completely empty.
585 */
586static int
587blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount)
588{
589	u_daddr_t radix;
590	daddr_t blk;
591	int avail, digit;
592
593	start += BLIST_RADIX;
594	for (blk = start; blk - start < maxcount; blk += BLIST_RADIX) {
595		/* Skip meta-nodes, as long as they promise more free blocks. */
596		radix = BLIST_RADIX;
597		while (((++scan)->bm_bitmap & 1) == 1 &&
598		    ((blk / radix) & BLIST_MASK) == 0)
599			radix *= BLIST_RADIX;
600		if (~scan->bm_bitmap != 0) {
601			/*
602			 * Either there is no next leaf with any free blocks,
603			 * or we've reached the next leaf and found that some
604			 * of its blocks are not free.  In the first case,
605			 * bitpos() returns zero here.
606			 */
607			avail = blk - start + bitpos(~scan->bm_bitmap);
608			if (avail < count || avail == 0) {
609				/*
610				 * There isn't a next leaf with enough free
611				 * blocks at its beginning to bother
612				 * allocating.
613				 */
614				return (avail);
615			}
616			maxcount = imin(avail, maxcount);
617			if (maxcount % BLIST_RADIX == 0) {
618				/*
619				 * There was no next leaf.  Back scan up to
620				 * last leaf.
621				 */
622				do {
623					radix /= BLIST_RADIX;
624					--scan;
625				} while (radix != 1);
626				blk -= BLIST_RADIX;
627			}
628		}
629	}
630
631	/*
632	 * 'scan' is the last leaf that provides blocks.  Clear from 1 to
633	 * BLIST_RADIX bits to represent the allocation of those last blocks.
634	 */
635	if (maxcount % BLIST_RADIX != 0)
636		scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_RADIX);
637	else
638		scan->bm_bitmap = 0;
639
640	for (;;) {
641		/* Back up over meta-nodes, clearing bits if necessary. */
642		blk -= BLIST_RADIX;
643		for (radix = BLIST_RADIX;
644		    (digit = ((blk / radix) & BLIST_MASK)) == 0;
645		    radix *= BLIST_RADIX) {
646			if ((scan--)->bm_bitmap == 0)
647				scan->bm_bitmap ^= 1;
648		}
649		if ((scan--)->bm_bitmap == 0)
650			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
651			    (u_daddr_t)1 << digit;
652
653		if (blk == start)
654			break;
655		/* Clear all the bits of this leaf. */
656		scan->bm_bitmap = 0;
657	}
658	return (maxcount);
659}
660
661/*
662 * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
663 *
664 *	This function is the core of the allocator.  Its execution time is
665 *	proportional to log(count), plus height of the tree if the allocation
666 *	crosses a leaf boundary.
667 */
668static daddr_t
669blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount)
670{
671	u_daddr_t mask;
672	int bighint, count1, hi, lo, num_shifts;
673
674	count1 = *count - 1;
675	num_shifts = fls(count1);
676	mask = ~scan->bm_bitmap;
677	while ((mask & (mask + 1)) != 0 && num_shifts > 0) {
678		/*
679		 * If bit i is 0 in mask, then bits in [i, i + (count1 >>
680		 * num_shifts)] are 1 in scan->bm_bitmap.  Reduce num_shifts to
681		 * 0, while preserving this invariant.  The updates to mask
682		 * leave fewer bits 0, but each bit that remains 0 represents a
683		 * longer string of consecutive 1-bits in scan->bm_bitmap.  If
684		 * more updates to mask cannot set more bits, because mask is
685		 * partitioned with all 1 bits following all 0 bits, the loop
686		 * terminates immediately.
687		 */
688		num_shifts--;
689		mask |= mask >> ((count1 >> num_shifts) + 1) / 2;
690	}
691	bighint = count1 >> num_shifts;
692	if (~mask == 0) {
693		/*
694		 * Update bighint.  There is no allocation bigger than
695		 * count1 >> num_shifts starting in this leaf.
696		 */
697		scan->bm_bighint = bighint;
698		return (SWAPBLK_NONE);
699	}
700
701	/* Discard any candidates that appear before blk. */
702	if ((blk & BLIST_MASK) != 0) {
703		if ((~mask & bitrange(0, blk & BLIST_MASK)) != 0) {
704			/* Grow bighint in case all discarded bits are set. */
705			bighint += blk & BLIST_MASK;
706			mask |= bitrange(0, blk & BLIST_MASK);
707			if (~mask == 0) {
708				scan->bm_bighint = bighint;
709				return (SWAPBLK_NONE);
710			}
711		}
712		blk -= blk & BLIST_MASK;
713	}
714
715	/*
716	 * The least significant set bit in mask marks the start of the first
717	 * available range of sufficient size.  Find its position.
718	 */
719	lo = bitpos(~mask);
720
721	/*
722	 * Find how much space is available starting at that position.
723	 */
724	if ((mask & (mask + 1)) != 0) {
725		/* Count the 1 bits starting at position lo. */
726		hi = bitpos(mask & (mask + 1)) + count1;
727		if (maxcount < hi - lo)
728			hi = lo + maxcount;
729		*count = hi - lo;
730		mask = ~bitrange(lo, *count);
731	} else if (maxcount <= BLIST_RADIX - lo) {
732		/* All the blocks we can use are available here. */
733		hi = lo + maxcount;
734		*count = maxcount;
735		mask = ~bitrange(lo, *count);
736		if (hi == BLIST_RADIX)
737			scan->bm_bighint = bighint;
738	} else {
739		/* Check next leaf for some of the blocks we want or need. */
740		count1 = *count - (BLIST_RADIX - lo);
741		maxcount -= BLIST_RADIX - lo;
742		hi = blst_next_leaf_alloc(scan, blk, count1, maxcount);
743		if (hi < count1)
744			/*
745			 * The next leaf cannot supply enough blocks to reach
746			 * the minimum required allocation.  The hint cannot be
747			 * updated, because the same allocation request could
748			 * be satisfied later, by this leaf, if the state of
749			 * the next leaf changes, and without any changes to
750			 * this leaf.
751			 */
752			return (SWAPBLK_NONE);
753		*count = BLIST_RADIX - lo + hi;
754		scan->bm_bighint = bighint;
755	}
756
757	/* Clear the allocated bits from this leaf. */
758	scan->bm_bitmap &= mask;
759	return (blk + lo);
760}
761
762/*
763 * blist_meta_alloc() -	allocate at a meta in the radix tree.
764 *
765 *	Attempt to allocate at a meta node.  If we can't, we update
766 *	bighint and return a failure.  Updating bighint optimize future
767 *	calls that hit this node.  We have to check for our collapse cases
768 *	and we have a few optimizations strewn in as well.
769 */
770static daddr_t
771blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
772    int maxcount, u_daddr_t radix)
773{
774	daddr_t blk, i, r, skip;
775	u_daddr_t mask;
776	bool scan_from_start;
777	int digit;
778
779	if (radix == 1)
780		return (blst_leaf_alloc(scan, cursor, count, maxcount));
781	blk = cursor & -(radix * BLIST_RADIX);
782	scan_from_start = (cursor == blk);
783	skip = radix_to_skip(radix);
784	mask = scan->bm_bitmap;
785
786	/* Discard any candidates that appear before cursor. */
787	digit = (cursor / radix) & BLIST_MASK;
788	mask &= (u_daddr_t)-1 << digit;
789	if (mask == 0)
790		return (SWAPBLK_NONE);
791
792	/*
793	 * If the first try is for a block that includes the cursor, pre-undo
794	 * the digit * radix offset in the first call; otherwise, ignore the
795	 * cursor entirely.
796	 */
797	if (((mask >> digit) & 1) == 1)
798		cursor -= digit * radix;
799	else
800		cursor = blk;
801
802	/*
803	 * Examine the nonempty subtree associated with each bit set in mask.
804	 */
805	do {
806		digit = bitpos(mask);
807		i = 1 + digit * skip;
808		if (*count <= scan[i].bm_bighint) {
809			/*
810			 * The allocation might fit beginning in the i'th subtree.
811			 */
812			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
813			    count, maxcount, radix / BLIST_RADIX);
814			if (r != SWAPBLK_NONE) {
815				if (scan[i].bm_bitmap == 0)
816					scan->bm_bitmap ^= bitrange(digit, 1);
817				return (r);
818			}
819		}
820		cursor = blk;
821	} while ((mask ^= bitrange(digit, 1)) != 0);
822
823	/*
824	 * We couldn't allocate count in this subtree.  If the whole tree was
825	 * scanned, and the last tree node is allocated, update bighint.
826	 */
827	if (scan_from_start && !(digit == BLIST_RADIX - 1 &&
828	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
829		scan->bm_bighint = *count - 1;
830
831	return (SWAPBLK_NONE);
832}
833
834/*
835 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
836 *
837 */
838static void
839blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
840{
841	u_daddr_t mask;
842
843	/*
844	 * free some data in this bitmap
845	 * mask=0000111111111110000
846	 *          \_________/\__/
847	 *		count   n
848	 */
849	mask = bitrange(blk & BLIST_MASK, count);
850	KASSERT((scan->bm_bitmap & mask) == 0,
851	    ("freeing free block: %jx, size %d, mask %jx",
852	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
853	scan->bm_bitmap |= mask;
854}
855
856/*
857 * BLST_META_FREE() - free allocated blocks from radix tree meta info
858 *
859 *	This support routine frees a range of blocks from the bitmap.
860 *	The range must be entirely enclosed by this radix node.  If a
861 *	meta node, we break the range down recursively to free blocks
862 *	in subnodes (which means that this code can free an arbitrary
863 *	range whereas the allocation code cannot allocate an arbitrary
864 *	range).
865 */
866static void
867blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
868{
869	daddr_t blk, endBlk, i, skip;
870	int digit, endDigit;
871
872	/*
873	 * We could probably do a better job here.  We are required to make
874	 * bighint at least as large as the biggest allocable block of data.
875	 * If we just shoehorn it, a little extra overhead will be incurred
876	 * on the next allocation (but only that one typically).
877	 */
878	scan->bm_bighint = BLIST_MAX_ALLOC;
879
880	if (radix == 1)
881		return (blst_leaf_free(scan, freeBlk, count));
882
883	endBlk = freeBlk + count;
884	blk = (freeBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
885	/*
886	 * blk is first block past the end of the range of this meta node,
887	 * or 0 in case of overflow.
888	 */
889	if (blk != 0)
890		endBlk = ummin(endBlk, blk);
891	skip = radix_to_skip(radix);
892	blk = freeBlk & -radix;
893	digit = (blk / radix) & BLIST_MASK;
894	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_MASK);
895	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
896	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
897		blk += radix;
898		count = ummin(blk, endBlk) - freeBlk;
899		blst_meta_free(&scan[i], freeBlk, count, radix / BLIST_RADIX);
900		freeBlk = blk;
901	}
902}
903
904/*
905 * BLST_COPY() - copy one radix tree to another
906 *
907 *	Locates free space in the source tree and frees it in the destination
908 *	tree.  The space may not already be free in the destination.
909 */
910static void
911blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
912    daddr_t count)
913{
914	daddr_t endBlk, i, skip;
915
916	/*
917	 * Leaf node
918	 */
919
920	if (radix == 1) {
921		u_daddr_t v = scan->bm_bitmap;
922
923		if (v == (u_daddr_t)-1) {
924			blist_free(dest, blk, count);
925		} else if (v != 0) {
926			int i;
927
928			for (i = 0; i < count; ++i) {
929				if (v & ((u_daddr_t)1 << i))
930					blist_free(dest, blk + i, 1);
931			}
932		}
933		return;
934	}
935
936	/*
937	 * Meta node
938	 */
939
940	if (scan->bm_bitmap == 0) {
941		/*
942		 * Source all allocated, leave dest allocated
943		 */
944		return;
945	}
946
947	endBlk = blk + count;
948	skip = radix_to_skip(radix);
949	for (i = 1; blk < endBlk; i += skip) {
950		blk += radix;
951		count = radix;
952		if (blk >= endBlk)
953			count -= blk - endBlk;
954		blst_copy(&scan[i], blk - radix,
955		    radix / BLIST_RADIX, dest, count);
956	}
957}
958
959/*
960 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
961 *
962 *	This routine allocates all blocks in the specified range
963 *	regardless of any existing allocations in that range.  Returns
964 *	the number of blocks allocated by the call.
965 */
966static daddr_t
967blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
968{
969	daddr_t nblks;
970	u_daddr_t mask;
971
972	mask = bitrange(blk & BLIST_MASK, count);
973
974	/* Count the number of blocks that we are allocating. */
975	nblks = bitcount64(scan->bm_bitmap & mask);
976
977	scan->bm_bitmap &= ~mask;
978	return (nblks);
979}
980
981/*
982 * BLIST_META_FILL() -	allocate specific blocks at a meta node
983 *
984 *	This routine allocates the specified range of blocks,
985 *	regardless of any existing allocations in the range.  The
986 *	range must be within the extent of this node.  Returns the
987 *	number of blocks allocated by the call.
988 */
989static daddr_t
990blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
991{
992	daddr_t blk, endBlk, i, nblks, skip;
993	int digit;
994
995	if (radix == 1)
996		return (blst_leaf_fill(scan, allocBlk, count));
997
998	endBlk = allocBlk + count;
999	blk = (allocBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
1000	/*
1001	 * blk is first block past the end of the range of this meta node,
1002	 * or 0 in case of overflow.
1003	 */
1004	if (blk != 0)
1005		endBlk = ummin(endBlk, blk);
1006	skip = radix_to_skip(radix);
1007	blk = allocBlk & -radix;
1008	nblks = 0;
1009	while (blk < endBlk) {
1010		digit = (blk / radix) & BLIST_MASK;
1011		i = 1 + digit * skip;
1012		blk += radix;
1013		count = ummin(blk, endBlk) - allocBlk;
1014		nblks += blst_meta_fill(&scan[i], allocBlk, count,
1015		    radix / BLIST_RADIX);
1016		if (scan[i].bm_bitmap == 0)
1017			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
1018		allocBlk = blk;
1019	}
1020	return (nblks);
1021}
1022
1023#ifdef BLIST_DEBUG
1024
1025static void
1026blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1027{
1028	daddr_t skip;
1029	u_daddr_t mask;
1030	int digit;
1031
1032	if (radix == 1) {
1033		printf(
1034		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
1035		    tab, tab, "",
1036		    (long long)blk, (long long)BLIST_RADIX,
1037		    (int)(1 + (BLIST_RADIX - 1) / 4),
1038		    (long long)scan->bm_bitmap,
1039		    (long long)scan->bm_bighint
1040		);
1041		return;
1042	}
1043
1044	printf(
1045	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
1046	    tab, tab, "",
1047	    (long long)blk, (long long)radix * BLIST_RADIX,
1048	    (long long)radix * BLIST_RADIX,
1049	    (int)(1 + (BLIST_RADIX - 1) / 4),
1050	    (long long)scan->bm_bitmap,
1051	    (long long)scan->bm_bighint
1052	);
1053
1054	skip = radix_to_skip(radix);
1055	tab += 4;
1056
1057	mask = scan->bm_bitmap;
1058	/* Examine the nonempty subtree associated with each bit set in mask */
1059	do {
1060		digit = bitpos(mask);
1061		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
1062		    radix / BLIST_RADIX, tab);
1063	} while ((mask ^= bitrange(digit, 1)) != 0);
1064	tab -= 4;
1065
1066	printf(
1067	    "%*.*s}\n",
1068	    tab, tab, ""
1069	);
1070}
1071
1072#endif
1073
1074#ifdef BLIST_DEBUG
1075
1076int
1077main(int ac, char **av)
1078{
1079	daddr_t size = BLIST_RADIX * BLIST_RADIX;
1080	int i;
1081	blist_t bl;
1082	struct sbuf *s;
1083
1084	for (i = 1; i < ac; ++i) {
1085		const char *ptr = av[i];
1086		if (*ptr != '-') {
1087			size = strtoll(ptr, NULL, 0);
1088			continue;
1089		}
1090		ptr += 2;
1091		fprintf(stderr, "Bad option: %s\n", ptr - 2);
1092		exit(1);
1093	}
1094	bl = blist_create(size, M_WAITOK);
1095	if (bl == NULL) {
1096		fprintf(stderr, "blist_create failed\n");
1097		exit(1);
1098	}
1099	blist_free(bl, 0, size);
1100
1101	for (;;) {
1102		char buf[1024];
1103		long long da = 0;
1104		int count = 0, maxcount = 0;
1105
1106		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1107		    (long long)size, (long long)bl->bl_radix * BLIST_RADIX);
1108		fflush(stdout);
1109		if (fgets(buf, sizeof(buf), stdin) == NULL)
1110			break;
1111		switch(buf[0]) {
1112		case 'r':
1113			if (sscanf(buf + 1, "%d", &count) == 1) {
1114				blist_resize(&bl, count, 1, M_WAITOK);
1115			} else {
1116				printf("?\n");
1117			}
1118		case 'p':
1119			blist_print(bl);
1120			break;
1121		case 's':
1122			s = sbuf_new_auto();
1123			blist_stats(bl, s);
1124			sbuf_finish(s);
1125			printf("%s", sbuf_data(s));
1126			sbuf_delete(s);
1127			break;
1128		case 'a':
1129			if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) {
1130				daddr_t blk = blist_alloc(bl, &count, maxcount);
1131				printf("    R=%08llx, c=%08d\n",
1132				    (long long)blk, count);
1133			} else {
1134				printf("?\n");
1135			}
1136			break;
1137		case 'f':
1138			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1139				blist_free(bl, da, count);
1140			} else {
1141				printf("?\n");
1142			}
1143			break;
1144		case 'l':
1145			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1146				printf("    n=%jd\n",
1147				    (intmax_t)blist_fill(bl, da, count));
1148			} else {
1149				printf("?\n");
1150			}
1151			break;
1152		case '?':
1153		case 'h':
1154			puts(
1155			    "p          -print\n"
1156			    "s          -stats\n"
1157			    "a %d %d    -allocate\n"
1158			    "f %x %d    -free\n"
1159			    "l %x %d    -fill\n"
1160			    "r %d       -resize\n"
1161			    "h/?        -help\n"
1162			    "q          -quit"
1163			);
1164			break;
1165		case 'q':
1166			break;
1167		default:
1168			printf("?\n");
1169			break;
1170		}
1171		if (buf[0] == 'q')
1172			break;
1173	}
1174	return (0);
1175}
1176
1177#endif
1178