1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2002-2006 Rice University
5 * Copyright (c) 2007-2011 Alan L. Cox <alc@cs.rice.edu>
6 * All rights reserved.
7 *
8 * This software was developed for the FreeBSD Project by Alan L. Cox,
9 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
27 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
30 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34/*
35 *	Superpage reservation management module
36 *
37 * Any external functions defined by this module are only to be used by the
38 * virtual memory system.
39 */
40
41#include <sys/cdefs.h>
42#include "opt_vm.h"
43
44#include <sys/param.h>
45#include <sys/kernel.h>
46#include <sys/lock.h>
47#include <sys/malloc.h>
48#include <sys/mutex.h>
49#include <sys/queue.h>
50#include <sys/rwlock.h>
51#include <sys/sbuf.h>
52#include <sys/sysctl.h>
53#include <sys/systm.h>
54#include <sys/bitstring.h>
55#include <sys/counter.h>
56#include <sys/ktr.h>
57#include <sys/vmmeter.h>
58#include <sys/smp.h>
59
60#include <vm/vm.h>
61#include <vm/vm_extern.h>
62#include <vm/vm_param.h>
63#include <vm/vm_object.h>
64#include <vm/vm_page.h>
65#include <vm/vm_pageout.h>
66#include <vm/vm_pagequeue.h>
67#include <vm/vm_phys.h>
68#include <vm/vm_radix.h>
69#include <vm/vm_reserv.h>
70
71/*
72 * The reservation system supports the speculative allocation of large physical
73 * pages ("superpages").  Speculative allocation enables the fully automatic
74 * utilization of superpages by the virtual memory system.  In other words, no
75 * programmatic directives are required to use superpages.
76 */
77
78#if VM_NRESERVLEVEL > 0
79
80#ifndef VM_LEVEL_0_ORDER_MAX
81#define	VM_LEVEL_0_ORDER_MAX	VM_LEVEL_0_ORDER
82#endif
83
84/*
85 * The number of small pages that are contained in a level 0 reservation
86 */
87#define	VM_LEVEL_0_NPAGES	(1 << VM_LEVEL_0_ORDER)
88#define	VM_LEVEL_0_NPAGES_MAX	(1 << VM_LEVEL_0_ORDER_MAX)
89
90/*
91 * The number of bits by which a physical address is shifted to obtain the
92 * reservation number
93 */
94#define	VM_LEVEL_0_SHIFT	(VM_LEVEL_0_ORDER + PAGE_SHIFT)
95
96/*
97 * The size of a level 0 reservation in bytes
98 */
99#define	VM_LEVEL_0_SIZE		(1 << VM_LEVEL_0_SHIFT)
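
/*
 * Worked example of the constants above (illustrative; the values are those
 * of amd64, where VM_LEVEL_0_ORDER is 9 and PAGE_SHIFT is 12):
 *
 *	VM_LEVEL_0_NPAGES = 1 << 9  = 512 small pages per reservation
 *	VM_LEVEL_0_SHIFT  = 9 + 12  = 21
 *	VM_LEVEL_0_SIZE   = 1 << 21 = 2MB, the size of one superpage
 */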
100
101/*
102 * Computes the index of the small page underlying the given (object, pindex)
103 * within the reservation's array of small pages.
104 */
105#define	VM_RESERV_INDEX(object, pindex)	\
106    (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1))
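
/*
 * Worked example (illustrative, assuming VM_LEVEL_0_NPAGES == 512): for an
 * object with pg_color == 5, pindex 0 maps to index 5 within its reservation
 * and pindex 507 maps to index (5 + 507) & 511 == 0, i.e., it would begin a
 * new reservation.  The color term keeps an object's pages at the offsets
 * they would occupy within a naturally aligned superpage.
 */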
107
108/*
109 * Number of elapsed ticks before we update the LRU queue position.  Used
110 * to reduce contention and churn on the list.
111 */
112#define	PARTPOPSLOP	1
113
114/*
115 * The reservation structure
116 *
117 * A reservation structure is constructed whenever a large physical page is
118 * speculatively allocated to an object.  The reservation provides the small
119 * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets
120 * within that object.  The reservation's "popcnt" tracks the number of these
121 * small physical pages that are in use at any given time.  When and if the
122 * reservation is not fully utilized, it appears in the queue of partially
123 * populated reservations.  The reservation always appears on the containing
124 * object's list of reservations.
125 *
126 * A partially populated reservation can be broken and reclaimed at any time.
127 *
128 * c - constant after boot
129 * d - vm_reserv_domain_lock
130 * o - vm_reserv_object_lock
131 * r - vm_reserv_lock
132 * s - vm_reserv_domain_scan_lock
133 */
134struct vm_reserv {
135	struct mtx	lock;			/* reservation lock. */
136	TAILQ_ENTRY(vm_reserv) partpopq;	/* (d, r) per-domain queue. */
137	LIST_ENTRY(vm_reserv) objq;		/* (o, r) object queue */
138	vm_object_t	object;			/* (o, r) containing object */
139	vm_pindex_t	pindex;			/* (o, r) offset in object */
140	vm_page_t	pages;			/* (c) first page  */
141	uint16_t	popcnt;			/* (r) # of pages in use */
142	uint8_t		domain;			/* (c) NUMA domain. */
143	char		inpartpopq;		/* (d, r) */
144	int		lasttick;		/* (r) last pop update tick. */
145	bitstr_t	bit_decl(popmap, VM_LEVEL_0_NPAGES_MAX);
146						/* (r) bit vector, used pages */
147};
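
/*
 * Invariant (sketch): bit i of "popmap" is set iff rv->pages[i] is in use, so
 * "popcnt" always equals the number of set bits.  An illustrative check,
 * assuming the bit_count() helper from sys/bitstring.h:
 *
 *	int nset;
 *
 *	bit_count(rv->popmap, 0, VM_LEVEL_0_NPAGES, &nset);
 *	KASSERT(nset == rv->popcnt, ("popcnt does not match popmap"));
 */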
148
149TAILQ_HEAD(vm_reserv_queue, vm_reserv);
150
151#define	vm_reserv_lockptr(rv)		(&(rv)->lock)
152#define	vm_reserv_assert_locked(rv)					\
153	    mtx_assert(vm_reserv_lockptr(rv), MA_OWNED)
154#define	vm_reserv_lock(rv)		mtx_lock(vm_reserv_lockptr(rv))
155#define	vm_reserv_trylock(rv)		mtx_trylock(vm_reserv_lockptr(rv))
156#define	vm_reserv_unlock(rv)		mtx_unlock(vm_reserv_lockptr(rv))
157
158/*
159 * The reservation array
160 *
161 * This array is analogous in function to vm_page_array.  It differs in the
162 * respect that it may contain a greater number of reservation structures
163 * than there are (physical) superpages.  These "invalid" reservation
164 * structures exist to trade off space for time in the
165 * implementation of vm_reserv_from_page().  Invalid reservation structures are
166 * distinguishable from "valid" reservation structures by inspecting the
167 * reservation's "pages" field.  Invalid reservation structures have a NULL
168 * "pages" field.
169 *
170 * vm_reserv_from_page() maps a small (physical) page to an element of this
171 * array by computing a physical reservation number from the page's physical
172 * address.  The physical reservation number is used as the array index.
173 *
174 * An "active" reservation is a valid reservation structure that has a non-NULL
175 * "object" field and a non-zero "popcnt" field.  In other words, every active
176 * reservation belongs to a particular object.  Moreover, every active
177 * reservation has an entry in the containing object's list of reservations.
178 */
179static vm_reserv_t vm_reserv_array;
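
/*
 * Illustrative sketch of the mapping described above for the dense
 * (!VM_PHYSSEG_SPARSE) layout; vm_reserv_from_page() below is the real
 * implementation:
 *
 *	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
 *	vm_reserv_t rv = &vm_reserv_array[pa >> VM_LEVEL_0_SHIFT];
 *
 *	if (rv->pages == NULL)
 *		(invalid entry: no superpage underlies this address)
 */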
180
181/*
182 * The per-domain partially populated reservation queues
183 *
184 * These queues enable the fast recovery of an unused free small page from a
185 * partially populated reservation.  The reservation at the head of a queue
186 * is the least recently changed, partially populated reservation.
187 *
188 * Access to this queue is synchronized by the per-domain reservation lock.
189 * Threads reclaiming free pages from the queue must hold the per-domain scan
190 * lock.
191 */
192struct vm_reserv_domain {
193	struct mtx 		lock;
194	struct vm_reserv_queue	partpop;	/* (d) */
195	struct vm_reserv	marker;		/* (d, s) scan marker/lock */
196} __aligned(CACHE_LINE_SIZE);
197
198static struct vm_reserv_domain vm_rvd[MAXMEMDOM];
199
200#define	vm_reserv_domain_lockptr(d)	(&vm_rvd[(d)].lock)
201#define	vm_reserv_domain_assert_locked(d)	\
202	mtx_assert(vm_reserv_domain_lockptr(d), MA_OWNED)
203#define	vm_reserv_domain_lock(d)	mtx_lock(vm_reserv_domain_lockptr(d))
204#define	vm_reserv_domain_unlock(d)	mtx_unlock(vm_reserv_domain_lockptr(d))
205
206#define	vm_reserv_domain_scan_lock(d)	mtx_lock(&vm_rvd[(d)].marker.lock)
207#define	vm_reserv_domain_scan_unlock(d)	mtx_unlock(&vm_rvd[(d)].marker.lock)
208
209static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
210    "Reservation Info");
211
212static COUNTER_U64_DEFINE_EARLY(vm_reserv_broken);
213SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
214    &vm_reserv_broken, "Cumulative number of broken reservations");
215
216static COUNTER_U64_DEFINE_EARLY(vm_reserv_freed);
217SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
218    &vm_reserv_freed, "Cumulative number of freed reservations");
219
220static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);
221
222SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD,
223    NULL, 0, sysctl_vm_reserv_fullpop, "I", "Current number of full reservations");
224
225static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);
226
227SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq,
228    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
229    sysctl_vm_reserv_partpopq, "A",
230    "Partially populated reservation queues");
231
232static COUNTER_U64_DEFINE_EARLY(vm_reserv_reclaimed);
233SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
234    &vm_reserv_reclaimed, "Cumulative number of reclaimed reservations");
235
236/*
237 * The object lock pool is used to synchronize the rvq.  We cannot use a
238 * pool mutex because it is needed before malloc is available.
239 *
240 * The "hash" function could be made faster without divide and modulo.
241 */
242#define	VM_RESERV_OBJ_LOCK_COUNT	MAXCPU
243
244struct mtx_padalign vm_reserv_object_mtx[VM_RESERV_OBJ_LOCK_COUNT];
245
246#define	vm_reserv_object_lock_idx(object)			\
247	    (((uintptr_t)object / sizeof(*object)) % VM_RESERV_OBJ_LOCK_COUNT)
248#define	vm_reserv_object_lock_ptr(object)			\
249	    &vm_reserv_object_mtx[vm_reserv_object_lock_idx((object))]
250#define	vm_reserv_object_lock(object)				\
251	    mtx_lock(vm_reserv_object_lock_ptr((object)))
252#define	vm_reserv_object_unlock(object)				\
253	    mtx_unlock(vm_reserv_object_lock_ptr((object)))
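
/*
 * A possible faster "hash", as hinted at above (a sketch only, not what this
 * file does): size the lock table to a power of two and replace the divide
 * and modulo with a shift and mask, e.g.
 *
 *	#define	VM_RESERV_OBJ_LOCK_MASK	(VM_RESERV_OBJ_LOCK_COUNT - 1)
 *	#define	vm_reserv_object_lock_idx(object)		\
 *	    (((uintptr_t)(object) >> 6) & VM_RESERV_OBJ_LOCK_MASK)
 *
 * where the right shift discards low-order address bits that are constant
 * due to allocation alignment.
 */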
254
255static void		vm_reserv_break(vm_reserv_t rv);
256static void		vm_reserv_depopulate(vm_reserv_t rv, int index);
257static vm_reserv_t	vm_reserv_from_page(vm_page_t m);
258static boolean_t	vm_reserv_has_pindex(vm_reserv_t rv,
259			    vm_pindex_t pindex);
260static void		vm_reserv_populate(vm_reserv_t rv, int index);
261static void		vm_reserv_reclaim(vm_reserv_t rv);
262
263/*
264 * Returns the current number of full reservations.
265 *
266 * Since the number of full reservations is computed without acquiring any
267 * locks, the returned value is inexact.
268 */
269static int
270sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS)
271{
272	vm_paddr_t paddr;
273	struct vm_phys_seg *seg;
274	vm_reserv_t rv;
275	int fullpop, segind;
276
277	fullpop = 0;
278	for (segind = 0; segind < vm_phys_nsegs; segind++) {
279		seg = &vm_phys_segs[segind];
280		paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
281#ifdef VM_PHYSSEG_SPARSE
282		rv = seg->first_reserv + (paddr >> VM_LEVEL_0_SHIFT) -
283		    (seg->start >> VM_LEVEL_0_SHIFT);
284#else
285		rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
286#endif
287		while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
288		    VM_LEVEL_0_SIZE <= seg->end) {
289			fullpop += rv->popcnt == VM_LEVEL_0_NPAGES;
290			paddr += VM_LEVEL_0_SIZE;
291			rv++;
292		}
293	}
294	return (sysctl_handle_int(oidp, &fullpop, 0, req));
295}
296
297/*
298 * Describes the current state of the partially populated reservation queue.
299 */
300static int
301sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
302{
303	struct sbuf sbuf;
304	vm_reserv_t rv;
305	int counter, error, domain, level, unused_pages;
306
307	error = sysctl_wire_old_buffer(req, 0);
308	if (error != 0)
309		return (error);
310	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
311	sbuf_printf(&sbuf, "\nDOMAIN    LEVEL     SIZE  NUMBER\n\n");
312	for (domain = 0; domain < vm_ndomains; domain++) {
313		for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) {
314			counter = 0;
315			unused_pages = 0;
316			vm_reserv_domain_lock(domain);
317			TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) {
318				if (rv == &vm_rvd[domain].marker)
319					continue;
320				counter++;
321				unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
322			}
323			vm_reserv_domain_unlock(domain);
324			sbuf_printf(&sbuf, "%6d, %7d, %6dK, %6d\n",
325			    domain, level,
326			    unused_pages * ((int)PAGE_SIZE / 1024), counter);
327		}
328	}
329	error = sbuf_finish(&sbuf);
330	sbuf_delete(&sbuf);
331	return (error);
332}
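
/*
 * The counters and queues above can be inspected from userland; for example
 * (output shape follows the format strings above, the numbers are invented):
 *
 *	$ sysctl vm.reserv.fullpop
 *	vm.reserv.fullpop: 1024
 *	$ sysctl vm.reserv.partpopq
 *	vm.reserv.partpopq:
 *	DOMAIN    LEVEL     SIZE  NUMBER
 *
 *	     0,      -1,   1520K,      37
 */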
333
334/*
335 * Remove a reservation from the object's objq.
336 */
337static void
338vm_reserv_remove(vm_reserv_t rv)
339{
340	vm_object_t object;
341
342	vm_reserv_assert_locked(rv);
343	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
344	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
345	KASSERT(rv->object != NULL,
346	    ("vm_reserv_remove: reserv %p is free", rv));
347	KASSERT(!rv->inpartpopq,
348	    ("vm_reserv_remove: reserv %p's inpartpopq is TRUE", rv));
349	object = rv->object;
350	vm_reserv_object_lock(object);
351	LIST_REMOVE(rv, objq);
352	rv->object = NULL;
353	vm_reserv_object_unlock(object);
354}
355
356/*
357 * Insert a new reservation into the object's objq.
358 */
359static void
360vm_reserv_insert(vm_reserv_t rv, vm_object_t object, vm_pindex_t pindex)
361{
362
363	vm_reserv_assert_locked(rv);
364	CTR6(KTR_VM,
365	    "%s: rv %p(%p) object %p new %p popcnt %d",
366	    __FUNCTION__, rv, rv->pages, rv->object, object,
367	    rv->popcnt);
368	KASSERT(rv->object == NULL,
369	    ("vm_reserv_insert: reserv %p isn't free", rv));
370	KASSERT(rv->popcnt == 0,
371	    ("vm_reserv_insert: reserv %p's popcnt is corrupted", rv));
372	KASSERT(!rv->inpartpopq,
373	    ("vm_reserv_insert: reserv %p's inpartpopq is TRUE", rv));
374	KASSERT(bit_ntest(rv->popmap, 0, VM_LEVEL_0_NPAGES - 1, 0),
375	    ("vm_reserv_insert: reserv %p's popmap is corrupted", rv));
376	vm_reserv_object_lock(object);
377	rv->pindex = pindex;
378	rv->object = object;
379	rv->lasttick = ticks;
380	LIST_INSERT_HEAD(&object->rvq, rv, objq);
381	vm_reserv_object_unlock(object);
382}
383
384/*
385 * Reduces the given reservation's population count.  If the population count
386 * becomes zero, the reservation is destroyed.  Additionally, moves the
387 * reservation to the tail of the partially populated reservation queue if the
388 * population count is non-zero.
389 */
390static void
391vm_reserv_depopulate(vm_reserv_t rv, int index)
392{
393	struct vm_domain *vmd;
394
395	vm_reserv_assert_locked(rv);
396	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
397	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
398	KASSERT(rv->object != NULL,
399	    ("vm_reserv_depopulate: reserv %p is free", rv));
400	KASSERT(bit_test(rv->popmap, index),
401	    ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv,
402	    index));
403	KASSERT(rv->popcnt > 0,
404	    ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
405	KASSERT(rv->domain < vm_ndomains,
406	    ("vm_reserv_depopulate: reserv %p's domain is corrupted %d",
407	    rv, rv->domain));
408	if (rv->popcnt == VM_LEVEL_0_NPAGES) {
409		KASSERT(rv->pages->psind == 1,
410		    ("vm_reserv_depopulate: reserv %p is already demoted",
411		    rv));
412		rv->pages->psind = 0;
413	}
414	bit_clear(rv->popmap, index);
415	rv->popcnt--;
416	if ((unsigned)(ticks - rv->lasttick) >= PARTPOPSLOP ||
417	    rv->popcnt == 0) {
418		vm_reserv_domain_lock(rv->domain);
419		if (rv->inpartpopq) {
420			TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
421			rv->inpartpopq = FALSE;
422		}
423		if (rv->popcnt != 0) {
424			rv->inpartpopq = TRUE;
425			TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv,
426			    partpopq);
427		}
428		vm_reserv_domain_unlock(rv->domain);
429		rv->lasttick = ticks;
430	}
431	vmd = VM_DOMAIN(rv->domain);
432	if (rv->popcnt == 0) {
433		vm_reserv_remove(rv);
434		vm_domain_free_lock(vmd);
435		vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER);
436		vm_domain_free_unlock(vmd);
437		counter_u64_add(vm_reserv_freed, 1);
438	}
439	vm_domain_freecnt_inc(vmd, 1);
440}
441
442/*
443 * Returns the reservation to which the given page might belong.
444 */
445static __inline vm_reserv_t
446vm_reserv_from_page(vm_page_t m)
447{
448#ifdef VM_PHYSSEG_SPARSE
449	struct vm_phys_seg *seg;
450
451	seg = &vm_phys_segs[m->segind];
452	return (seg->first_reserv + (VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT) -
453	    (seg->start >> VM_LEVEL_0_SHIFT));
454#else
455	return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]);
456#endif
457}
458
459/*
460 * Returns an existing reservation or NULL, and initializes *msuccp.
461 */
462static vm_reserv_t
463vm_reserv_from_object(vm_object_t object, vm_pindex_t pindex,
464    vm_page_t mpred, vm_page_t *msuccp)
465{
466	vm_reserv_t rv;
467	vm_page_t msucc;
468
469	msucc = NULL;
470	if (mpred != NULL) {
471		KASSERT(mpred->object == object,
472		    ("vm_reserv_from_object: object doesn't contain mpred"));
473		KASSERT(mpred->pindex < pindex,
474		    ("vm_reserv_from_object: mpred doesn't precede pindex"));
475		rv = vm_reserv_from_page(mpred);
476		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
477			goto found;
478		msucc = TAILQ_NEXT(mpred, listq);
479	} else
480		msucc = TAILQ_FIRST(&object->memq);
481	if (msucc != NULL) {
482		KASSERT(msucc->pindex > pindex,
483		    ("vm_reserv_from_object: msucc doesn't succeed pindex"));
484		rv = vm_reserv_from_page(msucc);
485		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
486			goto found;
487	}
488	rv = NULL;
489
490found:
491	*msuccp = msucc;
492
493	return (rv);
494}
495
496/*
497 * Returns TRUE if the given reservation contains the given page index and
498 * FALSE otherwise.
499 */
500static __inline boolean_t
501vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex)
502{
503
504	return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0);
505}
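
/*
 * Worked example (assuming VM_LEVEL_0_NPAGES == 512): for a reservation with
 * rv->pindex == 1000, the test above is TRUE exactly for pindex 1000 through
 * 1511.  pindex 999 underflows to a huge unsigned difference and pindex 1512
 * leaves a bit above the low nine set, so both yield FALSE.
 */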
506
507/*
508 * Increases the given reservation's population count.  Moves the reservation
509 * to the tail of the partially populated reservation queue.
510 */
511static void
512vm_reserv_populate(vm_reserv_t rv, int index)
513{
514
515	vm_reserv_assert_locked(rv);
516	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
517	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
518	KASSERT(rv->object != NULL,
519	    ("vm_reserv_populate: reserv %p is free", rv));
520	KASSERT(!bit_test(rv->popmap, index),
521	    ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv,
522	    index));
523	KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
524	    ("vm_reserv_populate: reserv %p is already full", rv));
525	KASSERT(rv->pages->psind == 0,
526	    ("vm_reserv_populate: reserv %p is already promoted", rv));
527	KASSERT(rv->domain < vm_ndomains,
528	    ("vm_reserv_populate: reserv %p's domain is corrupted %d",
529	    rv, rv->domain));
530	bit_set(rv->popmap, index);
531	rv->popcnt++;
532	if ((unsigned)(ticks - rv->lasttick) < PARTPOPSLOP &&
533	    rv->inpartpopq && rv->popcnt != VM_LEVEL_0_NPAGES)
534		return;
535	rv->lasttick = ticks;
536	vm_reserv_domain_lock(rv->domain);
537	if (rv->inpartpopq) {
538		TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
539		rv->inpartpopq = FALSE;
540	}
541	if (rv->popcnt < VM_LEVEL_0_NPAGES) {
542		rv->inpartpopq = TRUE;
543		TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv, partpopq);
544	} else {
545		KASSERT(rv->pages->psind == 0,
546		    ("vm_reserv_populate: reserv %p is already promoted",
547		    rv));
548		rv->pages->psind = 1;
549	}
550	vm_reserv_domain_unlock(rv->domain);
551}
552
553/*
554 * Allocates a contiguous set of physical pages of the given size "npages"
555 * from existing or newly created reservations.  All of the physical pages
556 * must be at or above the given physical address "low" and below the given
557 * physical address "high".  The given value "alignment" determines the
558 * alignment of the first physical page in the set.  If the given value
559 * "boundary" is non-zero, then the set of physical pages cannot cross any
560 * physical address boundary that is a multiple of that value.  Both
561 * "alignment" and "boundary" must be a power of two.
562 *
563 * The page "mpred" must immediately precede the offset "pindex" within the
564 * specified object.
565 *
566 * The object must be locked.
567 */
568vm_page_t
569vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, int domain,
570    int req, vm_page_t mpred, u_long npages, vm_paddr_t low, vm_paddr_t high,
571    u_long alignment, vm_paddr_t boundary)
572{
573	struct vm_domain *vmd;
574	vm_paddr_t pa, size;
575	vm_page_t m, m_ret, msucc;
576	vm_pindex_t first, leftcap, rightcap;
577	vm_reserv_t rv;
578	u_long allocpages, maxpages, minpages;
579	int i, index, n;
580
581	VM_OBJECT_ASSERT_WLOCKED(object);
582	KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));
583
584	/*
585	 * Is a reservation fundamentally impossible?
586	 */
587	if (pindex < VM_RESERV_INDEX(object, pindex) ||
588	    pindex + npages > object->size)
589		return (NULL);
590
591	/*
592	 * All reservations of a particular size have the same alignment.
593	 * Assuming that the first page is allocated from a reservation, the
594	 * least significant bits of its physical address can be determined
595	 * from its offset from the beginning of the reservation and the size
596	 * of the reservation.
597	 *
598	 * Could the specified index within a reservation of the smallest
599	 * possible size satisfy the alignment and boundary requirements?
600	 */
601	pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
602	size = npages << PAGE_SHIFT;
603	if (!vm_addr_ok(pa, size, alignment, boundary))
604		return (NULL);
605
606	/*
607	 * Look for an existing reservation.
608	 */
609	rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
610	if (rv != NULL) {
611		KASSERT(object != kernel_object || rv->domain == domain,
612		    ("vm_reserv_alloc_contig: domain mismatch"));
613		index = VM_RESERV_INDEX(object, pindex);
614		/* Does the allocation fit within the reservation? */
615		if (index + npages > VM_LEVEL_0_NPAGES)
616			return (NULL);
617		domain = rv->domain;
618		vmd = VM_DOMAIN(domain);
619		vm_reserv_lock(rv);
620		/* Handle reclaim race. */
621		if (rv->object != object)
622			goto out;
623		m = &rv->pages[index];
624		pa = VM_PAGE_TO_PHYS(m);
625		if (pa < low || pa + size > high ||
626		    !vm_addr_ok(pa, size, alignment, boundary))
627			goto out;
628		/* Handle vm_page_rename(m, new_object, ...). */
629		if (!bit_ntest(rv->popmap, index, index + npages - 1, 0))
630			goto out;
631		if (!vm_domain_allocate(vmd, req, npages))
632			goto out;
633		for (i = 0; i < npages; i++)
634			vm_reserv_populate(rv, index + i);
635		vm_reserv_unlock(rv);
636		return (m);
637out:
638		vm_reserv_unlock(rv);
639		return (NULL);
640	}
641
642	/*
643	 * Could at least one reservation fit between the first index to the
644	 * left that can be used ("leftcap") and the first index to the right
645	 * that cannot be used ("rightcap")?
646	 *
647	 * We must synchronize with the reserv object lock to protect the
648	 * pindex/object of the resulting reservations against rename while
649	 * we are inspecting.
650	 */
651	first = pindex - VM_RESERV_INDEX(object, pindex);
652	minpages = VM_RESERV_INDEX(object, pindex) + npages;
653	maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
654	allocpages = maxpages;
655	vm_reserv_object_lock(object);
656	if (mpred != NULL) {
657		if ((rv = vm_reserv_from_page(mpred))->object != object)
658			leftcap = mpred->pindex + 1;
659		else
660			leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
661		if (leftcap > first) {
662			vm_reserv_object_unlock(object);
663			return (NULL);
664		}
665	}
666	if (msucc != NULL) {
667		if ((rv = vm_reserv_from_page(msucc))->object != object)
668			rightcap = msucc->pindex;
669		else
670			rightcap = rv->pindex;
671		if (first + maxpages > rightcap) {
672			if (maxpages == VM_LEVEL_0_NPAGES) {
673				vm_reserv_object_unlock(object);
674				return (NULL);
675			}
676
677			/*
678			 * At least one reservation will fit between "leftcap"
679			 * and "rightcap".  However, a reservation for the
680			 * last of the requested pages will not fit.  Reduce
681			 * the size of the upcoming allocation accordingly.
682			 */
683			allocpages = minpages;
684		}
685	}
686	vm_reserv_object_unlock(object);
687
688	/*
689	 * Would the last new reservation extend past the end of the object?
690	 *
691	 * If the object is unlikely to grow don't allocate a reservation for
692	 * the tail.
693	 */
694	if ((object->flags & OBJ_ANON) == 0 &&
695	    first + maxpages > object->size) {
696		if (maxpages == VM_LEVEL_0_NPAGES)
697			return (NULL);
698		allocpages = minpages;
699	}
700
701	/*
702	 * Allocate the physical pages.  The alignment and boundary specified
703	 * for this allocation may be different from the alignment and
704	 * boundary specified for the requested pages.  For instance, the
705	 * specified index may not be the first page within the first new
706	 * reservation.
707	 */
708	m = NULL;
709	vmd = VM_DOMAIN(domain);
710	if (vm_domain_allocate(vmd, req, npages)) {
711		vm_domain_free_lock(vmd);
712		m = vm_phys_alloc_contig(domain, allocpages, low, high,
713		    ulmax(alignment, VM_LEVEL_0_SIZE),
714		    boundary > VM_LEVEL_0_SIZE ? boundary : 0);
715		vm_domain_free_unlock(vmd);
716		if (m == NULL) {
717			vm_domain_freecnt_inc(vmd, npages);
718			return (NULL);
719		}
720	} else
721		return (NULL);
722	KASSERT(vm_page_domain(m) == domain,
723	    ("vm_reserv_alloc_contig: Page domain does not match requested."));
724
725	/*
726	 * The allocated physical pages always begin at a reservation
727	 * boundary, but they do not always end at a reservation boundary.
728	 * Initialize every reservation that is completely covered by the
729	 * allocated physical pages.
730	 */
731	m_ret = NULL;
732	index = VM_RESERV_INDEX(object, pindex);
733	do {
734		rv = vm_reserv_from_page(m);
735		KASSERT(rv->pages == m,
736		    ("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
737		    rv));
738		vm_reserv_lock(rv);
739		vm_reserv_insert(rv, object, first);
740		n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
741		for (i = 0; i < n; i++)
742			vm_reserv_populate(rv, index + i);
743		npages -= n;
744		if (m_ret == NULL) {
745			m_ret = &rv->pages[index];
746			index = 0;
747		}
748		vm_reserv_unlock(rv);
749		m += VM_LEVEL_0_NPAGES;
750		first += VM_LEVEL_0_NPAGES;
751		allocpages -= VM_LEVEL_0_NPAGES;
752	} while (allocpages >= VM_LEVEL_0_NPAGES);
753	return (m_ret);
754}
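
/*
 * Hedged usage sketch (not an actual call site; vm_page.c is the real
 * consumer): a caller holding the object's write lock would typically try
 * the reservation layer first and fall back to vm_phys on failure, roughly:
 *
 *	VM_OBJECT_ASSERT_WLOCKED(object);
 *	mpred = vm_radix_lookup_le(&object->rtree, pindex);
 *	m = vm_reserv_alloc_contig(object, pindex, domain, req, mpred,
 *	    npages, low, high, alignment, boundary);
 *	if (m == NULL)
 *		(fall back to vm_phys_alloc_contig() or another domain)
 */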
755
756/*
757 * Allocate a physical page from an existing or newly created reservation.
758 *
759 * The page "mpred" must immediately precede the offset "pindex" within the
760 * specified object.
761 *
762 * The object must be locked.
763 */
764vm_page_t
765vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, int domain,
766    int req, vm_page_t mpred)
767{
768	struct vm_domain *vmd;
769	vm_page_t m, msucc;
770	vm_pindex_t first, leftcap, rightcap;
771	vm_reserv_t rv;
772	int index;
773
774	VM_OBJECT_ASSERT_WLOCKED(object);
775
776	/*
777	 * Is a reservation fundamentally impossible?
778	 */
779	if (pindex < VM_RESERV_INDEX(object, pindex) ||
780	    pindex >= object->size)
781		return (NULL);
782
783	/*
784	 * Look for an existing reservation.
785	 */
786	rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
787	if (rv != NULL) {
788		KASSERT(object != kernel_object || rv->domain == domain,
789		    ("vm_reserv_alloc_page: domain mismatch"));
790		domain = rv->domain;
791		vmd = VM_DOMAIN(domain);
792		index = VM_RESERV_INDEX(object, pindex);
793		m = &rv->pages[index];
794		vm_reserv_lock(rv);
795		/* Handle reclaim race. */
796		if (rv->object != object ||
797		    /* Handle vm_page_rename(m, new_object, ...). */
798		    bit_test(rv->popmap, index)) {
799			m = NULL;
800			goto out;
801		}
802		if (vm_domain_allocate(vmd, req, 1) == 0)
803			m = NULL;
804		else
805			vm_reserv_populate(rv, index);
806out:
807		vm_reserv_unlock(rv);
808		return (m);
809	}
810
811	/*
812	 * Could a reservation fit between the first index to the left that
813	 * can be used and the first index to the right that cannot be used?
814	 *
815	 * We must synchronize with the reserv object lock to protect the
816	 * pindex/object of the resulting reservations against rename while
817	 * we are inspecting.
818	 */
819	first = pindex - VM_RESERV_INDEX(object, pindex);
820	vm_reserv_object_lock(object);
821	if (mpred != NULL) {
822		if ((rv = vm_reserv_from_page(mpred))->object != object)
823			leftcap = mpred->pindex + 1;
824		else
825			leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
826		if (leftcap > first) {
827			vm_reserv_object_unlock(object);
828			return (NULL);
829		}
830	}
831	if (msucc != NULL) {
832		if ((rv = vm_reserv_from_page(msucc))->object != object)
833			rightcap = msucc->pindex;
834		else
835			rightcap = rv->pindex;
836		if (first + VM_LEVEL_0_NPAGES > rightcap) {
837			vm_reserv_object_unlock(object);
838			return (NULL);
839		}
840	}
841	vm_reserv_object_unlock(object);
842
843	/*
844	 * Would the last new reservation extend past the end of the object?
845	 *
846	 * If the object is unlikely to grow don't allocate a reservation for
847	 * the tail.
848	 */
849	if ((object->flags & OBJ_ANON) == 0 &&
850	    first + VM_LEVEL_0_NPAGES > object->size)
851		return (NULL);
852
853	/*
854	 * Allocate and populate the new reservation.
855	 */
856	m = NULL;
857	vmd = VM_DOMAIN(domain);
858	if (vm_domain_allocate(vmd, req, 1)) {
859		vm_domain_free_lock(vmd);
860		m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT,
861		    VM_LEVEL_0_ORDER);
862		vm_domain_free_unlock(vmd);
863		if (m == NULL) {
864			vm_domain_freecnt_inc(vmd, 1);
865			return (NULL);
866		}
867	} else
868		return (NULL);
869	rv = vm_reserv_from_page(m);
870	vm_reserv_lock(rv);
871	KASSERT(rv->pages == m,
872	    ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
873	vm_reserv_insert(rv, object, first);
874	index = VM_RESERV_INDEX(object, pindex);
875	vm_reserv_populate(rv, index);
876	vm_reserv_unlock(rv);
877
878	return (&rv->pages[index]);
879}
880
881/*
882 * Breaks the given reservation.  All free pages in the reservation
883 * are returned to the physical memory allocator.  The reservation's
884 * population count and map are reset to their initial state.
885 *
886 * The given reservation must not be in the partially populated reservation
887 * queue.
888 */
889static void
890vm_reserv_break(vm_reserv_t rv)
891{
892	int hi, lo, pos;
893
894	vm_reserv_assert_locked(rv);
895	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
896	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
897	vm_reserv_remove(rv);
898	rv->pages->psind = 0;
899	hi = lo = -1;
900	pos = 0;
901	for (;;) {
902		bit_ff_at(rv->popmap, pos, VM_LEVEL_0_NPAGES, lo != hi, &pos);
903		if (lo == hi) {
904			if (pos == -1)
905				break;
906			lo = pos;
907			continue;
908		}
909		if (pos == -1)
910			pos = VM_LEVEL_0_NPAGES;
911		hi = pos;
912		vm_domain_free_lock(VM_DOMAIN(rv->domain));
913		vm_phys_enqueue_contig(&rv->pages[lo], hi - lo);
914		vm_domain_free_unlock(VM_DOMAIN(rv->domain));
915		lo = hi;
916	}
917	bit_nclear(rv->popmap, 0, VM_LEVEL_0_NPAGES - 1);
918	rv->popcnt = 0;
919	counter_u64_add(vm_reserv_broken, 1);
920}
921
922/*
923 * Breaks all reservations belonging to the given object.
924 */
925void
926vm_reserv_break_all(vm_object_t object)
927{
928	vm_reserv_t rv;
929
930	/*
931	 * This access of object->rvq is unsynchronized so that the
932	 * object rvq lock can nest after the domain_free lock.  We
933	 * must check for races in the results.  However, the object
934	 * lock prevents new additions, so we are guaranteed that when
935	 * it returns NULL the object is properly empty.
936	 */
937	while ((rv = LIST_FIRST(&object->rvq)) != NULL) {
938		vm_reserv_lock(rv);
939		/* Reclaim race. */
940		if (rv->object != object) {
941			vm_reserv_unlock(rv);
942			continue;
943		}
944		vm_reserv_domain_lock(rv->domain);
945		if (rv->inpartpopq) {
946			TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
947			rv->inpartpopq = FALSE;
948		}
949		vm_reserv_domain_unlock(rv->domain);
950		vm_reserv_break(rv);
951		vm_reserv_unlock(rv);
952	}
953}
954
955/*
956 * Frees the given page if it belongs to a reservation.  Returns TRUE if the
957 * page is freed and FALSE otherwise.
958 */
959boolean_t
960vm_reserv_free_page(vm_page_t m)
961{
962	vm_reserv_t rv;
963	boolean_t ret;
964
965	rv = vm_reserv_from_page(m);
966	if (rv->object == NULL)
967		return (FALSE);
968	vm_reserv_lock(rv);
969	/* Re-validate after lock. */
970	if (rv->object != NULL) {
971		vm_reserv_depopulate(rv, m - rv->pages);
972		ret = TRUE;
973	} else
974		ret = FALSE;
975	vm_reserv_unlock(rv);
976
977	return (ret);
978}
979
980/*
981 * Initializes the reservation management system.  Specifically, initializes
982 * the reservation array.
983 *
984 * Requires that vm_page_array and first_page are initialized!
985 */
986void
987vm_reserv_init(void)
988{
989	vm_paddr_t paddr;
990	struct vm_phys_seg *seg;
991	struct vm_reserv *rv;
992	struct vm_reserv_domain *rvd;
993#ifdef VM_PHYSSEG_SPARSE
994	vm_pindex_t used;
995#endif
996	int i, segind;
997
998	/*
999	 * Initialize the reservation array.  Specifically, initialize the
1000	 * "pages" field for every element that has an underlying superpage.
1001	 */
1002#ifdef VM_PHYSSEG_SPARSE
1003	used = 0;
1004#endif
1005	for (segind = 0; segind < vm_phys_nsegs; segind++) {
1006		seg = &vm_phys_segs[segind];
1007#ifdef VM_PHYSSEG_SPARSE
1008		seg->first_reserv = &vm_reserv_array[used];
1009		used += howmany(seg->end, VM_LEVEL_0_SIZE) -
1010		    seg->start / VM_LEVEL_0_SIZE;
1011#else
1012		seg->first_reserv =
1013		    &vm_reserv_array[seg->start >> VM_LEVEL_0_SHIFT];
1014#endif
1015		paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
1016		rv = seg->first_reserv + (paddr >> VM_LEVEL_0_SHIFT) -
1017		    (seg->start >> VM_LEVEL_0_SHIFT);
1018		while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
1019		    VM_LEVEL_0_SIZE <= seg->end) {
1020			rv->pages = PHYS_TO_VM_PAGE(paddr);
1021			rv->domain = seg->domain;
1022			mtx_init(&rv->lock, "vm reserv", NULL, MTX_DEF);
1023			paddr += VM_LEVEL_0_SIZE;
1024			rv++;
1025		}
1026	}
1027	for (i = 0; i < MAXMEMDOM; i++) {
1028		rvd = &vm_rvd[i];
1029		mtx_init(&rvd->lock, "vm reserv domain", NULL, MTX_DEF);
1030		TAILQ_INIT(&rvd->partpop);
1031		mtx_init(&rvd->marker.lock, "vm reserv marker", NULL, MTX_DEF);
1032
1033		/*
1034		 * Fully populated reservations should never be present in the
1035		 * partially populated reservation queues.
1036		 */
1037		rvd->marker.popcnt = VM_LEVEL_0_NPAGES;
1038		bit_nset(rvd->marker.popmap, 0, VM_LEVEL_0_NPAGES - 1);
1039	}
1040
1041	for (i = 0; i < VM_RESERV_OBJ_LOCK_COUNT; i++)
1042		mtx_init(&vm_reserv_object_mtx[i], "resv obj lock", NULL,
1043		    MTX_DEF);
1044}
1045
1046/*
1047 * Returns true if the given page belongs to a reservation and that page is
1048 * free.  Otherwise, returns false.
1049 */
1050bool
1051vm_reserv_is_page_free(vm_page_t m)
1052{
1053	vm_reserv_t rv;
1054
1055	rv = vm_reserv_from_page(m);
1056	if (rv->object == NULL)
1057		return (false);
1058	return (!bit_test(rv->popmap, m - rv->pages));
1059}
1060
1061/*
1062 * Returns true if the given page is part of a block of npages, starting at a
1063 * multiple of npages, that are all allocated.  Otherwise, returns false.
1064 */
1065bool
1066vm_reserv_is_populated(vm_page_t m, int npages)
1067{
1068	vm_reserv_t rv;
1069	int index;
1070
1071	KASSERT(npages <= VM_LEVEL_0_NPAGES,
1072	    ("%s: npages %d exceeds VM_LEVEL_0_NPAGES", __func__, npages));
1073	KASSERT(powerof2(npages),
1074	    ("%s: npages %d is not a power of 2", __func__, npages));
1075	rv = vm_reserv_from_page(m);
1076	if (rv->object == NULL)
1077		return (false);
1078	index = rounddown2(m - rv->pages, npages);
1079	return (bit_ntest(rv->popmap, index, index + npages - 1, 1));
1080}
1081
1082/*
1083 * If the given page belongs to a reservation, returns the level of that
1084 * reservation.  Otherwise, returns -1.
1085 */
1086int
1087vm_reserv_level(vm_page_t m)
1088{
1089	vm_reserv_t rv;
1090
1091	rv = vm_reserv_from_page(m);
1092	return (rv->object != NULL ? 0 : -1);
1093}
1094
1095/*
1096 * Returns a reservation level if the given page belongs to a fully populated
1097 * reservation and -1 otherwise.
1098 */
1099int
1100vm_reserv_level_iffullpop(vm_page_t m)
1101{
1102	vm_reserv_t rv;
1103
1104	rv = vm_reserv_from_page(m);
1105	return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
1106}
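
/*
 * Hedged usage sketch: the machine-dependent pmap layers use this to decide
 * whether a superpage mapping is worth attempting, along the lines of
 *
 *	if (vm_reserv_level_iffullpop(m) == 0)
 *		(the small page sits in a fully populated reservation,
 *		 so try to promote or pre-map the entire superpage)
 */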
1107
1108/*
1109 * Remove a partially populated reservation from the queue.
1110 */
1111static void
1112vm_reserv_dequeue(vm_reserv_t rv)
1113{
1114
1115	vm_reserv_domain_assert_locked(rv->domain);
1116	vm_reserv_assert_locked(rv);
1117	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
1118	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
1119	KASSERT(rv->inpartpopq,
1120	    ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));
1121
1122	TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
1123	rv->inpartpopq = FALSE;
1124}
1125
1126/*
1127 * Breaks the given partially populated reservation, releasing its free pages
1128 * to the physical memory allocator.
1129 */
1130static void
1131vm_reserv_reclaim(vm_reserv_t rv)
1132{
1133
1134	vm_reserv_assert_locked(rv);
1135	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
1136	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
1137	if (rv->inpartpopq) {
1138		vm_reserv_domain_lock(rv->domain);
1139		vm_reserv_dequeue(rv);
1140		vm_reserv_domain_unlock(rv->domain);
1141	}
1142	vm_reserv_break(rv);
1143	counter_u64_add(vm_reserv_reclaimed, 1);
1144}
1145
1146/*
1147 * Breaks a reservation near the head of the partially populated reservation
1148 * queue, releasing its free pages to the physical memory allocator.  Returns
1149 * TRUE if a reservation is broken and FALSE otherwise.
1150 */
1151bool
1152vm_reserv_reclaim_inactive(int domain)
1153{
1154	vm_reserv_t rv;
1155
1156	vm_reserv_domain_lock(domain);
1157	TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) {
1158		/*
1159		 * A locked reservation is likely being updated or reclaimed,
1160		 * so just skip ahead.
1161		 */
1162		if (rv != &vm_rvd[domain].marker && vm_reserv_trylock(rv)) {
1163			vm_reserv_dequeue(rv);
1164			break;
1165		}
1166	}
1167	vm_reserv_domain_unlock(domain);
1168	if (rv != NULL) {
1169		vm_reserv_reclaim(rv);
1170		vm_reserv_unlock(rv);
1171		return (true);
1172	}
1173	return (false);
1174}
1175
1176/*
1177 * Determine whether this reservation has free pages that satisfy the given
1178 * request for contiguous physical memory.  Start searching from the lower
1179 * bound, defined by lo, and stop at the upper bound, hi.  Return the index
1180 * of the first satisfactory free page, or -1 if none is found.
1181 */
1182static int
1183vm_reserv_find_contig(vm_reserv_t rv, int npages, int lo,
1184    int hi, int ppn_align, int ppn_bound)
1185{
1186
1187	vm_reserv_assert_locked(rv);
1188	KASSERT(npages <= VM_LEVEL_0_NPAGES - 1,
1189	    ("%s: Too many pages", __func__));
1190	KASSERT(ppn_bound <= VM_LEVEL_0_NPAGES,
1191	    ("%s: Too big a boundary for reservation size", __func__));
1192	KASSERT(npages <= ppn_bound,
1193	    ("%s: Too many pages for given boundary", __func__));
1194	KASSERT(ppn_align != 0 && powerof2(ppn_align),
1195	    ("ppn_align is not a positive power of 2"));
1196	KASSERT(ppn_bound != 0 && powerof2(ppn_bound),
1197	    ("ppn_bound is not a positive power of 2"));
1198	while (bit_ffc_area_at(rv->popmap, lo, hi, npages, &lo), lo != -1) {
1199		if (lo < roundup2(lo, ppn_align)) {
1200			/* Skip to next aligned page. */
1201			lo = roundup2(lo, ppn_align);
1202		} else if (roundup2(lo + 1, ppn_bound) >= lo + npages)
1203			return (lo);
1204		if (roundup2(lo + 1, ppn_bound) < lo + npages) {
1205			/* Skip to next boundary-matching page. */
1206			lo = roundup2(lo + 1, ppn_bound);
1207		}
1208	}
1209	return (-1);
1210}
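
/*
 * Worked example (illustrative numbers): with npages = 32, ppn_align = 16 and
 * ppn_bound = 64, a free run found at lo = 20 is not 16-page aligned, so the
 * search skips ahead to 32; a run at lo = 48 is aligned but would cross the
 * 64-page boundary (roundup2(49, 64) == 64 < 48 + 32), so the search skips
 * ahead to 64.
 */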
1211
1212/*
1213 * Searches the partially populated reservation queue for the least recently
1214 * changed reservation with free pages that satisfy the given request for
1215 * contiguous physical memory.  If a satisfactory reservation is found, it is
1216 * broken.  Returns a page if a reservation is broken and NULL otherwise.
1217 */
1218vm_page_t
1219vm_reserv_reclaim_contig(int domain, u_long npages, vm_paddr_t low,
1220    vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
1221{
1222	struct vm_reserv_queue *queue;
1223	vm_paddr_t pa, size;
1224	vm_page_t m_ret;
1225	vm_reserv_t marker, rv, rvn;
1226	int hi, lo, posn, ppn_align, ppn_bound;
1227
1228	KASSERT(npages > 0, ("npages is 0"));
1229	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1230	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1231	if (npages > VM_LEVEL_0_NPAGES - 1)
1232		return (NULL);
1233	size = npages << PAGE_SHIFT;
1234	/*
1235	 * Ensure that a free range starting at a boundary-multiple
1236	 * doesn't include a boundary-multiple within it.  Otherwise,
1237	 * no boundary-constrained allocation is possible.
1238	 */
1239	if (!vm_addr_bound_ok(0, size, boundary))
1240		return (NULL);
1241	marker = &vm_rvd[domain].marker;
1242	queue = &vm_rvd[domain].partpop;
1243	/*
1244	 * Compute shifted alignment, boundary values for page-based
1245	 * calculations.  Constrain to range [1, VM_LEVEL_0_NPAGES] to
1246	 * avoid overflow.
1247	 */
1248	ppn_align = (int)(ulmin(ulmax(PAGE_SIZE, alignment),
1249	    VM_LEVEL_0_SIZE) >> PAGE_SHIFT);
1250	ppn_bound = boundary == 0 ? VM_LEVEL_0_NPAGES :
1251	    (int)(MIN(MAX(PAGE_SIZE, boundary),
1252	    VM_LEVEL_0_SIZE) >> PAGE_SHIFT);
1253
1254	vm_reserv_domain_scan_lock(domain);
1255	vm_reserv_domain_lock(domain);
1256	TAILQ_FOREACH_SAFE(rv, queue, partpopq, rvn) {
1257		pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
1258		if (pa + VM_LEVEL_0_SIZE - size < low) {
1259			/* This entire reservation is too low; go to next. */
1260			continue;
1261		}
1262		if (pa + size > high) {
1263			/* This entire reservation is too high; go to next. */
1264			continue;
1265		}
1266		if (!vm_addr_align_ok(pa, alignment)) {
1267			/* This entire reservation is unaligned; go to next. */
1268			continue;
1269		}
1270
1271		if (vm_reserv_trylock(rv) == 0) {
1272			TAILQ_INSERT_AFTER(queue, rv, marker, partpopq);
1273			vm_reserv_domain_unlock(domain);
1274			vm_reserv_lock(rv);
1275			if (TAILQ_PREV(marker, vm_reserv_queue, partpopq) !=
1276			    rv) {
1277				vm_reserv_unlock(rv);
1278				vm_reserv_domain_lock(domain);
1279				rvn = TAILQ_NEXT(marker, partpopq);
1280				TAILQ_REMOVE(queue, marker, partpopq);
1281				continue;
1282			}
1283			vm_reserv_domain_lock(domain);
1284			TAILQ_REMOVE(queue, marker, partpopq);
1285		}
1286		vm_reserv_domain_unlock(domain);
1287		lo = (pa >= low) ? 0 :
1288		    (int)((low + PAGE_MASK - pa) >> PAGE_SHIFT);
1289		hi = (pa + VM_LEVEL_0_SIZE <= high) ? VM_LEVEL_0_NPAGES :
1290		    (int)((high - pa) >> PAGE_SHIFT);
1291		posn = vm_reserv_find_contig(rv, (int)npages, lo, hi,
1292		    ppn_align, ppn_bound);
1293		if (posn >= 0) {
1294			vm_reserv_domain_scan_unlock(domain);
1295			/* Allocate requested space */
1296			rv->popcnt += npages;
1297			bit_nset(rv->popmap, posn, posn + npages - 1);
1298			vm_reserv_reclaim(rv);
1299			vm_reserv_unlock(rv);
1300			m_ret = &rv->pages[posn];
1301			pa = VM_PAGE_TO_PHYS(m_ret);
1302			KASSERT(vm_addr_ok(pa, size, alignment, boundary),
1303			    ("%s: adjusted address not aligned/bounded to "
1304			     "%lx/%jx",
1305			     __func__, alignment, (uintmax_t)boundary));
1306			return (m_ret);
1307		}
1308		vm_reserv_domain_lock(domain);
1309		rvn = TAILQ_NEXT(rv, partpopq);
1310		vm_reserv_unlock(rv);
1311	}
1312	vm_reserv_domain_unlock(domain);
1313	vm_reserv_domain_scan_unlock(domain);
1314	return (NULL);
1315}
1316
1317/*
1318 * Transfers the reservation underlying the given page to a new object.
1319 *
1320 * The object must be locked.
1321 */
1322void
1323vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object,
1324    vm_pindex_t old_object_offset)
1325{
1326	vm_reserv_t rv;
1327
1328	VM_OBJECT_ASSERT_WLOCKED(new_object);
1329	rv = vm_reserv_from_page(m);
1330	if (rv->object == old_object) {
1331		vm_reserv_lock(rv);
1332		CTR6(KTR_VM,
1333		    "%s: rv %p object %p new %p popcnt %d inpartpop %d",
1334		    __FUNCTION__, rv, rv->object, new_object, rv->popcnt,
1335		    rv->inpartpopq);
1336		if (rv->object == old_object) {
1337			vm_reserv_object_lock(old_object);
1338			rv->object = NULL;
1339			LIST_REMOVE(rv, objq);
1340			vm_reserv_object_unlock(old_object);
1341			vm_reserv_object_lock(new_object);
1342			rv->object = new_object;
1343			rv->pindex -= old_object_offset;
1344			LIST_INSERT_HEAD(&new_object->rvq, rv, objq);
1345			vm_reserv_object_unlock(new_object);
1346		}
1347		vm_reserv_unlock(rv);
1348	}
1349}
1350
1351/*
1352 * Returns the size (in bytes) of a reservation of the specified level.
1353 */
1354int
1355vm_reserv_size(int level)
1356{
1357
1358	switch (level) {
1359	case 0:
1360		return (VM_LEVEL_0_SIZE);
1361	case -1:
1362		return (PAGE_SIZE);
1363	default:
1364		return (0);
1365	}
1366}
1367
1368/*
1369 * Allocates the virtual and physical memory required by the reservation
1370 * management system's data structures, in particular, the reservation array.
1371 */
1372vm_paddr_t
1373vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end)
1374{
1375	vm_paddr_t new_end;
1376	vm_pindex_t count;
1377	size_t size;
1378	int i;
1379
1380	count = 0;
1381	for (i = 0; i < vm_phys_nsegs; i++) {
1382#ifdef VM_PHYSSEG_SPARSE
1383		count += howmany(vm_phys_segs[i].end, VM_LEVEL_0_SIZE) -
1384		    vm_phys_segs[i].start / VM_LEVEL_0_SIZE;
1385#else
1386		count = MAX(count,
1387		    howmany(vm_phys_segs[i].end, VM_LEVEL_0_SIZE));
1388#endif
1389	}
1390
1391	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1392#ifdef VM_PHYSSEG_SPARSE
1393		count += howmany(phys_avail[i + 1], VM_LEVEL_0_SIZE) -
1394		    phys_avail[i] / VM_LEVEL_0_SIZE;
1395#else
1396		count = MAX(count,
1397		    howmany(phys_avail[i + 1], VM_LEVEL_0_SIZE));
1398#endif
1399	}
1400
1401	/*
1402	 * Calculate the size (in bytes) of the reservation array.  Rounding up
1403	 * for partial superpages at boundaries, as every small page is mapped
1404	 * to an element in the reservation array based on its physical address.
1405	 * Thus, the number of elements in the reservation array can be greater
1406	 * than the number of superpages.
1407	 */
1408	size = count * sizeof(struct vm_reserv);
1409
1410	/*
1411	 * Allocate and map the physical memory for the reservation array.  The
1412	 * next available virtual address is returned by reference.
1413	 */
1414	new_end = end - round_page(size);
1415	vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end,
1416	    VM_PROT_READ | VM_PROT_WRITE);
1417	bzero(vm_reserv_array, size);
1418
1419	/*
1420	 * Return the next available physical address.
1421	 */
1422	return (new_end);
1423}
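
/*
 * Sizing example (illustrative): with 16GB of physically contiguous RAM and
 * 2MB level 0 reservations, "count" is 16GB / 2MB = 8192, so the array
 * occupies 8192 * sizeof(struct vm_reserv) bytes, rounded up to a page.
 */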
1424
1425/*
1426 * Returns the superpage containing the given page.
1427 */
1428vm_page_t
1429vm_reserv_to_superpage(vm_page_t m)
1430{
1431	vm_reserv_t rv;
1432
1433	VM_OBJECT_ASSERT_LOCKED(m->object);
1434	rv = vm_reserv_from_page(m);
1435	if (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES)
1436		m = rv->pages;
1437	else
1438		m = NULL;
1439
1440	return (m);
1441}
1442
1443#endif	/* VM_NRESERVLEVEL > 0 */
1444