/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 *	Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>

_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");
_Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
    "vm_paddr_t too big for ffsll, flsll.");

#ifdef NUMA
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;

static int numa_disabled;
static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "NUMA options");
SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &numa_disabled, 0, "NUMA-awareness in the allocators is disabled");
#endif

int __read_mostly vm_ndomains = 1;
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;
static struct vm_phys_seg vm_phys_early_segs[8];
static int vm_phys_early_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

static int __read_mostly vm_nfreelists;

/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
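
/*
 * Illustrative sketch only (the addresses below are made up, not taken from
 * any particular machine): a system with two usable RAM ranges might end up
 * with
 *
 *	phys_avail[0] = 0x0000000000001000;  phys_avail[1] = 0x000000000009f000;
 *	phys_avail[2] = 0x0000000000100000;  phys_avail[3] = 0x00000000bfe00000;
 *	phys_avail[4] = 0;                   phys_avail[5] = 0;
 *
 * that is, [start, end) pairs at even/odd index pairs, terminated by a pair
 * of zero entries, exactly as described above.
 */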

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int tail);

/*
 * Red-black tree helpers for vm fictitious range management.
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

	KASSERT(range->start != 0 && range->end != 0,
	    ("Invalid range passed on search for vm_fictitious page"));
	if (p->start >= range->end)
		return (1);
	if (p->start < range->start)
		return (-1);

	return (0);
}

static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

	/* Check if this is a search for a page */
	if (p1->end == 0)
		return (vm_phys_fictitious_in_range(p1, p2));

	KASSERT(p2->end != 0,
    ("Invalid range passed as second parameter to vm fictitious comparison"));

	/* Searching to add a new range */
	if (p1->end <= p2->start)
		return (-1);
	if (p1->start >= p2->end)
		return (1);

	panic("Trying to add overlapping vm fictitious ranges:\n"
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}
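
/*
 * A note on lookups, derived from the comparison routine above: a key with
 * end == 0 is treated as a single-address probe rather than a range, so a
 * point query against the tree is performed by filling in only the start
 * field, e.g.
 *
 *	tmp.start = pa;
 *	tmp.end = 0;
 *	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
 *
 * which is how vm_phys_fictitious_to_vm_page() below searches for the
 * segment containing a given physical address.
 */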

int
vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used,
    vm_paddr_t high __numa_used)
{
#ifdef NUMA
	domainset_t mask;
	int i;

	if (vm_ndomains == 1 || mem_affinity == NULL)
		return (0);

	DOMAINSET_ZERO(&mask);
	/*
	 * Check for any memory that overlaps low, high.
	 */
	for (i = 0; mem_affinity[i].end != 0; i++)
		if (mem_affinity[i].start <= high &&
		    mem_affinity[i].end >= low)
			DOMAINSET_SET(mem_affinity[i].domain, &mask);
	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
		return (prefer);
	if (DOMAINSET_EMPTY(&mask))
		panic("vm_phys_domain_match:  Impossible constraint");
	return (DOMAINSET_FFS(&mask) - 1);
#else
	return (0);
#endif
}

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int dom, error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
	for (dom = 0; dom < vm_ndomains; dom++) {
		sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
			sbuf_printf(&sbuf, "\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "-- --      ");
			sbuf_printf(&sbuf, "--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
					sbuf_printf(&sbuf, "  |  %6d",
					    fl[oind].lcnt);
				}
				sbuf_printf(&sbuf, "\n");
			}
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f __numa_used, int t __numa_used)
{

#ifdef NUMA
	if (mem_locality == NULL)
		return (-1);
	if (f >= vm_ndomains || t >= vm_ndomains)
		return (-1);
	return (mem_locality[f * vm_ndomains + t]);
#else
	return (-1);
#endif
}

#ifdef NUMA
/*
 * Outputs the VM locality table.
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int error, i, j;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);

	sbuf_printf(&sbuf, "\n");

	for (i = 0; i < vm_ndomains; i++) {
		sbuf_printf(&sbuf, "%d: ", i);
		for (j = 0; j < vm_ndomains; j++) {
			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
		}
		sbuf_printf(&sbuf, "\n");
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif

static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{

	m->order = order;
	if (tail)
		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
	else
		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
	fl[order].lcnt++;
}

static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

	TAILQ_REMOVE(&fl[order].pl, m, listq);
	fl[order].lcnt--;
	m->order = VM_NFREEORDER;
}

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
	struct vm_phys_seg *seg;

	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_create_seg: invalid domain provided"));
	seg = &vm_phys_segs[vm_phys_nsegs++];
	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
		*seg = *(seg - 1);
		seg--;
	}
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
#ifdef NUMA
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
#else
	_vm_phys_create_seg(start, end, 0);
#endif
}

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t paddr;

	KASSERT((start & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: start is not page aligned"));
	KASSERT((end & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: end is not page aligned"));

	/*
	 * Split the physical memory segment if it spans two or more free
	 * list boundaries.
	 */
	paddr = start;
#ifdef	VM_FREELIST_LOWMEM
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
		paddr = VM_LOWMEM_BOUNDARY;
	}
#endif
#ifdef	VM_FREELIST_DMA32
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
		paddr = VM_DMA32_BOUNDARY;
	}
#endif
	vm_phys_create_seg(paddr, end);
}
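
/*
 * Example of the splitting above (boundary values are platform-dependent and
 * shown here only for illustration): with VM_LOWMEM_BOUNDARY at 16MB and
 * VM_DMA32_BOUNDARY at 4GB, vm_phys_add_seg(2MB, 5GB) creates three segments,
 * [2MB, 16MB), [16MB, 4GB), and [4GB, 5GB), so that no segment straddles a
 * free list boundary.
 */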

/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
#if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
	u_long npages;
#endif
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
#ifdef	VM_DMA32_NPAGES_THRESHOLD
	npages = 0;
#endif
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef	VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef	VM_FREELIST_DMA32
		if (
#ifdef	VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
#ifdef	VM_DMA32_NPAGES_THRESHOLD
			npages += atop(seg->end - seg->start);
#endif
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;
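
	/*
	 * Worked example (with made-up constants, for illustration only):
	 * suppose VM_NFREELIST is 3, the constants are DEFAULT = 0, DMA32 = 1,
	 * and LOWMEM = 2, and the loop above flagged the DEFAULT and LOWMEM
	 * lists but not DMA32.  The array then goes from the flags {1, 0, 1}
	 * through the running totals {1, 1, 2} to the final indices
	 * {0, 0, 1}: vm_nfreelists is 2, DEFAULT maps to flind 0, LOWMEM to
	 * flind 1, and the never-created DMA32 entry aliases flind 0.
	 */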

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef	VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef	VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Coalesce physical memory segments that are contiguous and share the
	 * same per-domain free queues.
	 */
	prev_seg = vm_phys_segs;
	seg = &vm_phys_segs[1];
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	while (seg < end_seg) {
		if (prev_seg->end == seg->start &&
		    prev_seg->free_queues == seg->free_queues) {
			prev_seg->end = seg->end;
			KASSERT(prev_seg->domain == seg->domain,
			    ("vm_phys_init: free queues cannot span domains"));
			vm_phys_nsegs--;
			end_seg--;
			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
				*tmp_seg = *(tmp_seg + 1);
		} else {
			prev_seg = seg;
			seg++;
		}
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Register info about the NUMA topology of the system.
 *
 * Invoked by platform-dependent code prior to vm_phys_init().
 */
void
vm_phys_register_domains(int ndomains __numa_used,
    struct mem_affinity *affinity __numa_used, int *locality __numa_used)
{
#ifdef NUMA
	int i;

	/*
	 * For now the only override value that we support is 1, which
	 * effectively disables NUMA-awareness in the allocators.
	 */
	TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled);
	if (numa_disabled)
		ndomains = 1;

	if (ndomains > 1) {
		vm_ndomains = ndomains;
		mem_affinity = affinity;
		mem_locality = locality;
	}

	for (i = 0; i < vm_ndomains; i++)
		DOMAINSET_SET(i, &all_domains);
#endif
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the order [order, oind) queues
 * are known to be empty.  The objective being to reduce the likelihood of
 * long-term fragmentation by promoting contemporaneous allocation and
 * (hopefully) deallocation.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
    int tail)
{
	vm_page_t m_buddy;

	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		vm_freelist_add(fl, m_buddy, oind, tail);
	}
}
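
/*
 * Example: splitting an order-4 block (16 pages) down to an order-1 request
 * returns the buddies at page offsets 8 (order 3), 4 (order 2), and 2
 * (order 1) to the free list, leaving pages [0, 2) of the block for the
 * caller.
 */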

/*
 * Add the physical pages [m, m + npages) at the beginning of a power-of-two
 * aligned and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective being to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * The physical page m's buddy must not be free.
 */
static void
vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    (VM_PAGE_TO_PHYS(m) &
	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
	    ("%s: page %p and npages %u are misaligned",
	    __func__, m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("%s: page %p has unexpected order %d",
		    __func__, m, m->order));
		order = ilog2(npages);
		KASSERT(order < VM_NFREEORDER,
		    ("%s: order %d is out of range", __func__, order));
		vm_freelist_add(fl, m, order, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
}

/*
 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
 * and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective being to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * If npages is zero, this function does nothing and ignores the physical page
 * parameter m.  Otherwise, the physical page m's buddy must not be free.
 */
static vm_page_t
vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
{
	int order;

	KASSERT(npages == 0 ||
	    ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
	    m, npages));
	while (npages > 0) {
		KASSERT(m->order == VM_NFREEORDER,
		    ("vm_phys_enq_range: page %p has unexpected order %d",
		    m, m->order));
		order = ffs(npages) - 1;
		KASSERT(order < VM_NFREEORDER,
		    ("vm_phys_enq_range: order %d is out of range", order));
		vm_freelist_add(fl, m, order, tail);
		m += 1 << order;
		npages -= 1 << order;
	}
	return (m);
}
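
/*
 * Example contrasting the two functions above: for npages = 6,
 * vm_phys_enq_beg() frees from the front in decreasing order (an order-2
 * block, then an order-1 block), while vm_phys_enq_range() frees in
 * increasing order (an order-1 block, then an order-2 block), so that the
 * range ends on the power-of-two boundary that its alignment assertion
 * requires.
 */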

/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 */
static void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
	vm_page_t m_tmp;

	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
		m_tmp->pool = pool;
}

/*
 * Tries to allocate the specified number of pages from the specified pool
 * within the specified domain.  Returns the actual number of allocated pages
 * and a pointer to each page through the array ma[].
 *
 * The returned pages may not be physically contiguous.  However, in contrast
 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
 * calling this function once to allocate the desired number of pages will
 * avoid wasted time in vm_phys_split_pages().
 *
 * The free page queues for the specified domain must be locked.
 */
int
vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int avail, end, flind, freelist, i, oind, pind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	i = 0;
	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		fl = vm_phys_free_queues[domain][flind][pool];
		for (oind = 0; oind < VM_NFREEORDER; oind++) {
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				vm_freelist_rem(fl, m, oind);
				avail = i + (1 << oind);
				end = imin(npages, avail);
				while (i < end)
					ma[i++] = m++;
				if (i == npages) {
					/*
					 * Return excess pages to fl.  Its order
					 * [0, oind) queues are empty.
					 */
					vm_phys_enq_range(m, avail - i, fl, 1);
					return (npages);
				}
			}
		}
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				alt = vm_phys_free_queues[domain][flind][pind];
				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
				    NULL) {
					vm_freelist_rem(alt, m, oind);
					vm_phys_set_pool(pool, m, oind);
					avail = i + (1 << oind);
					end = imin(npages, avail);
					while (i < end)
						ma[i++] = m++;
					if (i == npages) {
						/*
						 * Return excess pages to fl.
						 * Its order [0, oind) queues
						 * are empty.
						 */
						vm_phys_enq_range(m, avail - i,
						    fl, 1);
						return (npages);
					}
				}
			}
		}
	}
	return (i);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int domain, int pool, int order)
{
	vm_page_t m;
	int freelist;

	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified free list.  The free list must be specified using one of the
 * manifest constants VM_FREELIST_*.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int oind, pind, flind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
	    domain));
	KASSERT(freelist < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
	    freelist));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

	flind = vm_freelist_to_flind[freelist];
	/* Check if freelist is present */
	if (flind < 0)
		return (NULL);

	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	fl = &vm_phys_free_queues[domain][flind][pool][0];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			vm_freelist_rem(fl, m, oind);
			/* The order [order, oind) queues are empty. */
			vm_phys_split_pages(m, oind, fl, order, 1);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			alt = &vm_phys_free_queues[domain][flind][pind][0];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				vm_freelist_rem(alt, m, oind);
				vm_phys_set_pool(pool, m, oind);
				/* The order [order, oind) queues are empty. */
				vm_phys_split_pages(m, oind, fl, order, 1);
				return (m);
			}
		}
	}
	return (NULL);
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;

	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
		return (&seg->first_page[atop(pa - seg->start)]);
	return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_fictitious_seg tmp, *seg;
	vm_page_t m;

	m = NULL;
	tmp.start = pa;
	tmp.end = 0;

	rw_rlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	rw_runlock(&vm_phys_fictitious_reg_lock);
	if (seg == NULL)
		return (NULL);

	m = &seg->first_page[atop(pa - seg->start)];
	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));

	return (m);
}

static inline void
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
    long page_count, vm_memattr_t memattr)
{
	long i;

	bzero(range, page_count * sizeof(*range));
	for (i = 0; i < page_count; i++) {
		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
		range[i].oflags &= ~VPO_UNMANAGED;
		range[i].busy_lock = VPB_UNBUSIED;
	}
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t fp;
	long page_count;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
	long dpage_count;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

	page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		fp = &vm_page_array[pi - first_page];
		if ((pe - first_page) > vm_page_array_size) {
			/*
			 * We have a segment that starts inside
			 * of vm_page_array, but ends outside of it.
			 *
			 * Use vm_page_array pages for those that are
			 * inside of the vm_page_array range, and
			 * allocate the remaining ones.
			 */
			dpage_count = vm_page_array_size - (pi - first_page);
			vm_phys_fictitious_init_range(fp, start, dpage_count,
			    memattr);
			page_count -= dpage_count;
			start += ptoa(dpage_count);
			goto alloc;
		}
		/*
		 * We can allocate the full range from vm_page_array,
		 * so there's no need to register the range in the tree.
		 */
		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
		return (0);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		fp = &vm_page_array[0];
		dpage_count = pe - first_page;
		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
		    memattr);
		end -= ptoa(dpage_count);
		page_count -= dpage_count;
		goto alloc;
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/*
		 * Trying to register a fictitious range that expands before
		 * and after vm_page_array.
		 */
		return (EINVAL);
	} else {
alloc:
#endif
		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
		    M_WAITOK);
#ifdef VM_PHYSSEG_DENSE
	}
#endif
	vm_phys_fictitious_init_range(fp, start, page_count, memattr);

	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
	seg->start = start;
	seg->end = end;
	seg->first_page = fp;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);

	return (0);
}

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		if ((pe - first_page) <= vm_page_array_size) {
			/*
			 * This segment was allocated using vm_page_array
			 * only, there's nothing to do since those pages
			 * were never added to the tree.
			 */
			return;
		}
		/*
		 * We have a segment that starts inside
		 * of vm_page_array, but ends outside of it.
		 *
		 * Calculate how many pages were added to the
		 * tree and free them.
		 */
		start = ptoa(first_page + vm_page_array_size);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		end = ptoa(first_page);
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/* Since it's not possible to register such a range, panic. */
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
#endif
	tmp.start = start;
	tmp.end = 0;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	if (seg->start != start || seg->end != end) {
		rw_wunlock(&vm_phys_fictitious_reg_lock);
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);
	free(seg->first_page, M_FICT_PAGES);
	free(seg, M_FICT_PAGES);
}

/*
 * Free a contiguous, power of two-sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_free_pages: page %p has unexpected order %d",
	    m, m->order));
	KASSERT(m->pool < VM_NFREEPOOL,
	    ("vm_phys_free_pages: page %p has unexpected pool %d",
	    m, m->pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_free_pages: order %d is out of range", order));
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	if (order < VM_NFREEORDER - 1) {
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = &seg->first_page[atop(pa - seg->start)];
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			vm_freelist_rem(fl, m_buddy, order);
			if (m_buddy->pool != m->pool)
				vm_phys_set_pool(m->pool, m_buddy, order);
			order++;
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = &seg->first_page[atop(pa - seg->start)];
		} while (order < VM_NFREEORDER - 1);
	}
	fl = (*seg->free_queues)[m->pool];
	vm_freelist_add(fl, m, order, 1);
}
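
/*
 * Coalescing example (assuming 4KB pages): freeing an order-0 page at
 * physical address 0x5000 computes its buddy at 0x5000 ^ 0x1000 = 0x4000.
 * If that page is free at order 0, the pair merges into an order-1 block at
 * 0x4000, whose buddy is 0x4000 ^ 0x2000 = 0x6000, and so on until a buddy
 * is missing, lies outside the segment, or order VM_NFREEORDER - 1 is
 * reached.
 */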

/*
 * Free a contiguous, arbitrarily sized set of physical pages, without
 * merging across set boundaries.
 *
 * The free page queues must be locked.
 */
void
vm_phys_enqueue_contig(vm_page_t m, u_long npages)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_page_t m_end;
	vm_paddr_t diff, lo;
	int order;

	/*
	 * Avoid unnecessary coalescing by freeing the pages in the largest
	 * possible power-of-two-sized subsets.
	 */
	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
	seg = &vm_phys_segs[m->segind];
	fl = (*seg->free_queues)[m->pool];
	m_end = m + npages;
	/* Free blocks of increasing size. */
	lo = atop(VM_PAGE_TO_PHYS(m));
	if (m < m_end &&
	    (diff = lo ^ (lo + npages - 1)) != 0) {
		order = min(ilog2(diff), VM_NFREEORDER - 1);
		m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl, 1);
	}

	/* Free blocks of maximum size. */
	order = VM_NFREEORDER - 1;
	while (m + (1 << order) <= m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		vm_freelist_add(fl, m, order, 1);
		m += 1 << order;
	}
	/* Free blocks of diminishing size. */
	vm_phys_enq_beg(m, m_end - m, fl, 1);
}
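
/*
 * Example: enqueueing the 24-page range that starts at page frame 8 frees
 * [8, 16) as an order-3 block and [16, 32) as an order-4 block (assuming
 * VM_NFREEORDER permits order-4 blocks); the alignment of the start address,
 * not just the length of the range, limits the block sizes that can be used.
 */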

/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_contig(vm_page_t m, u_long npages)
{
	vm_paddr_t lo;
	vm_page_t m_start, m_end;
	unsigned max_order, order_start, order_end;

	vm_domain_free_assert_locked(vm_pagequeue_domain(m));

	lo = atop(VM_PAGE_TO_PHYS(m));
	max_order = min(ilog2(lo ^ (lo + npages)), VM_NFREEORDER - 1);

	m_start = m;
	order_start = ffsll(lo) - 1;
	if (order_start < max_order)
		m_start += 1 << order_start;
	m_end = m + npages;
	order_end = ffsll(lo + npages) - 1;
	if (order_end < max_order)
		m_end -= 1 << order_end;
	/*
	 * Avoid unnecessary coalescing by freeing the pages at the start and
	 * end of the range last.
	 */
	if (m_start < m_end)
		vm_phys_enqueue_contig(m_start, m_end - m_start);
	if (order_start < max_order)
		vm_phys_free_pages(m, order_start);
	if (order_end < max_order)
		vm_phys_free_pages(m_end, order_end);
}
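
/*
 * Example (assuming at least order-4 blocks are supported): freeing 20 pages
 * starting at page frame 6 releases the leading [6, 8) and trailing [24, 26)
 * order-1 sub-ranges with vm_phys_free_pages(), where they may coalesce with
 * neighboring free blocks, and hands the aligned middle [8, 24) to
 * vm_phys_enqueue_contig().
 */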

/*
 * Identify the first address range within segment segind or greater
 * that matches the domain, lies within the low/high range, and has
 * enough pages.  Return -1 if there is none.
 */
int
vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
    u_long npages, vm_paddr_t low, vm_paddr_t high)
{
	vm_paddr_t pa_end, pa_start;
	struct vm_phys_seg *end_seg, *seg;

	KASSERT(npages > 0, ("npages is zero"));
	KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
		if (seg->domain != domain)
			continue;
		if (seg->start >= high)
			return (-1);
		pa_start = MAX(low, seg->start);
		pa_end = MIN(high, seg->end);
		if (pa_end - pa_start < ptoa(npages))
			continue;
		bounds[0] = &seg->first_page[atop(pa_start - seg->start)];
		bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
		return (seg - vm_phys_segs);
	}
	return (-1);
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return true.  Otherwise, return
 * false, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
bool
vm_phys_unfree_page(vm_page_t m)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa, pa_half;
	vm_page_t m_set, m_tmp;
	int order;

	/*
	 * First, find the contiguous, power of two-sized set of free
	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
	 */
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
	    order < VM_NFREEORDER - 1; ) {
		order++;
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
		if (pa >= seg->start)
			m_set = &seg->first_page[atop(pa - seg->start)];
		else
			return (false);
	}
	if (m_set->order < order)
		return (false);
	if (m_set->order == VM_NFREEORDER)
		return (false);
	KASSERT(m_set->order < VM_NFREEORDER,
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
	    m_set, m_set->order));

	/*
	 * Next, remove "m_set" from the free lists.  Finally, extract
	 * "m" from "m_set" using an iterative algorithm: While "m_set"
	 * is larger than a page, shrink "m_set" by returning the half
	 * of "m_set" that does not contain "m" to the free lists.
	 */
	fl = (*seg->free_queues)[m_set->pool];
	order = m_set->order;
	vm_freelist_rem(fl, m_set, order);
	while (order > 0) {
		order--;
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
		else {
			m_tmp = m_set;
			m_set = &seg->first_page[atop(pa_half - seg->start)];
		}
		vm_freelist_add(fl, m_tmp, order, 0);
	}
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (true);
}
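
/*
 * Example: unfreeing page frame 5 out of a free order-3 block covering
 * frames [0, 8) returns [0, 4), [6, 8), and [4, 5) to the free lists and
 * leaves frame 5 removed for the caller.
 */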

/*
 * Find a run of contiguous physical pages, meeting alignment requirements, from
 * a list of max-sized page blocks, where we need at least two consecutive
 * blocks to satisfy the (large) page request.
 */
static vm_page_t
vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages,
    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
	struct vm_phys_seg *seg;
	vm_page_t m, m_iter, m_ret;
	vm_paddr_t max_size, size;
	int max_order;

	max_order = VM_NFREEORDER - 1;
	size = npages << PAGE_SHIFT;
	max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order);
	KASSERT(size > max_size, ("size is too small"));

	/*
	 * In order to avoid examining any free max-sized page block more than
	 * twice, identify the ones that are first in a physically-contiguous
	 * sequence of such blocks, and only for those walk the sequence to
	 * check if there are enough free blocks starting at a properly aligned
	 * block.  Thus, no block is checked for free-ness more than twice.
	 */
	TAILQ_FOREACH(m, &fl[max_order].pl, listq) {
		/*
		 * Skip m unless it is first in a sequence of free max page
		 * blocks >= low in its segment.
		 */
		seg = &vm_phys_segs[m->segind];
		if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start))
			continue;
		if (VM_PAGE_TO_PHYS(m) >= max_size &&
		    VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) &&
		    max_order == m[-1 << max_order].order)
			continue;

		/*
		 * Advance m_ret from m to the first of the sequence, if any,
		 * that satisfies alignment conditions and might leave enough
		 * space.
		 */
		m_ret = m;
		while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret),
		    size, alignment, boundary) &&
		    VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) &&
		    max_order == m_ret[1 << max_order].order)
			m_ret += 1 << max_order;

		/*
		 * Skip m unless some block m_ret in the sequence is properly
		 * aligned, and begins a sequence of enough pages less than
		 * high, and in the same segment.
		 */
		if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end))
			continue;

		/*
		 * Skip m unless the blocks to allocate starting at m_ret are
		 * all free.
		 */
		for (m_iter = m_ret;
		    m_iter < m_ret + npages && max_order == m_iter->order;
		    m_iter += 1 << max_order) {
		}
		if (m_iter < m_ret + npages)
			continue;
		return (m_ret);
	}
	return (NULL);
}

/*
 * Find a run of contiguous physical pages from the specified free list
 * table.
 */
static vm_page_t
vm_phys_find_queues_contig(
    struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
    u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	struct vm_freelist *fl;
	vm_page_t m_ret;
	vm_paddr_t pa, pa_end, size;
	int oind, order, pind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	/* Compute the queue that is the best fit for npages. */
	order = flsl(npages - 1);
	/* Search for a large enough free block. */
	size = npages << PAGE_SHIFT;
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			fl = (*queues)[pind];
			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
				/*
				 * Determine if the address range starting at pa
				 * is within the given range, satisfies the
				 * given alignment, and does not cross the given
				 * boundary.
				 */
				pa = VM_PAGE_TO_PHYS(m_ret);
				pa_end = pa + size;
				if (low <= pa && pa_end <= high &&
				    vm_addr_ok(pa, size, alignment, boundary))
					return (m_ret);
			}
		}
	}
	if (order < VM_NFREEORDER)
		return (NULL);
	/* Search for a long-enough sequence of max-order blocks. */
	for (pind = 0; pind < VM_NFREEPOOL; pind++) {
		fl = (*queues)[pind];
		m_ret = vm_phys_find_freelist_contig(fl, npages,
		    low, high, alignment, boundary);
		if (m_ret != NULL)
			return (m_ret);
	}
	return (NULL);
}

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 */
vm_page_t
vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	vm_paddr_t pa_end, pa_start;
	struct vm_freelist *fl;
	vm_page_t m, m_run;
	struct vm_phys_seg *seg;
	struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
	int oind, segind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	if (low >= high)
		return (NULL);
	queues = NULL;
	m_run = NULL;
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
		if (seg->start >= high || seg->domain != domain)
			continue;
		if (low >= seg->end)
			break;
		if (low <= seg->start)
			pa_start = seg->start;
		else
			pa_start = low;
		if (high < seg->end)
			pa_end = high;
		else
			pa_end = seg->end;
		if (pa_end - pa_start < ptoa(npages))
			continue;
		/*
		 * If a previous segment led to a search using
		 * the same free lists as would this segment, then
		 * we've actually already searched within this
		 * too.  So skip it.
		 */
		if (seg->free_queues == queues)
			continue;
		queues = seg->free_queues;
		m_run = vm_phys_find_queues_contig(queues, npages,
		    low, high, alignment, boundary);
		if (m_run != NULL)
			break;
	}
	if (m_run == NULL)
		return (NULL);

	/* Allocate pages from the page-range found. */
	for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
		fl = (*queues)[m->pool];
		oind = m->order;
		vm_freelist_rem(fl, m, oind);
		if (m->pool != VM_FREEPOOL_DEFAULT)
			vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
	}
	/* Return excess pages to the free lists. */
	fl = (*queues)[VM_FREEPOOL_DEFAULT];
	vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl, 0);

	/* Return page verified to satisfy conditions of request. */
	pa_start = VM_PAGE_TO_PHYS(m_run);
	KASSERT(low <= pa_start,
	    ("memory allocated below minimum requested range"));
	KASSERT(pa_start + ptoa(npages) <= high,
	    ("memory allocated above maximum requested range"));
	seg = &vm_phys_segs[m_run->segind];
	KASSERT(seg->domain == domain,
	    ("memory not allocated from specified domain"));
	KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary),
	    ("memory alignment/boundary constraints not satisfied"));
	return (m_run);
}

/*
 * Return the index of the first unused slot which may be the terminating
 * entry.
 */
static int
vm_phys_avail_count(void)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		continue;
	if (i > PHYS_AVAIL_ENTRIES)
		panic("Improperly terminated phys_avail %d entries", i);

	return (i);
}

/*
 * Assert that a phys_avail entry is valid.
 */
static void
vm_phys_avail_check(int i)
{
	if (phys_avail[i] & PAGE_MASK)
		panic("Unaligned phys_avail[%d]: %#jx", i,
		    (intmax_t)phys_avail[i]);
	if (phys_avail[i + 1] & PAGE_MASK)
		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
		    (intmax_t)phys_avail[i + 1]);
	if (phys_avail[i + 1] < phys_avail[i])
		panic("phys_avail[%d] end %#jx < start %#jx", i,
		    (intmax_t)phys_avail[i + 1], (intmax_t)phys_avail[i]);
}

/*
 * Return the index of an overlapping phys_avail entry or -1.
 */
#ifdef NUMA
static int
vm_phys_avail_find(vm_paddr_t pa)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		if (phys_avail[i] <= pa && phys_avail[i + 1] > pa)
			return (i);
	return (-1);
}
#endif

/*
 * Return the index of the largest entry.
 */
int
vm_phys_avail_largest(void)
{
	vm_paddr_t sz, largesz;
	int largest;
	int i;

	largest = 0;
	largesz = 0;
	for (i = 0; phys_avail[i + 1]; i += 2) {
		sz = vm_phys_avail_size(i);
		if (sz > largesz) {
			largesz = sz;
			largest = i;
		}
	}

	return (largest);
}

vm_paddr_t
vm_phys_avail_size(int i)
{

	return (phys_avail[i + 1] - phys_avail[i]);
}

/*
 * Split an entry at the address 'pa'.  Return zero on success or errno.
 */
static int
vm_phys_avail_split(vm_paddr_t pa, int i)
{
	int cnt;

	vm_phys_avail_check(i);
	if (pa <= phys_avail[i] || pa >= phys_avail[i + 1])
		panic("vm_phys_avail_split: invalid address");
	cnt = vm_phys_avail_count();
	if (cnt >= PHYS_AVAIL_ENTRIES)
		return (ENOSPC);
	memmove(&phys_avail[i + 2], &phys_avail[i],
	    (cnt - i) * sizeof(phys_avail[0]));
	phys_avail[i + 1] = pa;
	phys_avail[i + 2] = pa;
	vm_phys_avail_check(i);
	vm_phys_avail_check(i+2);

	return (0);
}
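
/*
 * Example: splitting the entry [0x100000, 0x400000) at 0x200000 shifts that
 * entry and everything after it up by one pair, then leaves
 * [0x100000, 0x200000) at the original index and [0x200000, 0x400000) in the
 * following pair, provided a spare pair of slots remains in phys_avail[].
 */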

/*
 * Check if a given physical address can be included as part of a crash dump.
 */
bool
vm_phys_is_dumpable(vm_paddr_t pa)
{
	vm_page_t m;
	int i;

	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
		return ((m->flags & PG_NODUMP) == 0);

	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
			return (true);
	}
	return (false);
}

void
vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_seg *seg;

	if (vm_phys_early_nsegs == -1)
		panic("%s: called after initialization", __func__);
	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
		panic("%s: ran out of early segments", __func__);

	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
	seg->start = start;
	seg->end = end;
}

/*
 * This routine allocates NUMA node specific memory before the page
 * allocator is bootstrapped.
 */
vm_paddr_t
vm_phys_early_alloc(int domain, size_t alloc_size)
{
#ifdef NUMA
	int mem_index;
#endif
	int i, biggestone;
	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;

	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
	    ("%s: invalid domain index %d", __func__, domain));

	/*
	 * Search the mem_affinity array for the biggest address
	 * range in the desired domain.  This is used to constrain
	 * the phys_avail selection below.
	 */
	biggestsize = 0;
	mem_start = 0;
	mem_end = -1;
#ifdef NUMA
	mem_index = 0;
	if (mem_affinity != NULL) {
		for (i = 0;; i++) {
			size = mem_affinity[i].end - mem_affinity[i].start;
			if (size == 0)
				break;
			if (domain != -1 && mem_affinity[i].domain != domain)
				continue;
			if (size > biggestsize) {
				mem_index = i;
				biggestsize = size;
			}
		}
		mem_start = mem_affinity[mem_index].start;
		mem_end = mem_affinity[mem_index].end;
	}
#endif

	/*
	 * Now find the biggest physical segment within the desired
	 * NUMA domain.
	 */
	biggestsize = 0;
	biggestone = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		/* skip regions that are out of range */
		if (phys_avail[i+1] - alloc_size < mem_start ||
		    phys_avail[i+1] > mem_end)
			continue;
		size = vm_phys_avail_size(i);
		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
	}
	alloc_size = round_page(alloc_size);

	/*
	 * Grab single pages from the front to reduce fragmentation.
	 */
	if (alloc_size == PAGE_SIZE) {
		pa = phys_avail[biggestone];
		phys_avail[biggestone] += PAGE_SIZE;
		vm_phys_avail_check(biggestone);
		return (pa);
	}

	/*
	 * Naturally align large allocations.
	 */
	align = phys_avail[biggestone + 1] & (alloc_size - 1);
	if (alloc_size + align > biggestsize)
		panic("cannot find a large enough size\n");
	if (align != 0 &&
	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
	    biggestone) != 0)
		/* Wasting memory. */
		phys_avail[biggestone + 1] -= align;

	phys_avail[biggestone + 1] -= alloc_size;
	vm_phys_avail_check(biggestone);
	pa = phys_avail[biggestone + 1];
	return (pa);
}

void
vm_phys_early_startup(void)
{
	struct vm_phys_seg *seg;
	int i;

	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	for (i = 0; i < vm_phys_early_nsegs; i++) {
		seg = &vm_phys_early_segs[i];
		vm_phys_add_seg(seg->start, seg->end);
	}
	vm_phys_early_nsegs = -1;

#ifdef NUMA
	/* Force phys_avail to be split by domain. */
	if (mem_affinity != NULL) {
		int idx;

		for (i = 0; mem_affinity[i].end != 0; i++) {
			idx = vm_phys_avail_find(mem_affinity[i].start);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].start)
				vm_phys_avail_split(mem_affinity[i].start, idx);
			idx = vm_phys_avail_find(mem_affinity[i].end);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].end)
				vm_phys_avail_split(mem_affinity[i].end, idx);
		}
	}
#endif
}

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
{
	struct vm_freelist *fl;
	int flind, oind, pind, dom;

	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf("DOMAIN: %d\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			db_printf("FREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("  |  POOL %d", pind);
			db_printf("\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("-- --      ");
			db_printf("--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				db_printf("  %2.2d (%6.6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
					db_printf("  |  %6.6d", fl[oind].lcnt);
				}
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif