/*-
 * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *	Resident memory management module.
 */

#include <sys/cdefs.h>
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/domainset.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sleepqueue.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
#include <vm/vm_domainset.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>
#include <vm/vm_pager.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_extern.h>
#include <vm/vm_dumpset.h>
#include <vm/uma.h>
#include <vm/uma_int.h>

#include <machine/md_var.h>

struct vm_domain vm_dom[MAXMEMDOM];

DPCPU_DEFINE_STATIC(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]);

struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];

struct mtx_padalign __exclusive_cache_line vm_domainset_lock;
/* The following fields are protected by the domainset lock. */
domainset_t __exclusive_cache_line vm_min_domains;
domainset_t __exclusive_cache_line vm_severe_domains;
static int vm_min_waiters;
static int vm_severe_waiters;
static int vm_pageproc_waiters;

static SYSCTL_NODE(_vm_stats, OID_AUTO, page, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM page statistics");

static COUNTER_U64_DEFINE_EARLY(pqstate_commit_retries);
SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, pqstate_commit_retries,
    CTLFLAG_RD, &pqstate_commit_retries,
    "Number of failed per-page atomic queue state updates");

static COUNTER_U64_DEFINE_EARLY(queue_ops);
SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_ops,
    CTLFLAG_RD, &queue_ops,
    "Number of batched queue operations");

static COUNTER_U64_DEFINE_EARLY(queue_nops);
SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_nops,
    CTLFLAG_RD, &queue_nops,
    "Number of batched queue operations with no effects");

/*
 * bogus page -- for I/O to/from partially complete buffers,
 * or for paging into sparsely invalid regions.
 */
vm_page_t bogus_page;

vm_page_t vm_page_array;
long vm_page_array_size;
long first_page;

struct bitset *vm_page_dump;
long vm_page_dump_pages;

static TAILQ_HEAD(, vm_page) blacklist_head;
static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
    CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");

static uma_zone_t fakepg_zone;

static void vm_page_alloc_check(vm_page_t m);
static bool _vm_page_busy_sleep(vm_object_t obj, vm_page_t m,
    vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked);
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
static void vm_page_enqueue(vm_page_t m, uint8_t queue);
static bool vm_page_free_prep(vm_page_t m);
static void vm_page_free_toq(vm_page_t m);
static void vm_page_init(void *dummy);
static int vm_page_insert_after(vm_page_t m, vm_object_t object,
    vm_pindex_t pindex, vm_page_t mpred);
static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
    vm_page_t mpred);
static void vm_page_mvqueue(vm_page_t m, const uint8_t queue,
    const uint16_t nflag);
static int vm_page_reclaim_run(int req_class, int domain, u_long npages,
    vm_page_t m_run, vm_paddr_t high);
static void vm_page_release_toq(vm_page_t m, uint8_t nqueue, bool noreuse);
static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object,
    int req);
static int vm_page_zone_import(void *arg, void **store, int cnt, int domain,
    int flags);
static void vm_page_zone_release(void *arg, void **store, int cnt);

SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL);

static void
vm_page_init(void *dummy)
{

	fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	bogus_page = vm_page_alloc_noobj(VM_ALLOC_WIRED);
}

static int pgcache_zone_max_pcpu;
SYSCTL_INT(_vm, OID_AUTO, pgcache_zone_max_pcpu,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pgcache_zone_max_pcpu, 0,
    "Per-CPU page cache size");

/*
 * The cache page zone is initialized later since we need to be able to
 * allocate pages before UMA is fully initialized.
 */
static void
vm_page_init_cache_zones(void *dummy __unused)
{
	struct vm_domain *vmd;
	struct vm_pgcache *pgcache;
	int cache, domain, maxcache, pool;

	TUNABLE_INT_FETCH("vm.pgcache_zone_max_pcpu", &pgcache_zone_max_pcpu);
	maxcache = pgcache_zone_max_pcpu * mp_ncpus;
	for (domain = 0; domain < vm_ndomains; domain++) {
		vmd = VM_DOMAIN(domain);
		for (pool = 0; pool < VM_NFREEPOOL; pool++) {
			pgcache = &vmd->vmd_pgcache[pool];
			pgcache->domain = domain;
			pgcache->pool = pool;
			pgcache->zone = uma_zcache_create("vm pgcache",
			    PAGE_SIZE, NULL, NULL, NULL, NULL,
			    vm_page_zone_import, vm_page_zone_release, pgcache,
			    UMA_ZONE_VM);

			/*
			 * Limit each pool's zone to 0.1% of the pages in the
			 * domain.
			 */
			cache = maxcache != 0 ? maxcache :
			    vmd->vmd_page_count / 1000;
			uma_zone_set_maxcache(pgcache->zone, cache);
		}
	}
}
SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL);
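
/*
 * Illustrative sizing note (numbers are hypothetical, not from the source):
 * with vm.pgcache_zone_max_pcpu=32 on a 16-CPU machine, maxcache above is
 * 32 * 16 = 512 pages per pool zone.  With the tunable left at 0, a domain
 * holding 4,000,000 pages is instead limited to 4,000,000 / 1000 = 4000
 * cached pages per pool.
 */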

/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
#if PAGE_SIZE == 32768
#ifdef CTASSERT
CTASSERT(sizeof(u_long) >= 8);
#endif
#endif
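
/*
 * This is presumably required because the page's valid and dirty bitmasks
 * track one bit per DEV_BSIZE (512-byte) chunk of the page; a 32KB page has
 * 32768 / 512 = 64 such chunks, which only fits in a 64-bit word.
 */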

/*
 *	vm_set_page_size:
 *
 *	Sets the page size, perhaps based upon the memory
 *	size.  Must be called before any use of page-size
 *	dependent functions.
 */
void
vm_set_page_size(void)
{
	if (vm_cnt.v_page_size == 0)
		vm_cnt.v_page_size = PAGE_SIZE;
	if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0)
		panic("vm_set_page_size: page size not a power of two");
}

/*
 *	vm_page_blacklist_next:
 *
 *	Find the next entry in the provided string of blacklist
 *	addresses.  Entries are separated by space, comma, or newline.
 *	If an invalid integer is encountered then the rest of the
 *	string is skipped.  Updates the list pointer to the next
 *	character, or NULL if the string is exhausted or invalid.
 */
static vm_paddr_t
vm_page_blacklist_next(char **list, char *end)
{
	vm_paddr_t bad;
	char *cp, *pos;

	if (list == NULL || *list == NULL)
		return (0);
	if (**list == '\0') {
		*list = NULL;
		return (0);
	}

	/*
	 * If there's no end pointer then the buffer is coming from
	 * the kenv and we know it's null-terminated.
	 */
	if (end == NULL)
		end = *list + strlen(*list);

	/* Ensure that strtoq() won't walk off the end */
	if (*end != '\0') {
		if (*end == '\n' || *end == ' ' || *end == ',')
			*end = '\0';
		else {
			printf("Blacklist not terminated, skipping\n");
			*list = NULL;
			return (0);
		}
	}

	for (pos = *list; *pos != '\0'; pos = cp) {
		bad = strtoq(pos, &cp, 0);
		if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') {
			if (bad == 0) {
				if (++cp < end)
					continue;
				else
					break;
			}
		} else
			break;
		if (*cp == '\0' || ++cp >= end)
			*list = NULL;
		else
			*list = cp;
		return (trunc_page(bad));
	}
	printf("Garbage in RAM blacklist, skipping\n");
	*list = NULL;
	return (0);
}
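
/*
 * For example (hypothetical addresses), a blacklist supplied via the loader
 * as "vm.blacklist=0x7f000,0x1008000" causes the two pages containing those
 * physical addresses to be withheld from the free lists at boot.
 */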

bool
vm_page_blacklist_add(vm_paddr_t pa, bool verbose)
{
	struct vm_domain *vmd;
	vm_page_t m;
	bool found;

	m = vm_phys_paddr_to_vm_page(pa);
	if (m == NULL)
		return (true); /* page does not exist, no failure */

	vmd = vm_pagequeue_domain(m);
	vm_domain_free_lock(vmd);
	found = vm_phys_unfree_page(m);
	vm_domain_free_unlock(vmd);
	if (found) {
		vm_domain_freecnt_inc(vmd, -1);
		TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
		if (verbose)
			printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa);
	}
	return (found);
}

/*
 *	vm_page_blacklist_check:
 *
 *	Iterate through the provided string of blacklist addresses, pulling
 *	each entry out of the physical allocator free list and putting it
 *	onto a list for reporting via the vm.page_blacklist sysctl.
 */
static void
vm_page_blacklist_check(char *list, char *end)
{
	vm_paddr_t pa;
	char *next;

	next = list;
	while (next != NULL) {
		if ((pa = vm_page_blacklist_next(&next, end)) == 0)
			continue;
		vm_page_blacklist_add(pa, bootverbose);
	}
}

/*
 *	vm_page_blacklist_load:
 *
 *	Search for a special module named "ram_blacklist".  It'll be a
 *	plain text file provided by the user via the loader directive
 *	of the same name.
 */
static void
vm_page_blacklist_load(char **list, char **end)
{
	void *mod;
	u_char *ptr;
	u_int len;

	ptr = NULL;

	mod = preload_search_by_type("ram_blacklist");
	if (mod != NULL) {
		ptr = preload_fetch_addr(mod);
		len = preload_fetch_size(mod);
	}
	*list = ptr;
	if (ptr != NULL)
		*end = ptr + len;
	else
		*end = NULL;
}

static int
sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
{
	vm_page_t m;
	struct sbuf sbuf;
	int error, first;

	first = 1;
	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	TAILQ_FOREACH(m, &blacklist_head, listq) {
		sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",",
		    (uintmax_t)m->phys_addr);
		first = 0;
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
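
/*
 * The sysctl above reports the pages captured at boot, e.g. (made-up
 * addresses):
 *
 *	$ sysctl vm.page_blacklist
 *	vm.page_blacklist: 0x7f000,0x1008000
 */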

/*
 * Initialize a dummy page for use in scans of the specified paging queue.
 * In principle, this function only needs to set the flag PG_MARKER.
 * Nonetheless, it write busies the page as a safety precaution.
 */
void
vm_page_init_marker(vm_page_t marker, int queue, uint16_t aflags)
{

	bzero(marker, sizeof(*marker));
	marker->flags = PG_MARKER;
	marker->a.flags = aflags;
	marker->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
	marker->a.queue = queue;
}

static void
vm_page_domain_init(int domain)
{
	struct vm_domain *vmd;
	struct vm_pagequeue *pq;
	int i;

	vmd = VM_DOMAIN(domain);
	bzero(vmd, sizeof(*vmd));
	*__DECONST(const char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
	    "vm inactive pagequeue";
	*__DECONST(const char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
	    "vm active pagequeue";
	*__DECONST(const char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) =
	    "vm laundry pagequeue";
	*__DECONST(const char **,
	    &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) =
	    "vm unswappable pagequeue";
	vmd->vmd_domain = domain;
	vmd->vmd_page_count = 0;
	vmd->vmd_free_count = 0;
	vmd->vmd_segs = 0;
	vmd->vmd_oom = FALSE;
	for (i = 0; i < PQ_COUNT; i++) {
		pq = &vmd->vmd_pagequeues[i];
		TAILQ_INIT(&pq->pq_pl);
		mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
		    MTX_DEF | MTX_DUPOK);
		pq->pq_pdpages = 0;
		vm_page_init_marker(&vmd->vmd_markers[i], i, 0);
	}
	mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF);
	mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF);
	snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain);

	/*
	 * inacthead is used to provide FIFO ordering for LRU-bypassing
	 * insertions.
	 */
	vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE, PGA_ENQUEUED);
	TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl,
	    &vmd->vmd_inacthead, plinks.q);

	/*
	 * The clock pages are used to implement active queue scanning without
	 * requeues.  Scans start at clock[0], which is advanced after the scan
	 * ends.  When the two clock hands meet, they are reset and scanning
	 * resumes from the head of the queue.
	 */
	vm_page_init_marker(&vmd->vmd_clock[0], PQ_ACTIVE, PGA_ENQUEUED);
	vm_page_init_marker(&vmd->vmd_clock[1], PQ_ACTIVE, PGA_ENQUEUED);
	TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
	    &vmd->vmd_clock[0], plinks.q);
	TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
	    &vmd->vmd_clock[1], plinks.q);
}

/*
 * Initialize a physical page in preparation for adding it to the free
 * lists.
 */
void
vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind)
{

	m->object = NULL;
	m->ref_count = 0;
	m->busy_lock = VPB_FREED;
	m->flags = m->a.flags = 0;
	m->phys_addr = pa;
	m->a.queue = PQ_NONE;
	m->psind = 0;
	m->segind = segind;
	m->order = VM_NFREEORDER;
	m->pool = VM_FREEPOOL_DEFAULT;
	m->valid = m->dirty = 0;
	pmap_page_init(m);
}

#ifndef PMAP_HAS_PAGE_ARRAY
static vm_paddr_t
vm_page_array_alloc(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t page_range)
{
	vm_paddr_t new_end;

	/*
	 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
	 * However, because this page is allocated from KVM, out-of-bounds
	 * accesses using the direct map will not be trapped.
	 */
	*vaddr += PAGE_SIZE;

	/*
	 * Allocate physical memory for the page structures, and map it.
	 */
	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
	vm_page_array = (vm_page_t)pmap_map(vaddr, new_end, end,
	    VM_PROT_READ | VM_PROT_WRITE);
	vm_page_array_size = page_range;

	return (new_end);
}
#endif

/*
 *	vm_page_startup:
 *
 *	Initializes the resident memory module.  Allocates physical memory for
 *	bootstrapping UMA and some data structures that are used to manage
 *	physical pages.  Initializes these structures, and populates the free
 *	page queues.
 */
vm_offset_t
vm_page_startup(vm_offset_t vaddr)
{
	struct vm_phys_seg *seg;
	struct vm_domain *vmd;
	vm_page_t m;
	char *list, *listend;
	vm_paddr_t end, high_avail, low_avail, new_end, size;
	vm_paddr_t page_range __unused;
	vm_paddr_t last_pa, pa, startp, endp;
	u_long pagecount;
#if MINIDUMP_PAGE_TRACKING
	u_long vm_page_dump_size;
#endif
	int biggestone, i, segind;
#ifdef WITNESS
	vm_offset_t mapped;
	int witness_size;
#endif
#if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
	long ii;
#endif

	vaddr = round_page(vaddr);

	vm_phys_early_startup();
	biggestone = vm_phys_avail_largest();
	end = phys_avail[biggestone + 1];

	/*
	 * Initialize the page and queue locks.
	 */
	mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF);
	for (i = 0; i < PA_LOCK_COUNT; i++)
		mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
	for (i = 0; i < vm_ndomains; i++)
		vm_page_domain_init(i);

	new_end = end;
#ifdef WITNESS
	witness_size = round_page(witness_startup_count());
	new_end -= witness_size;
	mapped = pmap_map(&vaddr, new_end, new_end + witness_size,
	    VM_PROT_READ | VM_PROT_WRITE);
	bzero((void *)mapped, witness_size);
	witness_startup((void *)mapped);
#endif

#if MINIDUMP_PAGE_TRACKING
	/*
	 * Allocate a bitmap to indicate that a random physical page
	 * needs to be included in a minidump.
	 *
	 * The amd64 port needs this to indicate which direct map pages
	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
	 *
	 * However, i386 still needs this workspace internally within the
	 * minidump code.  In theory, these pages are not needed on i386,
	 * but they are included in case the sf_buf code decides to use them.
	 */
	last_pa = 0;
	vm_page_dump_pages = 0;
	for (i = 0; dump_avail[i + 1] != 0; i += 2) {
		vm_page_dump_pages += howmany(dump_avail[i + 1], PAGE_SIZE) -
		    dump_avail[i] / PAGE_SIZE;
		if (dump_avail[i + 1] > last_pa)
			last_pa = dump_avail[i + 1];
	}
	vm_page_dump_size = round_page(BITSET_SIZE(vm_page_dump_pages));
	new_end -= vm_page_dump_size;
	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
	bzero((void *)vm_page_dump, vm_page_dump_size);
#if MINIDUMP_STARTUP_PAGE_TRACKING
	/*
	 * Include the UMA bootstrap pages, witness pages and vm_page_dump
	 * in a crash dump.  When pmap_map() uses the direct map, they are
	 * not automatically included.
	 */
	for (pa = new_end; pa < end; pa += PAGE_SIZE)
		dump_add_page(pa);
#endif
#else
	(void)last_pa;
#endif
	phys_avail[biggestone + 1] = new_end;
#ifdef __amd64__
	/*
	 * Request that the physical pages underlying the message buffer be
	 * included in a crash dump.  Since the message buffer is accessed
	 * through the direct map, they are not automatically included.
	 */
	pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
	last_pa = pa + round_page(msgbufsize);
	while (pa < last_pa) {
		dump_add_page(pa);
		pa += PAGE_SIZE;
	}
#endif
	/*
	 * Compute the number of pages of memory that will be available for
	 * use, taking into account the overhead of a page structure per page.
	 * In other words, solve
	 *	"available physical memory" - round_page(page_range *
	 *	    sizeof(struct vm_page)) = page_range * PAGE_SIZE
	 * for page_range.
	 */
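	/*
	 * A rough worked example with illustrative numbers (not from the
	 * source): if PAGE_SIZE is 4096 and struct vm_page were 104 bytes,
	 * then page_range ~= size / (4096 + 104), so page structures would
	 * consume about 104 / 4200, i.e. ~2.5%, of available memory.
	 */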
	low_avail = phys_avail[0];
	high_avail = phys_avail[1];
	for (i = 0; i < vm_phys_nsegs; i++) {
		if (vm_phys_segs[i].start < low_avail)
			low_avail = vm_phys_segs[i].start;
		if (vm_phys_segs[i].end > high_avail)
			high_avail = vm_phys_segs[i].end;
	}
	/* Skip the first chunk.  It is already accounted for. */
	for (i = 2; phys_avail[i + 1] != 0; i += 2) {
		if (phys_avail[i] < low_avail)
			low_avail = phys_avail[i];
		if (phys_avail[i + 1] > high_avail)
			high_avail = phys_avail[i + 1];
	}
	first_page = low_avail / PAGE_SIZE;
#ifdef VM_PHYSSEG_SPARSE
	size = 0;
	for (i = 0; i < vm_phys_nsegs; i++)
		size += vm_phys_segs[i].end - vm_phys_segs[i].start;
	for (i = 0; phys_avail[i + 1] != 0; i += 2)
		size += phys_avail[i + 1] - phys_avail[i];
#elif defined(VM_PHYSSEG_DENSE)
	size = high_avail - low_avail;
#else
#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
#endif

#ifdef PMAP_HAS_PAGE_ARRAY
	pmap_page_array_startup(size / PAGE_SIZE);
	biggestone = vm_phys_avail_largest();
	end = new_end = phys_avail[biggestone + 1];
#else
#ifdef VM_PHYSSEG_DENSE
	/*
	 * In the VM_PHYSSEG_DENSE case, the number of pages can account for
	 * the overhead of a page structure per page only if vm_page_array is
	 * allocated from the last physical memory chunk.  Otherwise, we must
	 * allocate page structures representing the physical memory
	 * underlying vm_page_array, even though they will not be used.
	 */
	if (new_end != high_avail)
		page_range = size / PAGE_SIZE;
	else
#endif
	{
		page_range = size / (PAGE_SIZE + sizeof(struct vm_page));

		/*
		 * If the partial bytes remaining are large enough for
		 * a page (PAGE_SIZE) without a corresponding
		 * 'struct vm_page', then new_end will contain an
		 * extra page after subtracting the length of the VM
		 * page array.  Compensate by subtracting an extra
		 * page from new_end.
		 */
		if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) {
			if (new_end == high_avail)
				high_avail -= PAGE_SIZE;
			new_end -= PAGE_SIZE;
		}
	}
	end = new_end;
	new_end = vm_page_array_alloc(&vaddr, end, page_range);
#endif

#if VM_NRESERVLEVEL > 0
	/*
	 * Allocate physical memory for the reservation management system's
	 * data structures, and map it.
	 */
	new_end = vm_reserv_startup(&vaddr, new_end);
#endif
#if MINIDUMP_PAGE_TRACKING && MINIDUMP_STARTUP_PAGE_TRACKING
	/*
	 * Include vm_page_array and vm_reserv_array in a crash dump.
	 */
	for (pa = new_end; pa < end; pa += PAGE_SIZE)
		dump_add_page(pa);
#endif
	phys_avail[biggestone + 1] = new_end;

	/*
	 * Add physical memory segments corresponding to the available
	 * physical pages.
	 */
	for (i = 0; phys_avail[i + 1] != 0; i += 2)
		if (vm_phys_avail_size(i) != 0)
			vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);

	/*
	 * Initialize the physical memory allocator.
	 */
	vm_phys_init();

	/*
	 * Initialize the page structures and add every available page to the
	 * physical memory allocator's free lists.
	 */
#if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
	for (ii = 0; ii < vm_page_array_size; ii++) {
		m = &vm_page_array[ii];
		vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0);
		m->flags = PG_FICTITIOUS;
	}
#endif
	vm_cnt.v_page_count = 0;
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		for (m = seg->first_page, pa = seg->start; pa < seg->end;
		    m++, pa += PAGE_SIZE)
			vm_page_init_page(m, pa, segind);

		/*
		 * Add the segment's pages that are covered by one of
		 * phys_avail's ranges to the free lists.
		 */
		for (i = 0; phys_avail[i + 1] != 0; i += 2) {
			if (seg->end <= phys_avail[i] ||
			    seg->start >= phys_avail[i + 1])
				continue;

			startp = MAX(seg->start, phys_avail[i]);
			endp = MIN(seg->end, phys_avail[i + 1]);
			pagecount = (u_long)atop(endp - startp);
			if (pagecount == 0)
				continue;

			m = seg->first_page + atop(startp - seg->start);
			vmd = VM_DOMAIN(seg->domain);
			vm_domain_free_lock(vmd);
			vm_phys_enqueue_contig(m, pagecount);
			vm_domain_free_unlock(vmd);
			vm_domain_freecnt_inc(vmd, pagecount);
			vm_cnt.v_page_count += (u_int)pagecount;
			vmd->vmd_page_count += (u_int)pagecount;
			vmd->vmd_segs |= 1UL << segind;
		}
	}

	/*
	 * Remove blacklisted pages from the physical memory allocator.
	 */
	TAILQ_INIT(&blacklist_head);
	vm_page_blacklist_load(&list, &listend);
	vm_page_blacklist_check(list, listend);

	list = kern_getenv("vm.blacklist");
	vm_page_blacklist_check(list, NULL);

	freeenv(list);
#if VM_NRESERVLEVEL > 0
	/*
	 * Initialize the reservation management system.
	 */
	vm_reserv_init();
#endif

	return (vaddr);
}

void
vm_page_reference(vm_page_t m)
{

	vm_page_aflag_set(m, PGA_REFERENCED);
}

/*
 *	vm_page_trybusy
 *
 *	Helper routine for grab functions to trylock busy.
 *
 *	Returns true on success and false on failure.
 */
static bool
vm_page_trybusy(vm_page_t m, int allocflags)
{

	if ((allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0)
		return (vm_page_trysbusy(m));
	else
		return (vm_page_tryxbusy(m));
}

/*
 *	vm_page_tryacquire
 *
 *	Helper routine for grab functions to trylock busy and wire.
 *
 *	Returns true on success and false on failure.
 */
static inline bool
vm_page_tryacquire(vm_page_t m, int allocflags)
{
	bool locked;

	locked = vm_page_trybusy(m, allocflags);
	if (locked && (allocflags & VM_ALLOC_WIRED) != 0)
		vm_page_wire(m);
	return (locked);
}

/*
 *	vm_page_busy_acquire:
 *
 *	Acquire the busy lock as described by VM_ALLOC_* flags.  Will loop
 *	and drop the object lock if necessary.
 */
bool
vm_page_busy_acquire(vm_page_t m, int allocflags)
{
	vm_object_t obj;
	bool locked;

	/*
	 * The page-specific object must be cached because page
	 * identity can change during the sleep, which would cause
	 * us to re-lock a different object.
	 * It is assumed that the callers already hold a reference
	 * to the object.
	 */
	obj = atomic_load_ptr(&m->object);
	for (;;) {
		if (vm_page_tryacquire(m, allocflags))
			return (true);
		if ((allocflags & VM_ALLOC_NOWAIT) != 0)
			return (false);
		if (obj != NULL)
			locked = VM_OBJECT_WOWNED(obj);
		else
			locked = false;
		MPASS(locked || vm_page_wired(m));
		if (_vm_page_busy_sleep(obj, m, m->pindex, "vmpba", allocflags,
		    locked) && locked)
			VM_OBJECT_WLOCK(obj);
		if ((allocflags & VM_ALLOC_WAITFAIL) != 0)
			return (false);
		KASSERT(m->object == obj || m->object == NULL,
		    ("vm_page_busy_acquire: page %p does not belong to %p",
		    m, obj));
	}
}

/*
 *	vm_page_busy_downgrade:
 *
 *	Downgrade an exclusive busy page into a single shared busy page.
 */
void
vm_page_busy_downgrade(vm_page_t m)
{
	u_int x;

	vm_page_assert_xbusied(m);

	x = vm_page_busy_fetch(m);
	for (;;) {
		if (atomic_fcmpset_rel_int(&m->busy_lock,
		    &x, VPB_SHARERS_WORD(1)))
			break;
	}
	if ((x & VPB_BIT_WAITERS) != 0)
		wakeup(m);
}

/*
 *	vm_page_busy_tryupgrade:
 *
 *	Attempt to upgrade a single shared busy into an exclusive busy.
 */
int
vm_page_busy_tryupgrade(vm_page_t m)
{
	u_int ce, x;

	vm_page_assert_sbusied(m);

	x = vm_page_busy_fetch(m);
	ce = VPB_CURTHREAD_EXCLUSIVE;
	for (;;) {
		if (VPB_SHARERS(x) > 1)
			return (0);
		KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1),
		    ("vm_page_busy_tryupgrade: invalid lock state"));
		if (!atomic_fcmpset_acq_int(&m->busy_lock, &x,
		    ce | (x & VPB_BIT_WAITERS)))
			continue;
		return (1);
	}
}

/*
 *	vm_page_sbusied:
 *
 *	Return a positive value if the page is shared busied, 0 otherwise.
 */
int
vm_page_sbusied(vm_page_t m)
{
	u_int x;

	x = vm_page_busy_fetch(m);
	return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED);
}

/*
 *	vm_page_sunbusy:
 *
 *	Shared unbusy a page.
 */
void
vm_page_sunbusy(vm_page_t m)
{
	u_int x;

	vm_page_assert_sbusied(m);

	x = vm_page_busy_fetch(m);
	for (;;) {
		KASSERT(x != VPB_FREED,
		    ("vm_page_sunbusy: Unlocking freed page."));
		if (VPB_SHARERS(x) > 1) {
			if (atomic_fcmpset_int(&m->busy_lock, &x,
			    x - VPB_ONE_SHARER))
				break;
			continue;
		}
		KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1),
		    ("vm_page_sunbusy: invalid lock state"));
		if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED))
			continue;
		if ((x & VPB_BIT_WAITERS) == 0)
			break;
		wakeup(m);
		break;
	}
}

/*
 *	vm_page_busy_sleep:
 *
 *	Sleep if the page is busy, using the page pointer as wchan.
 *	This is used to implement the hard-path of the busying mechanism.
 *
 *	If VM_ALLOC_IGN_SBUSY is specified in allocflags, the function
 *	will not sleep if the page is shared-busy.
 *
 *	The object lock must be held on entry.
 *
 *	Returns true if it slept and dropped the object lock, or false
 *	if there was no sleep and the lock is still held.
 */
bool
vm_page_busy_sleep(vm_page_t m, const char *wmesg, int allocflags)
{
	vm_object_t obj;

	obj = m->object;
	VM_OBJECT_ASSERT_LOCKED(obj);

	return (_vm_page_busy_sleep(obj, m, m->pindex, wmesg, allocflags,
	    true));
}
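
/*
 * A typical caller retries its lookup after a sleep, since a true return
 * means the object lock was dropped.  A minimal sketch (hypothetical call
 * site, "pwait" is an arbitrary wmesg):
 *
 *	VM_OBJECT_WLOCK(object);
 * retry:
 *	m = vm_page_lookup(object, pindex);
 *	if (m != NULL && !vm_page_tryxbusy(m)) {
 *		if (vm_page_busy_sleep(m, "pwait", 0))
 *			VM_OBJECT_WLOCK(object);
 *		goto retry;
 *	}
 */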

/*
 *	vm_page_busy_sleep_unlocked:
 *
 *	Sleep if the page is busy, using the page pointer as wchan.
 *	This is used to implement the hard-path of the busying mechanism.
 *
 *	If VM_ALLOC_IGN_SBUSY is specified in allocflags, the function
 *	will not sleep if the page is shared-busy.
 *
 *	The object lock must not be held on entry.  The operation will
 *	return if the page changes identity.
 */
void
vm_page_busy_sleep_unlocked(vm_object_t obj, vm_page_t m, vm_pindex_t pindex,
    const char *wmesg, int allocflags)
{
	VM_OBJECT_ASSERT_UNLOCKED(obj);

	(void)_vm_page_busy_sleep(obj, m, pindex, wmesg, allocflags, false);
}

/*
 *	_vm_page_busy_sleep:
 *
 *	Internal busy sleep function.  Verifies the page identity and
 *	lockstate against parameters.  Returns true if it sleeps and
 *	false otherwise.
 *
 *	allocflags uses VM_ALLOC_* flags to specify the lock required.
 *
 *	If locked is true the lock will be dropped for any true returns
 *	and held for any false returns.
 */
static bool
_vm_page_busy_sleep(vm_object_t obj, vm_page_t m, vm_pindex_t pindex,
    const char *wmesg, int allocflags, bool locked)
{
	bool xsleep;
	u_int x;

	/*
	 * If the object is busy we must wait for that to drain to zero
	 * before trying the page again.
	 */
	if (obj != NULL && vm_object_busied(obj)) {
		if (locked)
			VM_OBJECT_DROP(obj);
		vm_object_busy_wait(obj, wmesg);
		return (true);
	}

	if (!vm_page_busied(m))
		return (false);

	xsleep = (allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0;
	sleepq_lock(m);
	x = vm_page_busy_fetch(m);
	do {
		/*
		 * If the page changes objects or becomes unlocked we can
		 * simply return.
		 */
		if (x == VPB_UNBUSIED ||
		    (xsleep && (x & VPB_BIT_SHARED) != 0) ||
		    m->object != obj || m->pindex != pindex) {
			sleepq_release(m);
			return (false);
		}
		if ((x & VPB_BIT_WAITERS) != 0)
			break;
	} while (!atomic_fcmpset_int(&m->busy_lock, &x, x | VPB_BIT_WAITERS));
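	/*
	 * At this point VPB_BIT_WAITERS is set while the sleepqueue lock for
	 * this wchan is held.  An unbusying thread must deliver its wakeup()
	 * under the same sleepqueue lock, so the wakeup cannot be lost
	 * between the check above and sleepq_wait() below.
	 */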
	if (locked)
		VM_OBJECT_DROP(obj);
	DROP_GIANT();
	sleepq_add(m, NULL, wmesg, 0, 0);
	sleepq_wait(m, PVM);
	PICKUP_GIANT();
	return (true);
}

/*
 *	vm_page_trysbusy:
 *
 *	Try to shared busy a page.
 *	If the operation succeeds 1 is returned otherwise 0.
 *	The operation never sleeps.
 */
int
vm_page_trysbusy(vm_page_t m)
{
	vm_object_t obj;
	u_int x;

	obj = m->object;
	x = vm_page_busy_fetch(m);
	for (;;) {
		if ((x & VPB_BIT_SHARED) == 0)
			return (0);
		/*
		 * Reduce the window for transient busies that will trigger
		 * false negatives in vm_page_ps_test().
		 */
		if (obj != NULL && vm_object_busied(obj))
			return (0);
		if (atomic_fcmpset_acq_int(&m->busy_lock, &x,
		    x + VPB_ONE_SHARER))
			break;
	}

	/* Refetch the object now that we're guaranteed that it is stable. */
	obj = m->object;
	if (obj != NULL && vm_object_busied(obj)) {
		vm_page_sunbusy(m);
		return (0);
	}
	return (1);
}

/*
 *	vm_page_tryxbusy:
 *
 *	Try to exclusive busy a page.
 *	If the operation succeeds 1 is returned otherwise 0.
 *	The operation never sleeps.
 */
int
vm_page_tryxbusy(vm_page_t m)
{
	vm_object_t obj;

	if (atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED,
	    VPB_CURTHREAD_EXCLUSIVE) == 0)
		return (0);

	obj = m->object;
	if (obj != NULL && vm_object_busied(obj)) {
		vm_page_xunbusy(m);
		return (0);
	}
	return (1);
}

static void
vm_page_xunbusy_hard_tail(vm_page_t m)
{
	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
	/* Wake the waiter. */
	wakeup(m);
}

/*
 *	vm_page_xunbusy_hard:
 *
 *	Called when unbusy has failed because there is a waiter.
 */
void
vm_page_xunbusy_hard(vm_page_t m)
{
	vm_page_assert_xbusied(m);
	vm_page_xunbusy_hard_tail(m);
}

void
vm_page_xunbusy_hard_unchecked(vm_page_t m)
{
	vm_page_assert_xbusied_unchecked(m);
	vm_page_xunbusy_hard_tail(m);
}

static void
vm_page_busy_free(vm_page_t m)
{
	u_int x;

	atomic_thread_fence_rel();
	x = atomic_swap_int(&m->busy_lock, VPB_FREED);
	if ((x & VPB_BIT_WAITERS) != 0)
		wakeup(m);
}

/*
 *	vm_page_unhold_pages:
 *
 *	Unhold each of the pages that is referenced by the given array.
 */
void
vm_page_unhold_pages(vm_page_t *ma, int count)
{

	for (; count != 0; count--) {
		vm_page_unwire(*ma, PQ_ACTIVE);
		ma++;
	}
}

vm_page_t
PHYS_TO_VM_PAGE(vm_paddr_t pa)
{
	vm_page_t m;

#ifdef VM_PHYSSEG_SPARSE
	m = vm_phys_paddr_to_vm_page(pa);
	if (m == NULL)
		m = vm_phys_fictitious_to_vm_page(pa);
	return (m);
#elif defined(VM_PHYSSEG_DENSE)
	long pi;

	pi = atop(pa);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		m = &vm_page_array[pi - first_page];
		return (m);
	}
	return (vm_phys_fictitious_to_vm_page(pa));
#else
#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
#endif
}

/*
 *	vm_page_getfake:
 *
 *	Create a fictitious page with the specified physical address and
 *	memory attribute.  The memory attribute is the only machine-
 *	dependent aspect of a fictitious page that must be initialized.
 */
vm_page_t
vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
{
	vm_page_t m;

	m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
	vm_page_initfake(m, paddr, memattr);
	return (m);
}

void
vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
{

	if ((m->flags & PG_FICTITIOUS) != 0) {
		/*
		 * The page's memattr might have changed since the
		 * previous initialization.  Update the pmap to the
		 * new memattr.
		 */
		goto memattr;
	}
	m->phys_addr = paddr;
	m->a.queue = PQ_NONE;
	/* Fictitious pages don't use "segind". */
	m->flags = PG_FICTITIOUS;
	/* Fictitious pages don't use "order" or "pool". */
	m->oflags = VPO_UNMANAGED;
	m->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
	/* Fictitious pages are unevictable. */
	m->ref_count = 1;
	pmap_page_init(m);
memattr:
	pmap_page_set_memattr(m, memattr);
}

/*
 *	vm_page_putfake:
 *
 *	Release a fictitious page.
 */
void
vm_page_putfake(vm_page_t m)
{

	KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m));
	KASSERT((m->flags & PG_FICTITIOUS) != 0,
	    ("vm_page_putfake: bad page %p", m));
	vm_page_assert_xbusied(m);
	vm_page_busy_free(m);
	uma_zfree(fakepg_zone, m);
}

/*
 *	vm_page_updatefake:
 *
 *	Update the given fictitious page to the specified physical address and
 *	memory attribute.
 */
void
vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
{

	KASSERT((m->flags & PG_FICTITIOUS) != 0,
	    ("vm_page_updatefake: bad page %p", m));
	m->phys_addr = paddr;
	pmap_page_set_memattr(m, memattr);
}

/*
 *	vm_page_free:
 *
 *	Free a page.
 */
void
vm_page_free(vm_page_t m)
{

	m->flags &= ~PG_ZERO;
	vm_page_free_toq(m);
}

/*
 *	vm_page_free_zero:
 *
 *	Free a page to the zeroed-pages queue.
 */
void
vm_page_free_zero(vm_page_t m)
{

	m->flags |= PG_ZERO;
	vm_page_free_toq(m);
}

/*
 * Unbusy and handle the page queueing for a page from a getpages request that
 * was optionally read ahead or behind.
 */
void
vm_page_readahead_finish(vm_page_t m)
{

	/* We shouldn't put invalid pages on queues. */
	KASSERT(!vm_page_none_valid(m), ("%s: %p is invalid", __func__, m));

	/*
	 * Since the page is not the actually needed one, whether it should
	 * be activated or deactivated is not obvious.  Empirical results
	 * have shown that deactivating the page is usually the best choice,
	 * unless the page is wanted by another thread.
	 */
	if ((vm_page_busy_fetch(m) & VPB_BIT_WAITERS) != 0)
		vm_page_activate(m);
	else
		vm_page_deactivate(m);
	vm_page_xunbusy_unchecked(m);
}

/*
 * Destroy the identity of an invalid page and free it if possible.
 * This is intended to be used when reading a page from backing store fails.
 */
void
vm_page_free_invalid(vm_page_t m)
{

	KASSERT(vm_page_none_valid(m), ("page %p is valid", m));
	KASSERT(!pmap_page_is_mapped(m), ("page %p is mapped", m));
	KASSERT(m->object != NULL, ("page %p has no object", m));
	VM_OBJECT_ASSERT_WLOCKED(m->object);

	/*
	 * We may be attempting to free the page as part of the handling for an
	 * I/O error, in which case the page was xbusied by a different thread.
	 */
	vm_page_xbusy_claim(m);

	/*
	 * If someone has wired this page while the object lock
	 * was not held, then the thread that unwires is responsible
	 * for freeing the page.  Otherwise just free the page now.
	 * The wire count of this unmapped page cannot change while
	 * we have the page xbusy and the page's object wlocked.
	 */
	if (vm_page_remove(m))
		vm_page_free(m);
}

/*
 *	vm_page_dirty_KBI:		[ internal use only ]
 *
 *	Set all bits in the page's dirty field.
 *
 *	The object containing the specified page must be locked if the
 *	call is made from the machine-independent layer.
 *
 *	See vm_page_clear_dirty_mask().
 *
 *	This function should only be called by vm_page_dirty().
 */
void
vm_page_dirty_KBI(vm_page_t m)
{

	/* Refer to this operation by its public name. */
	KASSERT(vm_page_all_valid(m), ("vm_page_dirty: page is invalid!"));
	m->dirty = VM_PAGE_BITS_ALL;
}
/*
 *	vm_page_insert:		[ internal use only ]
 *
 *	Inserts the given mem entry into the object and object list.
 *
 *	The object must be locked.
 */
int
vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t mpred;

	VM_OBJECT_ASSERT_WLOCKED(object);
	mpred = vm_radix_lookup_le(&object->rtree, pindex);
	return (vm_page_insert_after(m, object, pindex, mpred));
}

/*
 *	vm_page_insert_after:
 *
 *	Inserts the page "m" into the specified object at offset "pindex".
 *
 *	The page "mpred" must immediately precede the offset "pindex" within
 *	the specified object.
 *
 *	The object must be locked.
 */
static int
vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
    vm_page_t mpred)
{
	vm_page_t msucc;

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(m->object == NULL,
	    ("vm_page_insert_after: page already inserted"));
	if (mpred != NULL) {
		KASSERT(mpred->object == object,
		    ("vm_page_insert_after: object doesn't contain mpred"));
		KASSERT(mpred->pindex < pindex,
		    ("vm_page_insert_after: mpred doesn't precede pindex"));
		msucc = TAILQ_NEXT(mpred, listq);
	} else
		msucc = TAILQ_FIRST(&object->memq);
	if (msucc != NULL)
		KASSERT(msucc->pindex > pindex,
		    ("vm_page_insert_after: msucc doesn't succeed pindex"));

	/*
	 * Record the object/offset pair in this page.
	 */
	m->object = object;
	m->pindex = pindex;
	m->ref_count |= VPRC_OBJREF;

	/*
	 * Now link into the object's ordered list of backed pages.
	 */
	if (vm_radix_insert(&object->rtree, m)) {
		m->object = NULL;
		m->pindex = 0;
		m->ref_count &= ~VPRC_OBJREF;
		return (1);
	}
	vm_page_insert_radixdone(m, object, mpred);
	vm_pager_page_inserted(object, m);
	return (0);
}

/*
 *	vm_page_insert_radixdone:
 *
 *	Complete page "m" insertion into the specified object after the
 *	radix trie hooking.
 *
 *	The page "mpred" must precede the offset "m->pindex" within the
 *	specified object.
 *
 *	The object must be locked.
 */
static void
vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred)
{

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(object != NULL && m->object == object,
	    ("vm_page_insert_radixdone: page %p has inconsistent object", m));
	KASSERT((m->ref_count & VPRC_OBJREF) != 0,
	    ("vm_page_insert_radixdone: page %p is missing object ref", m));
	if (mpred != NULL) {
		KASSERT(mpred->object == object,
		    ("vm_page_insert_radixdone: object doesn't contain mpred"));
		KASSERT(mpred->pindex < m->pindex,
		    ("vm_page_insert_radixdone: mpred doesn't precede pindex"));
	}

	if (mpred != NULL)
		TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
	else
		TAILQ_INSERT_HEAD(&object->memq, m, listq);

	/*
	 * Show that the object has one more resident page.
	 */
	object->resident_page_count++;

	/*
	 * Hold the vnode until the last page is released.
	 */
	if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
		vhold(object->handle);

	/*
	 * Since we are inserting a new and possibly dirty page,
	 * update the object's generation count.
	 */
	if (pmap_page_is_write_mapped(m))
		vm_object_set_writeable_dirty(object);
}

/*
 * Do the work to remove a page from its object.  The caller is responsible for
 * updating the page's fields to reflect this removal.
 */
static void
vm_page_object_remove(vm_page_t m)
{
	vm_object_t object;
	vm_page_t mrem __diagused;

	vm_page_assert_xbusied(m);
	object = m->object;
	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT((m->ref_count & VPRC_OBJREF) != 0,
	    ("page %p is missing its object ref", m));

	/* Deferred free of swap space. */
	if ((m->a.flags & PGA_SWAP_FREE) != 0)
		vm_pager_page_unswapped(m);

	vm_pager_page_removed(object, m);

	m->object = NULL;
	mrem = vm_radix_remove(&object->rtree, m->pindex);
	KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m));

	/*
	 * Now remove from the object's list of backed pages.
	 */
	TAILQ_REMOVE(&object->memq, m, listq);

	/*
	 * And show that the object has one fewer resident page.
	 */
	object->resident_page_count--;

	/*
	 * The vnode may now be recycled.
	 */
	if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
		vdrop(object->handle);
}

/*
 *	vm_page_remove:
 *
 *	Removes the specified page from its containing object, but does not
 *	invalidate any backing storage.  Returns true if the object's reference
 *	was the last reference to the page, and false otherwise.
 *
 *	The object must be locked and the page must be exclusively busied.
 *	The exclusive busy will be released on return.  If this is not the
 *	final ref and the caller does not hold a wire reference it may not
 *	continue to access the page.
 */
bool
vm_page_remove(vm_page_t m)
{
	bool dropped;

	dropped = vm_page_remove_xbusy(m);
	vm_page_xunbusy(m);

	return (dropped);
}

/*
 *	vm_page_remove_xbusy
 *
 *	Removes the page but leaves the xbusy held.  Returns true if this
 *	removed the final ref and false otherwise.
 */
bool
vm_page_remove_xbusy(vm_page_t m)
{

	vm_page_object_remove(m);
	return (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF);
}

/*
 *	vm_page_lookup:
 *
 *	Returns the page associated with the object/offset
 *	pair specified; if none is found, NULL is returned.
 *
 *	The object must be locked.
 */
vm_page_t
vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
{

	VM_OBJECT_ASSERT_LOCKED(object);
	return (vm_radix_lookup(&object->rtree, pindex));
}

/*
 *	vm_page_lookup_unlocked:
 *
 *	Returns the page associated with the object/offset pair specified;
 *	if none is found, NULL is returned.  The page may no longer be
 *	present in the object at the time that this function returns.  Only
 *	useful for opportunistic checks such as inmem().
 */
vm_page_t
vm_page_lookup_unlocked(vm_object_t object, vm_pindex_t pindex)
{

	return (vm_radix_lookup_unlocked(&object->rtree, pindex));
}

/*
 *	vm_page_relookup:
 *
 *	Returns a page that must already have been busied by
 *	the caller.  Used for bogus page replacement.
 */
vm_page_t
vm_page_relookup(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	m = vm_radix_lookup_unlocked(&object->rtree, pindex);
	KASSERT(m != NULL && (vm_page_busied(m) || vm_page_wired(m)) &&
	    m->object == object && m->pindex == pindex,
	    ("vm_page_relookup: Invalid page %p", m));
	return (m);
}

/*
 * This should only be used by lockless functions for releasing transient
 * incorrect acquires.  The page may have been freed after we acquired a
 * busy lock.  In this case busy_lock == VPB_FREED and we have nothing
 * further to do.
 */
static void
vm_page_busy_release(vm_page_t m)
{
	u_int x;

	x = vm_page_busy_fetch(m);
	for (;;) {
		if (x == VPB_FREED)
			break;
		if ((x & VPB_BIT_SHARED) != 0 && VPB_SHARERS(x) > 1) {
			if (atomic_fcmpset_int(&m->busy_lock, &x,
			    x - VPB_ONE_SHARER))
				break;
			continue;
		}
		KASSERT((x & VPB_BIT_SHARED) != 0 ||
		    (x & ~VPB_BIT_WAITERS) == VPB_CURTHREAD_EXCLUSIVE,
		    ("vm_page_busy_release: %p xbusy not owned.", m));
		if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED))
			continue;
		if ((x & VPB_BIT_WAITERS) != 0)
			wakeup(m);
		break;
	}
}

/*
 *	vm_page_find_least:
 *
 *	Returns the page associated with the object with least pindex
 *	greater than or equal to the parameter pindex, or NULL.
 *
 *	The object must be locked.
 */
vm_page_t
vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_ASSERT_LOCKED(object);
	if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex)
		m = vm_radix_lookup_ge(&object->rtree, pindex);
	return (m);
}

/*
 * Returns the given page's successor (by pindex) within the object if it is
 * resident; if none is found, NULL is returned.
 *
 * The object must be locked.
 */
vm_page_t
vm_page_next(vm_page_t m)
{
	vm_page_t next;

	VM_OBJECT_ASSERT_LOCKED(m->object);
	if ((next = TAILQ_NEXT(m, listq)) != NULL) {
		MPASS(next->object == m->object);
		if (next->pindex != m->pindex + 1)
			next = NULL;
	}
	return (next);
}

/*
 * Returns the given page's predecessor (by pindex) within the object if it is
 * resident; if none is found, NULL is returned.
 *
 * The object must be locked.
 */
vm_page_t
vm_page_prev(vm_page_t m)
{
	vm_page_t prev;

	VM_OBJECT_ASSERT_LOCKED(m->object);
	if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) {
		MPASS(prev->object == m->object);
		if (prev->pindex != m->pindex - 1)
			prev = NULL;
	}
	return (prev);
}

/*
 * Uses the page mnew as a replacement for an existing page at index
 * pindex which must be already present in the object.
 *
 * Both pages must be exclusively busied on enter.  The old page is
 * unbusied on exit.
 *
 * A return value of true means mold is now free.  If this is not the
 * final ref and the caller does not hold a wire reference it may not
 * continue to access the page.
 */
static bool
vm_page_replace_hold(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
    vm_page_t mold)
{
	vm_page_t mret __diagused;
	bool dropped;

	VM_OBJECT_ASSERT_WLOCKED(object);
	vm_page_assert_xbusied(mold);
	KASSERT(mnew->object == NULL && (mnew->ref_count & VPRC_OBJREF) == 0,
	    ("vm_page_replace: page %p already in object", mnew));

	/*
	 * This function mostly follows vm_page_insert() and
	 * vm_page_remove() without the radix, object count and vnode
	 * dance.  Double check such functions for more comments.
	 */

	mnew->object = object;
	mnew->pindex = pindex;
	atomic_set_int(&mnew->ref_count, VPRC_OBJREF);
	mret = vm_radix_replace(&object->rtree, mnew);
	KASSERT(mret == mold,
	    ("invalid page replacement, mold=%p, mret=%p", mold, mret));
	KASSERT((mold->oflags & VPO_UNMANAGED) ==
	    (mnew->oflags & VPO_UNMANAGED),
	    ("vm_page_replace: mismatched VPO_UNMANAGED"));

	/* Keep the resident page list in sorted order. */
	TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq);
	TAILQ_REMOVE(&object->memq, mold, listq);
	mold->object = NULL;

	/*
	 * The object's resident_page_count does not change because we have
	 * swapped one page for another, but the generation count should
	 * change if the page is dirty.
	 */
	if (pmap_page_is_write_mapped(mnew))
		vm_object_set_writeable_dirty(object);
	dropped = vm_page_drop(mold, VPRC_OBJREF) == VPRC_OBJREF;
	vm_page_xunbusy(mold);

	return (dropped);
}

void
vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
    vm_page_t mold)
{

	vm_page_assert_xbusied(mnew);

	if (vm_page_replace_hold(mnew, object, pindex, mold))
		vm_page_free(mold);
}

/*
 *	vm_page_rename:
 *
 *	Move the given memory entry from its
 *	current object to the specified target object/offset.
 *
 *	Note: swap associated with the page must be invalidated by the move.  We
 *	      have to do this for several reasons:  (1) we aren't freeing the
 *	      page, (2) we are dirtying the page, (3) the VM system is probably
 *	      moving the page from object A to B, and will then later move
 *	      the backing store from A to B and we can't have a conflict.
 *
 *	Note: we *always* dirty the page.  It is necessary both for the
 *	      fact that we moved it, and because we may be invalidating
 *	      swap.
 *
 *	The objects must be locked.
 */
int
vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
{
	vm_page_t mpred;
	vm_pindex_t opidx;

	VM_OBJECT_ASSERT_WLOCKED(new_object);

	KASSERT(m->ref_count != 0, ("vm_page_rename: page %p has no refs", m));
	mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex);
	KASSERT(mpred == NULL || mpred->pindex != new_pindex,
	    ("vm_page_rename: pindex already renamed"));

	/*
	 * Create a custom version of vm_page_insert() which does not depend
	 * on mpred and can cheat on the implementation aspects of the
	 * function.
	 */
	opidx = m->pindex;
	m->pindex = new_pindex;
	if (vm_radix_insert(&new_object->rtree, m)) {
		m->pindex = opidx;
		return (1);
	}

	/*
	 * The operation cannot fail anymore.  The removal must happen before
	 * the listq iterator is tainted.
	 */
	m->pindex = opidx;
	vm_page_object_remove(m);

	/* Return back to the new pindex to complete vm_page_insert(). */
	m->pindex = new_pindex;
	m->object = new_object;

	vm_page_insert_radixdone(m, new_object, mpred);
	vm_page_dirty(m);
	vm_pager_page_inserted(new_object, m);
	return (0);
}

/*
 *	vm_page_alloc:
 *
 *	Allocate and return a page that is associated with the specified
 *	object and offset pair.  By default, this page is exclusive busied.
 *
 *	The caller must always specify an allocation class.
 *
 *	allocation classes:
 *	VM_ALLOC_NORMAL		normal process request
 *	VM_ALLOC_SYSTEM		system *really* needs a page
 *	VM_ALLOC_INTERRUPT	interrupt time request
 *
 *	optional allocation flags:
 *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
 *				intends to allocate
 *	VM_ALLOC_NOBUSY		do not exclusive busy the page
 *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
 *	VM_ALLOC_SBUSY		shared busy the allocated page
 *	VM_ALLOC_WIRED		wire the allocated page
 *	VM_ALLOC_ZERO		prefer a zeroed page
 */
1909vm_page_t
1910vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
1911{
1912
1913	return (vm_page_alloc_after(object, pindex, req,
1914	    vm_radix_lookup_le(&object->rtree, pindex)));
1915}
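
/*
 * Example (an illustrative sketch, not a caller in this file): allocate a
 * wired page, zeroing it by hand when needed, since VM_ALLOC_ZERO only
 * expresses a preference for a pre-zeroed page:
 *
 *	VM_OBJECT_WLOCK(object);
 *	m = vm_page_alloc(object, pindex,
 *	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 *	VM_OBJECT_WUNLOCK(object);
 *	if (m != NULL && (m->flags & PG_ZERO) == 0)
 *		pmap_zero_page(m);
 */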
1916
1917vm_page_t
1918vm_page_alloc_domain(vm_object_t object, vm_pindex_t pindex, int domain,
1919    int req)
1920{
1921
1922	return (vm_page_alloc_domain_after(object, pindex, domain, req,
1923	    vm_radix_lookup_le(&object->rtree, pindex)));
1924}
1925
1926/*
1927 * Allocate a page in the specified object with the given page index.  To
 * optimize insertion of the page into the object, the caller must also specify
1929 * the resident page in the object with largest index smaller than the given
1930 * page index, or NULL if no such page exists.
1931 */
1932vm_page_t
1933vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex,
1934    int req, vm_page_t mpred)
1935{
1936	struct vm_domainset_iter di;
1937	vm_page_t m;
1938	int domain;
1939
1940	vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
1941	do {
1942		m = vm_page_alloc_domain_after(object, pindex, domain, req,
1943		    mpred);
1944		if (m != NULL)
1945			break;
1946	} while (vm_domainset_iter_page(&di, object, &domain) == 0);
1947
1948	return (m);
1949}
1950
/*
 * Atomically reserve "npages" from the domain's free page count.  Returns
 * true if the pages were reserved without the free count falling below the
 * minimum for the request class, and false (reserving nothing) otherwise.
 */
1955static int
1956_vm_domain_allocate(struct vm_domain *vmd, int req_class, int npages)
1957{
1958	u_int limit, old, new;
1959
1960	if (req_class == VM_ALLOC_INTERRUPT)
1961		limit = 0;
1962	else if (req_class == VM_ALLOC_SYSTEM)
1963		limit = vmd->vmd_interrupt_free_min;
1964	else
1965		limit = vmd->vmd_free_reserved;
1966
1967	/*
1968	 * Attempt to reserve the pages.  Fail if we're below the limit.
1969	 */
1970	limit += npages;
1971	old = vmd->vmd_free_count;
1972	do {
1973		if (old < limit)
1974			return (0);
1975		new = old - npages;
1976	} while (atomic_fcmpset_int(&vmd->vmd_free_count, &old, new) == 0);
1977
1978	/* Wake the page daemon if we've crossed the threshold. */
1979	if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old))
1980		pagedaemon_wakeup(vmd->vmd_domain);
1981
1982	/* Only update bitsets on transitions. */
1983	if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) ||
1984	    (old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe))
1985		vm_domain_set(vmd);
1986
1987	return (1);
1988}
1989
1990int
1991vm_domain_allocate(struct vm_domain *vmd, int req, int npages)
1992{
1993	int req_class;
1994
1995	/*
1996	 * The page daemon is allowed to dig deeper into the free page list.
1997	 */
1998	req_class = req & VM_ALLOC_CLASS_MASK;
1999	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
2000		req_class = VM_ALLOC_SYSTEM;
2001	return (_vm_domain_allocate(vmd, req_class, npages));
2002}
2003
2004vm_page_t
2005vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain,
2006    int req, vm_page_t mpred)
2007{
2008	struct vm_domain *vmd;
2009	vm_page_t m;
2010	int flags;
2011
2012#define	VPA_FLAGS	(VM_ALLOC_CLASS_MASK | VM_ALLOC_WAITFAIL |	\
2013			 VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY |		\
2014			 VM_ALLOC_SBUSY | VM_ALLOC_WIRED |		\
2015			 VM_ALLOC_NODUMP | VM_ALLOC_ZERO | VM_ALLOC_COUNT_MASK)
2016	KASSERT((req & ~VPA_FLAGS) == 0,
2017	    ("invalid request %#x", req));
2018	KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
2019	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
2020	    ("invalid request %#x", req));
2021	KASSERT(mpred == NULL || mpred->pindex < pindex,
2022	    ("mpred %p doesn't precede pindex 0x%jx", mpred,
2023	    (uintmax_t)pindex));
2024	VM_OBJECT_ASSERT_WLOCKED(object);
2025
2026	flags = 0;
2027	m = NULL;
2028	if (!vm_pager_can_alloc_page(object, pindex))
2029		return (NULL);
2030again:
2031#if VM_NRESERVLEVEL > 0
2032	/*
2033	 * Can we allocate the page from a reservation?
2034	 */
2035	if (vm_object_reserv(object) &&
2036	    (m = vm_reserv_alloc_page(object, pindex, domain, req, mpred)) !=
2037	    NULL) {
2038		goto found;
2039	}
2040#endif
2041	vmd = VM_DOMAIN(domain);
2042	if (vmd->vmd_pgcache[VM_FREEPOOL_DEFAULT].zone != NULL) {
2043		m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DEFAULT].zone,
2044		    M_NOWAIT | M_NOVM);
2045		if (m != NULL) {
2046			flags |= PG_PCPU_CACHE;
2047			goto found;
2048		}
2049	}
2050	if (vm_domain_allocate(vmd, req, 1)) {
		/*
		 * The page was not allocated from a reservation or the
		 * per-CPU cache, so take it from the free page queues.
		 */
2054		vm_domain_free_lock(vmd);
2055		m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, 0);
2056		vm_domain_free_unlock(vmd);
2057		if (m == NULL) {
2058			vm_domain_freecnt_inc(vmd, 1);
2059#if VM_NRESERVLEVEL > 0
2060			if (vm_reserv_reclaim_inactive(domain))
2061				goto again;
2062#endif
2063		}
2064	}
2065	if (m == NULL) {
2066		/*
2067		 * Not allocatable, give up.
2068		 */
2069		if (vm_domain_alloc_fail(vmd, object, req))
2070			goto again;
2071		return (NULL);
2072	}
2073
2074	/*
2075	 * At this point we had better have found a good page.
2076	 */
2077found:
2078	vm_page_dequeue(m);
2079	vm_page_alloc_check(m);
2080
2081	/*
2082	 * Initialize the page.  Only the PG_ZERO flag is inherited.
2083	 */
2084	flags |= m->flags & PG_ZERO;
2085	if ((req & VM_ALLOC_NODUMP) != 0)
2086		flags |= PG_NODUMP;
2087	m->flags = flags;
2088	m->a.flags = 0;
2089	m->oflags = (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0;
2090	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
2091		m->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
2092	else if ((req & VM_ALLOC_SBUSY) != 0)
2093		m->busy_lock = VPB_SHARERS_WORD(1);
2094	else
2095		m->busy_lock = VPB_UNBUSIED;
2096	if (req & VM_ALLOC_WIRED) {
2097		vm_wire_add(1);
2098		m->ref_count = 1;
2099	}
2100	m->a.act_count = 0;
2101
2102	if (vm_page_insert_after(m, object, pindex, mpred)) {
2103		if (req & VM_ALLOC_WIRED) {
2104			vm_wire_sub(1);
2105			m->ref_count = 0;
2106		}
2107		KASSERT(m->object == NULL, ("page %p has object", m));
2108		m->oflags = VPO_UNMANAGED;
2109		m->busy_lock = VPB_UNBUSIED;
2110		/* Don't change PG_ZERO. */
2111		vm_page_free_toq(m);
2112		if (req & VM_ALLOC_WAITFAIL) {
2113			VM_OBJECT_WUNLOCK(object);
2114			vm_radix_wait();
2115			VM_OBJECT_WLOCK(object);
2116		}
2117		return (NULL);
2118	}
2119
2120	/* Ignore device objects; the pager sets "memattr" for them. */
2121	if (object->memattr != VM_MEMATTR_DEFAULT &&
2122	    (object->flags & OBJ_FICTITIOUS) == 0)
2123		pmap_page_set_memattr(m, object->memattr);
2124
2125	return (m);
2126}
2127
2128/*
2129 *	vm_page_alloc_contig:
2130 *
2131 *	Allocate a contiguous set of physical pages of the given size "npages"
2132 *	from the free lists.  All of the physical pages must be at or above
2133 *	the given physical address "low" and below the given physical address
2134 *	"high".  The given value "alignment" determines the alignment of the
2135 *	first physical page in the set.  If the given value "boundary" is
2136 *	non-zero, then the set of physical pages cannot cross any physical
2137 *	address boundary that is a multiple of that value.  Both "alignment"
2138 *	and "boundary" must be a power of two.
2139 *
2140 *	If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
2141 *	then the memory attribute setting for the physical pages is configured
2142 *	to the object's memory attribute setting.  Otherwise, the memory
2143 *	attribute setting for the physical pages is configured to "memattr",
2144 *	overriding the object's memory attribute setting.  However, if the
2145 *	object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
2146 *	memory attribute setting for the physical pages cannot be configured
2147 *	to VM_MEMATTR_DEFAULT.
2148 *
2149 *	The specified object may not contain fictitious pages.
2150 *
2151 *	The caller must always specify an allocation class.
2152 *
2153 *	allocation classes:
2154 *	VM_ALLOC_NORMAL		normal process request
2155 *	VM_ALLOC_SYSTEM		system *really* needs a page
2156 *	VM_ALLOC_INTERRUPT	interrupt time request
2157 *
2158 *	optional allocation flags:
2159 *	VM_ALLOC_NOBUSY		do not exclusive busy the page
2160 *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
2161 *	VM_ALLOC_SBUSY		shared busy the allocated page
2162 *	VM_ALLOC_WIRED		wire the allocated page
2163 *	VM_ALLOC_ZERO		prefer a zeroed page
2164 */
2165vm_page_t
2166vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
2167    u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
2168    vm_paddr_t boundary, vm_memattr_t memattr)
2169{
2170	struct vm_domainset_iter di;
2171	vm_page_t bounds[2];
2172	vm_page_t m;
2173	int domain;
2174	int start_segind;
2175
2176	start_segind = -1;
2177
2178	vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
2179	do {
2180		m = vm_page_alloc_contig_domain(object, pindex, domain, req,
2181		    npages, low, high, alignment, boundary, memattr);
2182		if (m != NULL)
2183			break;
2184		if (start_segind == -1)
2185			start_segind = vm_phys_lookup_segind(low);
2186		if (vm_phys_find_range(bounds, start_segind, domain,
2187		    npages, low, high) == -1) {
2188			vm_domainset_iter_ignore(&di, domain);
2189		}
2190	} while (vm_domainset_iter_page(&di, object, &domain) == 0);
2191
2192	return (m);
2193}
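
/*
 * Example (an illustrative sketch with hypothetical bounds): allocate 16
 * contiguous, wired pages below 4GB, aligned to a 64KB boundary, with the
 * object write-locked:
 *
 *	m = vm_page_alloc_contig(object, pindex,
 *	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED, 16, 0, (vm_paddr_t)1 << 32,
 *	    64 * 1024, 0, VM_MEMATTR_DEFAULT);
 *
 * On a NULL return, a caller may attempt vm_page_reclaim_contig() with the
 * same bounds and retry; see vm_page_reclaim_contig() below.
 */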
2194
2195static vm_page_t
2196vm_page_find_contig_domain(int domain, int req, u_long npages, vm_paddr_t low,
2197    vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
2198{
2199	struct vm_domain *vmd;
2200	vm_page_t m_ret;
2201
2202	/*
2203	 * Can we allocate the pages without the number of free pages falling
2204	 * below the lower bound for the allocation class?
2205	 */
2206	vmd = VM_DOMAIN(domain);
2207	if (!vm_domain_allocate(vmd, req, npages))
2208		return (NULL);
2209	/*
2210	 * Try to allocate the pages from the free page queues.
2211	 */
2212	vm_domain_free_lock(vmd);
2213	m_ret = vm_phys_alloc_contig(domain, npages, low, high,
2214	    alignment, boundary);
2215	vm_domain_free_unlock(vmd);
2216	if (m_ret != NULL)
2217		return (m_ret);
2218#if VM_NRESERVLEVEL > 0
2219	/*
2220	 * Try to break a reservation to allocate the pages.
2221	 */
2222	if ((req & VM_ALLOC_NORECLAIM) == 0) {
		m_ret = vm_reserv_reclaim_contig(domain, npages, low,
		    high, alignment, boundary);
2225		if (m_ret != NULL)
2226			return (m_ret);
2227	}
2228#endif
2229	vm_domain_freecnt_inc(vmd, npages);
2230	return (NULL);
2231}
2232
2233vm_page_t
2234vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain,
2235    int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
2236    vm_paddr_t boundary, vm_memattr_t memattr)
2237{
2238	vm_page_t m, m_ret, mpred;
2239	u_int busy_lock, flags, oflags;
2240
2241#define	VPAC_FLAGS	(VPA_FLAGS | VM_ALLOC_NORECLAIM)
2242	KASSERT((req & ~VPAC_FLAGS) == 0,
2243	    ("invalid request %#x", req));
2244	KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
2245	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
2246	    ("invalid request %#x", req));
2247	KASSERT((req & (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM)) !=
2248	    (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM),
2249	    ("invalid request %#x", req));
2250	VM_OBJECT_ASSERT_WLOCKED(object);
2251	KASSERT((object->flags & OBJ_FICTITIOUS) == 0,
2252	    ("vm_page_alloc_contig: object %p has fictitious pages",
2253	    object));
2254	KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
2255
2256	mpred = vm_radix_lookup_le(&object->rtree, pindex);
2257	KASSERT(mpred == NULL || mpred->pindex != pindex,
2258	    ("vm_page_alloc_contig: pindex already allocated"));
2259	for (;;) {
2260#if VM_NRESERVLEVEL > 0
2261		/*
2262		 * Can we allocate the pages from a reservation?
2263		 */
2264		if (vm_object_reserv(object) &&
2265		    (m_ret = vm_reserv_alloc_contig(object, pindex, domain, req,
2266		    mpred, npages, low, high, alignment, boundary)) != NULL) {
2267			break;
2268		}
2269#endif
2270		if ((m_ret = vm_page_find_contig_domain(domain, req, npages,
2271		    low, high, alignment, boundary)) != NULL)
2272			break;
2273		if (!vm_domain_alloc_fail(VM_DOMAIN(domain), object, req))
2274			return (NULL);
2275	}
2276	for (m = m_ret; m < &m_ret[npages]; m++) {
2277		vm_page_dequeue(m);
2278		vm_page_alloc_check(m);
2279	}
2280
2281	/*
2282	 * Initialize the pages.  Only the PG_ZERO flag is inherited.
2283	 */
2284	flags = PG_ZERO;
2285	if ((req & VM_ALLOC_NODUMP) != 0)
2286		flags |= PG_NODUMP;
2287	oflags = (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0;
2288	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
2289		busy_lock = VPB_CURTHREAD_EXCLUSIVE;
2290	else if ((req & VM_ALLOC_SBUSY) != 0)
2291		busy_lock = VPB_SHARERS_WORD(1);
2292	else
2293		busy_lock = VPB_UNBUSIED;
2294	if ((req & VM_ALLOC_WIRED) != 0)
2295		vm_wire_add(npages);
2296	if (object->memattr != VM_MEMATTR_DEFAULT &&
2297	    memattr == VM_MEMATTR_DEFAULT)
2298		memattr = object->memattr;
2299	for (m = m_ret; m < &m_ret[npages]; m++) {
2300		m->a.flags = 0;
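		/* Force PG_NODUMP on if requested; inherit only PG_ZERO. */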
2301		m->flags = (m->flags | PG_NODUMP) & flags;
2302		m->busy_lock = busy_lock;
2303		if ((req & VM_ALLOC_WIRED) != 0)
2304			m->ref_count = 1;
2305		m->a.act_count = 0;
2306		m->oflags = oflags;
2307		if (vm_page_insert_after(m, object, pindex, mpred)) {
2308			if ((req & VM_ALLOC_WIRED) != 0)
2309				vm_wire_sub(npages);
2310			KASSERT(m->object == NULL,
2311			    ("page %p has object", m));
2312			mpred = m;
2313			for (m = m_ret; m < &m_ret[npages]; m++) {
2314				if (m <= mpred &&
2315				    (req & VM_ALLOC_WIRED) != 0)
2316					m->ref_count = 0;
2317				m->oflags = VPO_UNMANAGED;
2318				m->busy_lock = VPB_UNBUSIED;
2319				/* Don't change PG_ZERO. */
2320				vm_page_free_toq(m);
2321			}
2322			if (req & VM_ALLOC_WAITFAIL) {
2323				VM_OBJECT_WUNLOCK(object);
2324				vm_radix_wait();
2325				VM_OBJECT_WLOCK(object);
2326			}
2327			return (NULL);
2328		}
2329		mpred = m;
2330		if (memattr != VM_MEMATTR_DEFAULT)
2331			pmap_page_set_memattr(m, memattr);
2332		pindex++;
2333	}
2334	return (m_ret);
2335}
2336
2337/*
2338 * Allocate a physical page that is not intended to be inserted into a VM
2339 * object.  If the "freelist" parameter is not equal to VM_NFREELIST, then only
2340 * pages from the specified vm_phys freelist will be returned.
2341 */
2342static __always_inline vm_page_t
2343_vm_page_alloc_noobj_domain(int domain, const int freelist, int req)
2344{
2345	struct vm_domain *vmd;
2346	vm_page_t m;
2347	int flags;
2348
2349#define	VPAN_FLAGS	(VM_ALLOC_CLASS_MASK | VM_ALLOC_WAITFAIL |      \
2350			 VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK |		\
2351			 VM_ALLOC_NOBUSY | VM_ALLOC_WIRED |		\
2352			 VM_ALLOC_NODUMP | VM_ALLOC_ZERO | VM_ALLOC_COUNT_MASK)
2353	KASSERT((req & ~VPAN_FLAGS) == 0,
2354	    ("invalid request %#x", req));
2355
2356	flags = (req & VM_ALLOC_NODUMP) != 0 ? PG_NODUMP : 0;
2357	vmd = VM_DOMAIN(domain);
2358again:
2359	if (freelist == VM_NFREELIST &&
2360	    vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone != NULL) {
2361		m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone,
2362		    M_NOWAIT | M_NOVM);
2363		if (m != NULL) {
2364			flags |= PG_PCPU_CACHE;
2365			goto found;
2366		}
2367	}
2368
2369	if (vm_domain_allocate(vmd, req, 1)) {
2370		vm_domain_free_lock(vmd);
2371		if (freelist == VM_NFREELIST)
2372			m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DIRECT, 0);
2373		else
2374			m = vm_phys_alloc_freelist_pages(domain, freelist,
2375			    VM_FREEPOOL_DIRECT, 0);
2376		vm_domain_free_unlock(vmd);
2377		if (m == NULL) {
2378			vm_domain_freecnt_inc(vmd, 1);
2379#if VM_NRESERVLEVEL > 0
2380			if (freelist == VM_NFREELIST &&
2381			    vm_reserv_reclaim_inactive(domain))
2382				goto again;
2383#endif
2384		}
2385	}
2386	if (m == NULL) {
2387		if (vm_domain_alloc_fail(vmd, NULL, req))
2388			goto again;
2389		return (NULL);
2390	}
2391
2392found:
2393	vm_page_dequeue(m);
2394	vm_page_alloc_check(m);
2395
2396	/*
2397	 * Consumers should not rely on a useful default pindex value.
2398	 */
2399	m->pindex = 0xdeadc0dedeadc0de;
2400	m->flags = (m->flags & PG_ZERO) | flags;
2401	m->a.flags = 0;
2402	m->oflags = VPO_UNMANAGED;
2403	m->busy_lock = VPB_UNBUSIED;
2404	if ((req & VM_ALLOC_WIRED) != 0) {
2405		vm_wire_add(1);
2406		m->ref_count = 1;
2407	}
2408
2409	if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0)
2410		pmap_zero_page(m);
2411
2412	return (m);
2413}
2414
2415vm_page_t
2416vm_page_alloc_freelist(int freelist, int req)
2417{
2418	struct vm_domainset_iter di;
2419	vm_page_t m;
2420	int domain;
2421
2422	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
2423	do {
2424		m = vm_page_alloc_freelist_domain(domain, freelist, req);
2425		if (m != NULL)
2426			break;
2427	} while (vm_domainset_iter_page(&di, NULL, &domain) == 0);
2428
2429	return (m);
2430}
2431
2432vm_page_t
2433vm_page_alloc_freelist_domain(int domain, int freelist, int req)
2434{
2435	KASSERT(freelist >= 0 && freelist < VM_NFREELIST,
2436	    ("%s: invalid freelist %d", __func__, freelist));
2437
2438	return (_vm_page_alloc_noobj_domain(domain, freelist, req));
2439}
2440
2441vm_page_t
2442vm_page_alloc_noobj(int req)
2443{
2444	struct vm_domainset_iter di;
2445	vm_page_t m;
2446	int domain;
2447
2448	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
2449	do {
2450		m = vm_page_alloc_noobj_domain(domain, req);
2451		if (m != NULL)
2452			break;
2453	} while (vm_domainset_iter_page(&di, NULL, &domain) == 0);
2454
2455	return (m);
2456}
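
/*
 * Example (an illustrative sketch): a kernel consumer allocating a wired,
 * zero-filled page that will never be inserted into a VM object:
 *
 *	m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO |
 *	    VM_ALLOC_WAITOK);
 *
 * Unlike vm_page_alloc(), VM_ALLOC_ZERO here guarantees a zeroed page, and
 * the returned page is unmanaged and unbusied.
 */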
2457
2458vm_page_t
2459vm_page_alloc_noobj_domain(int domain, int req)
2460{
2461	return (_vm_page_alloc_noobj_domain(domain, VM_NFREELIST, req));
2462}
2463
2464vm_page_t
2465vm_page_alloc_noobj_contig(int req, u_long npages, vm_paddr_t low,
2466    vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
2467    vm_memattr_t memattr)
2468{
2469	struct vm_domainset_iter di;
2470	vm_page_t m;
2471	int domain;
2472
2473	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
2474	do {
2475		m = vm_page_alloc_noobj_contig_domain(domain, req, npages, low,
2476		    high, alignment, boundary, memattr);
2477		if (m != NULL)
2478			break;
2479	} while (vm_domainset_iter_page(&di, NULL, &domain) == 0);
2480
2481	return (m);
2482}
2483
2484vm_page_t
2485vm_page_alloc_noobj_contig_domain(int domain, int req, u_long npages,
2486    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
2487    vm_memattr_t memattr)
2488{
2489	vm_page_t m, m_ret;
2490	u_int flags;
2491
2492#define	VPANC_FLAGS	(VPAN_FLAGS | VM_ALLOC_NORECLAIM)
2493	KASSERT((req & ~VPANC_FLAGS) == 0,
2494	    ("invalid request %#x", req));
2495	KASSERT((req & (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM)) !=
2496	    (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM),
2497	    ("invalid request %#x", req));
2498	KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
2499	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
2500	    ("invalid request %#x", req));
	KASSERT(npages > 0, ("%s: npages is zero", __func__));
2502
2503	while ((m_ret = vm_page_find_contig_domain(domain, req, npages,
2504	    low, high, alignment, boundary)) == NULL) {
2505		if (!vm_domain_alloc_fail(VM_DOMAIN(domain), NULL, req))
2506			return (NULL);
2507	}
2508
2509	/*
2510	 * Initialize the pages.  Only the PG_ZERO flag is inherited.
2511	 */
2512	flags = PG_ZERO;
2513	if ((req & VM_ALLOC_NODUMP) != 0)
2514		flags |= PG_NODUMP;
2515	if ((req & VM_ALLOC_WIRED) != 0)
2516		vm_wire_add(npages);
2517	for (m = m_ret; m < &m_ret[npages]; m++) {
2518		vm_page_dequeue(m);
2519		vm_page_alloc_check(m);
2520
2521		/*
2522		 * Consumers should not rely on a useful default pindex value.
2523		 */
2524		m->pindex = 0xdeadc0dedeadc0de;
2525		m->a.flags = 0;
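		/* Force PG_NODUMP on if requested; inherit only PG_ZERO. */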
2526		m->flags = (m->flags | PG_NODUMP) & flags;
2527		m->busy_lock = VPB_UNBUSIED;
2528		if ((req & VM_ALLOC_WIRED) != 0)
2529			m->ref_count = 1;
2530		m->a.act_count = 0;
2531		m->oflags = VPO_UNMANAGED;
2532
2533		/*
2534		 * Zero the page before updating any mappings since the page is
2535		 * not yet shared with any devices which might require the
2536		 * non-default memory attribute.  pmap_page_set_memattr()
2537		 * flushes data caches before returning.
2538		 */
2539		if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0)
2540			pmap_zero_page(m);
2541		if (memattr != VM_MEMATTR_DEFAULT)
2542			pmap_page_set_memattr(m, memattr);
2543	}
2544	return (m_ret);
2545}
2546
2547/*
2548 * Check a page that has been freshly dequeued from a freelist.
2549 */
2550static void
2551vm_page_alloc_check(vm_page_t m)
2552{
2553
2554	KASSERT(m->object == NULL, ("page %p has object", m));
2555	KASSERT(m->a.queue == PQ_NONE &&
2556	    (m->a.flags & PGA_QUEUE_STATE_MASK) == 0,
2557	    ("page %p has unexpected queue %d, flags %#x",
2558	    m, m->a.queue, (m->a.flags & PGA_QUEUE_STATE_MASK)));
2559	KASSERT(m->ref_count == 0, ("page %p has references", m));
2560	KASSERT(vm_page_busy_freed(m), ("page %p is not freed", m));
2561	KASSERT(m->dirty == 0, ("page %p is dirty", m));
2562	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
2563	    ("page %p has unexpected memattr %d",
2564	    m, pmap_page_get_memattr(m)));
2565	KASSERT(vm_page_none_valid(m), ("free page %p is valid", m));
2566	pmap_vm_page_alloc_check(m);
2567}
2568
2569static int
2570vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags)
2571{
2572	struct vm_domain *vmd;
2573	struct vm_pgcache *pgcache;
2574	int i;
2575
2576	pgcache = arg;
2577	vmd = VM_DOMAIN(pgcache->domain);
2578
2579	/*
2580	 * The page daemon should avoid creating extra memory pressure since its
2581	 * main purpose is to replenish the store of free pages.
2582	 */
2583	if (vmd->vmd_severeset || curproc == pageproc ||
2584	    !_vm_domain_allocate(vmd, VM_ALLOC_NORMAL, cnt))
2585		return (0);
2586	domain = vmd->vmd_domain;
2587	vm_domain_free_lock(vmd);
2588	i = vm_phys_alloc_npages(domain, pgcache->pool, cnt,
2589	    (vm_page_t *)store);
2590	vm_domain_free_unlock(vmd);
2591	if (cnt != i)
2592		vm_domain_freecnt_inc(vmd, cnt - i);
2593
2594	return (i);
2595}
2596
2597static void
2598vm_page_zone_release(void *arg, void **store, int cnt)
2599{
2600	struct vm_domain *vmd;
2601	struct vm_pgcache *pgcache;
2602	vm_page_t m;
2603	int i;
2604
2605	pgcache = arg;
2606	vmd = VM_DOMAIN(pgcache->domain);
2607	vm_domain_free_lock(vmd);
2608	for (i = 0; i < cnt; i++) {
2609		m = (vm_page_t)store[i];
2610		vm_phys_free_pages(m, 0);
2611	}
2612	vm_domain_free_unlock(vmd);
2613	vm_domain_freecnt_inc(vmd, cnt);
2614}
2615
2616#define	VPSC_ANY	0	/* No restrictions. */
2617#define	VPSC_NORESERV	1	/* Skip reservations; implies VPSC_NOSUPER. */
2618#define	VPSC_NOSUPER	2	/* Skip superpages. */
2619
2620/*
2621 *	vm_page_scan_contig:
2622 *
2623 *	Scan vm_page_array[] between the specified entries "m_start" and
2624 *	"m_end" for a run of contiguous physical pages that satisfy the
2625 *	specified conditions, and return the lowest page in the run.  The
2626 *	specified "alignment" determines the alignment of the lowest physical
2627 *	page in the run.  If the specified "boundary" is non-zero, then the
2628 *	run of physical pages cannot span a physical address that is a
2629 *	multiple of "boundary".
2630 *
2631 *	"m_end" is never dereferenced, so it need not point to a vm_page
2632 *	structure within vm_page_array[].
2633 *
2634 *	"npages" must be greater than zero.  "m_start" and "m_end" must not
2635 *	span a hole (or discontiguity) in the physical address space.  Both
2636 *	"alignment" and "boundary" must be a power of two.
2637 */
2638static vm_page_t
2639vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
2640    u_long alignment, vm_paddr_t boundary, int options)
2641{
2642	vm_object_t object;
2643	vm_paddr_t pa;
2644	vm_page_t m, m_run;
2645#if VM_NRESERVLEVEL > 0
2646	int level;
2647#endif
2648	int m_inc, order, run_ext, run_len;
2649
2650	KASSERT(npages > 0, ("npages is 0"));
2651	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
2652	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
2653	m_run = NULL;
2654	run_len = 0;
2655	for (m = m_start; m < m_end && run_len < npages; m += m_inc) {
2656		KASSERT((m->flags & PG_MARKER) == 0,
2657		    ("page %p is PG_MARKER", m));
2658		KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->ref_count >= 1,
2659		    ("fictitious page %p has invalid ref count", m));
2660
2661		/*
2662		 * If the current page would be the start of a run, check its
2663		 * physical address against the end, alignment, and boundary
2664		 * conditions.  If it doesn't satisfy these conditions, either
2665		 * terminate the scan or advance to the next page that
2666		 * satisfies the failed condition.
2667		 */
2668		if (run_len == 0) {
2669			KASSERT(m_run == NULL, ("m_run != NULL"));
2670			if (m + npages > m_end)
2671				break;
2672			pa = VM_PAGE_TO_PHYS(m);
2673			if (!vm_addr_align_ok(pa, alignment)) {
2674				m_inc = atop(roundup2(pa, alignment) - pa);
2675				continue;
2676			}
2677			if (!vm_addr_bound_ok(pa, ptoa(npages), boundary)) {
2678				m_inc = atop(roundup2(pa, boundary) - pa);
2679				continue;
2680			}
2681		} else
2682			KASSERT(m_run != NULL, ("m_run == NULL"));
2683
2684retry:
2685		m_inc = 1;
2686		if (vm_page_wired(m))
2687			run_ext = 0;
2688#if VM_NRESERVLEVEL > 0
2689		else if ((level = vm_reserv_level(m)) >= 0 &&
2690		    (options & VPSC_NORESERV) != 0) {
2691			run_ext = 0;
2692			/* Advance to the end of the reservation. */
2693			pa = VM_PAGE_TO_PHYS(m);
2694			m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) -
2695			    pa);
2696		}
2697#endif
2698		else if ((object = atomic_load_ptr(&m->object)) != NULL) {
2699			/*
2700			 * The page is considered eligible for relocation if
2701			 * and only if it could be laundered or reclaimed by
2702			 * the page daemon.
2703			 */
2704			VM_OBJECT_RLOCK(object);
2705			if (object != m->object) {
2706				VM_OBJECT_RUNLOCK(object);
2707				goto retry;
2708			}
2709			/* Don't care: PG_NODUMP, PG_ZERO. */
2710			if ((object->flags & OBJ_SWAP) == 0 &&
2711			    object->type != OBJT_VNODE) {
2712				run_ext = 0;
2713#if VM_NRESERVLEVEL > 0
2714			} else if ((options & VPSC_NOSUPER) != 0 &&
2715			    (level = vm_reserv_level_iffullpop(m)) >= 0) {
2716				run_ext = 0;
2717				/* Advance to the end of the superpage. */
2718				pa = VM_PAGE_TO_PHYS(m);
2719				m_inc = atop(roundup2(pa + 1,
2720				    vm_reserv_size(level)) - pa);
2721#endif
2722			} else if (object->memattr == VM_MEMATTR_DEFAULT &&
2723			    vm_page_queue(m) != PQ_NONE && !vm_page_busied(m)) {
2724				/*
2725				 * The page is allocated but eligible for
2726				 * relocation.  Extend the current run by one
2727				 * page.
2728				 */
2729				KASSERT(pmap_page_get_memattr(m) ==
2730				    VM_MEMATTR_DEFAULT,
2731				    ("page %p has an unexpected memattr", m));
2732				KASSERT((m->oflags & (VPO_SWAPINPROG |
2733				    VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
2734				    ("page %p has unexpected oflags", m));
2735				/* Don't care: PGA_NOSYNC. */
2736				run_ext = 1;
2737			} else
2738				run_ext = 0;
2739			VM_OBJECT_RUNLOCK(object);
2740#if VM_NRESERVLEVEL > 0
2741		} else if (level >= 0) {
2742			/*
2743			 * The page is reserved but not yet allocated.  In
2744			 * other words, it is still free.  Extend the current
2745			 * run by one page.
2746			 */
2747			run_ext = 1;
2748#endif
2749		} else if ((order = m->order) < VM_NFREEORDER) {
2750			/*
2751			 * The page is enqueued in the physical memory
2752			 * allocator's free page queues.  Moreover, it is the
2753			 * first page in a power-of-two-sized run of
2754			 * contiguous free pages.  Add these pages to the end
2755			 * of the current run, and jump ahead.
2756			 */
2757			run_ext = 1 << order;
2758			m_inc = 1 << order;
2759		} else {
2760			/*
2761			 * Skip the page for one of the following reasons: (1)
2762			 * It is enqueued in the physical memory allocator's
2763			 * free page queues.  However, it is not the first
2764			 * page in a run of contiguous free pages.  (This case
2765			 * rarely occurs because the scan is performed in
2766			 * ascending order.) (2) It is not reserved, and it is
2767			 * transitioning from free to allocated.  (Conversely,
2768			 * the transition from allocated to free for managed
2769			 * pages is blocked by the page busy lock.) (3) It is
2770			 * allocated but not contained by an object and not
2771			 * wired, e.g., allocated by Xen's balloon driver.
2772			 */
2773			run_ext = 0;
2774		}
2775
2776		/*
2777		 * Extend or reset the current run of pages.
2778		 */
2779		if (run_ext > 0) {
2780			if (run_len == 0)
2781				m_run = m;
2782			run_len += run_ext;
2783		} else {
2784			if (run_len > 0) {
2785				m_run = NULL;
2786				run_len = 0;
2787			}
2788		}
2789	}
2790	if (run_len >= npages)
2791		return (m_run);
2792	return (NULL);
2793}
2794
2795/*
2796 *	vm_page_reclaim_run:
2797 *
2798 *	Try to relocate each of the allocated virtual pages within the
2799 *	specified run of physical pages to a new physical address.  Free the
2800 *	physical pages underlying the relocated virtual pages.  A virtual page
2801 *	is relocatable if and only if it could be laundered or reclaimed by
2802 *	the page daemon.  Whenever possible, a virtual page is relocated to a
2803 *	physical address above "high".
2804 *
2805 *	Returns 0 if every physical page within the run was already free or
2806 *	just freed by a successful relocation.  Otherwise, returns a non-zero
2807 *	value indicating why the last attempt to relocate a virtual page was
2808 *	unsuccessful.
2809 *
2810 *	"req_class" must be an allocation class.
2811 */
2812static int
2813vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
2814    vm_paddr_t high)
2815{
2816	struct vm_domain *vmd;
2817	struct spglist free;
2818	vm_object_t object;
2819	vm_paddr_t pa;
2820	vm_page_t m, m_end, m_new;
2821	int error, order, req;
2822
2823	KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class,
2824	    ("req_class is not an allocation class"));
2825	SLIST_INIT(&free);
2826	error = 0;
2827	m = m_run;
2828	m_end = m_run + npages;
2829	for (; error == 0 && m < m_end; m++) {
2830		KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
2831		    ("page %p is PG_FICTITIOUS or PG_MARKER", m));
2832
2833		/*
2834		 * Racily check for wirings.  Races are handled once the object
2835		 * lock is held and the page is unmapped.
2836		 */
2837		if (vm_page_wired(m))
2838			error = EBUSY;
2839		else if ((object = atomic_load_ptr(&m->object)) != NULL) {
2840			/*
2841			 * The page is relocated if and only if it could be
2842			 * laundered or reclaimed by the page daemon.
2843			 */
2844			VM_OBJECT_WLOCK(object);
2845			/* Don't care: PG_NODUMP, PG_ZERO. */
2846			if (m->object != object ||
2847			    ((object->flags & OBJ_SWAP) == 0 &&
2848			    object->type != OBJT_VNODE))
2849				error = EINVAL;
2850			else if (object->memattr != VM_MEMATTR_DEFAULT)
2851				error = EINVAL;
2852			else if (vm_page_queue(m) != PQ_NONE &&
2853			    vm_page_tryxbusy(m) != 0) {
2854				if (vm_page_wired(m)) {
2855					vm_page_xunbusy(m);
2856					error = EBUSY;
2857					goto unlock;
2858				}
2859				KASSERT(pmap_page_get_memattr(m) ==
2860				    VM_MEMATTR_DEFAULT,
2861				    ("page %p has an unexpected memattr", m));
2862				KASSERT(m->oflags == 0,
2863				    ("page %p has unexpected oflags", m));
2864				/* Don't care: PGA_NOSYNC. */
2865				if (!vm_page_none_valid(m)) {
2866					/*
2867					 * First, try to allocate a new page
2868					 * that is above "high".  Failing
2869					 * that, try to allocate a new page
2870					 * that is below "m_run".  Allocate
2871					 * the new page between the end of
2872					 * "m_run" and "high" only as a last
2873					 * resort.
2874					 */
2875					req = req_class;
2876					if ((m->flags & PG_NODUMP) != 0)
2877						req |= VM_ALLOC_NODUMP;
2878					if (trunc_page(high) !=
2879					    ~(vm_paddr_t)PAGE_MASK) {
2880						m_new =
2881						    vm_page_alloc_noobj_contig(
2882						    req, 1, round_page(high),
2883						    ~(vm_paddr_t)0, PAGE_SIZE,
2884						    0, VM_MEMATTR_DEFAULT);
2885					} else
2886						m_new = NULL;
2887					if (m_new == NULL) {
2888						pa = VM_PAGE_TO_PHYS(m_run);
2889						m_new =
2890						    vm_page_alloc_noobj_contig(
2891						    req, 1, 0, pa - 1,
2892						    PAGE_SIZE, 0,
2893						    VM_MEMATTR_DEFAULT);
2894					}
2895					if (m_new == NULL) {
2896						pa += ptoa(npages);
2897						m_new =
2898						    vm_page_alloc_noobj_contig(
2899						    req, 1, pa, high, PAGE_SIZE,
2900						    0, VM_MEMATTR_DEFAULT);
2901					}
2902					if (m_new == NULL) {
2903						vm_page_xunbusy(m);
2904						error = ENOMEM;
2905						goto unlock;
2906					}
2907
2908					/*
2909					 * Unmap the page and check for new
2910					 * wirings that may have been acquired
2911					 * through a pmap lookup.
2912					 */
2913					if (object->ref_count != 0 &&
2914					    !vm_page_try_remove_all(m)) {
2915						vm_page_xunbusy(m);
2916						vm_page_free(m_new);
2917						error = EBUSY;
2918						goto unlock;
2919					}
2920
2921					/*
2922					 * Replace "m" with the new page.  For
2923					 * vm_page_replace(), "m" must be busy
2924					 * and dequeued.  Finally, change "m"
2925					 * as if vm_page_free() was called.
2926					 */
2927					m_new->a.flags = m->a.flags &
2928					    ~PGA_QUEUE_STATE_MASK;
2929					KASSERT(m_new->oflags == VPO_UNMANAGED,
2930					    ("page %p is managed", m_new));
2931					m_new->oflags = 0;
2932					pmap_copy_page(m, m_new);
2933					m_new->valid = m->valid;
2934					m_new->dirty = m->dirty;
2935					m->flags &= ~PG_ZERO;
2936					vm_page_dequeue(m);
2937					if (vm_page_replace_hold(m_new, object,
2938					    m->pindex, m) &&
2939					    vm_page_free_prep(m))
2940						SLIST_INSERT_HEAD(&free, m,
2941						    plinks.s.ss);
2942
2943					/*
2944					 * The new page must be deactivated
2945					 * before the object is unlocked.
2946					 */
2947					vm_page_deactivate(m_new);
2948				} else {
2949					m->flags &= ~PG_ZERO;
2950					vm_page_dequeue(m);
2951					if (vm_page_free_prep(m))
2952						SLIST_INSERT_HEAD(&free, m,
2953						    plinks.s.ss);
2954					KASSERT(m->dirty == 0,
2955					    ("page %p is dirty", m));
2956				}
2957			} else
2958				error = EBUSY;
2959unlock:
2960			VM_OBJECT_WUNLOCK(object);
2961		} else {
2962			MPASS(vm_page_domain(m) == domain);
2963			vmd = VM_DOMAIN(domain);
2964			vm_domain_free_lock(vmd);
2965			order = m->order;
2966			if (order < VM_NFREEORDER) {
2967				/*
2968				 * The page is enqueued in the physical memory
2969				 * allocator's free page queues.  Moreover, it
2970				 * is the first page in a power-of-two-sized
2971				 * run of contiguous free pages.  Jump ahead
2972				 * to the last page within that run, and
2973				 * continue from there.
2974				 */
2975				m += (1 << order) - 1;
2976			}
2977#if VM_NRESERVLEVEL > 0
2978			else if (vm_reserv_is_page_free(m))
2979				order = 0;
2980#endif
2981			vm_domain_free_unlock(vmd);
2982			if (order == VM_NFREEORDER)
2983				error = EINVAL;
2984		}
2985	}
2986	if ((m = SLIST_FIRST(&free)) != NULL) {
2987		int cnt;
2988
2989		vmd = VM_DOMAIN(domain);
2990		cnt = 0;
2991		vm_domain_free_lock(vmd);
2992		do {
2993			MPASS(vm_page_domain(m) == domain);
2994			SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2995			vm_phys_free_pages(m, 0);
2996			cnt++;
2997		} while ((m = SLIST_FIRST(&free)) != NULL);
2998		vm_domain_free_unlock(vmd);
2999		vm_domain_freecnt_inc(vmd, cnt);
3000	}
3001	return (error);
3002}
3003
3004#define	NRUNS	16
3005
3006#define	RUN_INDEX(count, nruns)	((count) % (nruns))
3007
3008#define	MIN_RECLAIM	8
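
/*
 * Because runs are recorded in ascending order and indexed modulo "nruns",
 * the run array below behaves as a ring buffer that retains only the last,
 * i.e., highest, "nruns" runs found by a scan.
 */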
3009
3010/*
3011 *	vm_page_reclaim_contig:
3012 *
3013 *	Reclaim allocated, contiguous physical memory satisfying the specified
3014 *	conditions by relocating the virtual pages using that physical memory.
3015 *	Returns 0 if reclamation is successful, ERANGE if the specified domain
3016 *	can't possibly satisfy the reclamation request, or ENOMEM if not
3017 *	currently able to reclaim the requested number of pages.  Since
3018 *	relocation requires the allocation of physical pages, reclamation may
3019 *	fail with ENOMEM due to a shortage of free pages.  When reclamation
3020 *	fails in this manner, callers are expected to perform vm_wait() before
3021 *	retrying a failed allocation operation, e.g., vm_page_alloc_contig().
3022 *
3023 *	The caller must always specify an allocation class through "req".
3024 *
3025 *	allocation classes:
3026 *	VM_ALLOC_NORMAL		normal process request
3027 *	VM_ALLOC_SYSTEM		system *really* needs a page
3028 *	VM_ALLOC_INTERRUPT	interrupt time request
3029 *
3030 *	The optional allocation flags are ignored.
3031 *
3032 *	"npages" must be greater than zero.  Both "alignment" and "boundary"
3033 *	must be a power of two.
3034 */
3035int
3036vm_page_reclaim_contig_domain_ext(int domain, int req, u_long npages,
3037    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
3038    int desired_runs)
3039{
3040	struct vm_domain *vmd;
3041	vm_page_t bounds[2], m_run, _m_runs[NRUNS], *m_runs;
3042	u_long count, minalign, reclaimed;
3043	int error, i, min_reclaim, nruns, options, req_class;
3044	int segind, start_segind;
3045	int ret;
3046
3047	KASSERT(npages > 0, ("npages is 0"));
3048	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
3049	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
3050
3051	ret = ENOMEM;
3052
3053	/*
3054	 * If the caller wants to reclaim multiple runs, try to allocate
3055	 * space to store the runs.  If that fails, fall back to the old
3056	 * behavior of just reclaiming MIN_RECLAIM pages.
3057	 */
3058	if (desired_runs > 1)
3059		m_runs = malloc((NRUNS + desired_runs) * sizeof(*m_runs),
3060		    M_TEMP, M_NOWAIT);
3061	else
3062		m_runs = NULL;
3063
3064	if (m_runs == NULL) {
3065		m_runs = _m_runs;
3066		nruns = NRUNS;
3067	} else {
3068		nruns = NRUNS + desired_runs - 1;
3069	}
3070	min_reclaim = MAX(desired_runs * npages, MIN_RECLAIM);
3071
3072	/*
3073	 * The caller will attempt an allocation after some runs have been
3074	 * reclaimed and added to the vm_phys buddy lists.  Due to limitations
3075	 * of vm_phys_alloc_contig(), round up the requested length to the next
3076	 * power of two or maximum chunk size, and ensure that each run is
3077	 * suitably aligned.
3078	 */
3079	minalign = 1ul << imin(flsl(npages - 1), VM_NFREEORDER - 1);
3080	npages = roundup2(npages, minalign);
3081	if (alignment < ptoa(minalign))
3082		alignment = ptoa(minalign);
3083
3084	/*
3085	 * The page daemon is allowed to dig deeper into the free page list.
3086	 */
3087	req_class = req & VM_ALLOC_CLASS_MASK;
3088	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
3089		req_class = VM_ALLOC_SYSTEM;
3090
3091	start_segind = vm_phys_lookup_segind(low);
3092
3093	/*
3094	 * Return if the number of free pages cannot satisfy the requested
3095	 * allocation.
3096	 */
3097	vmd = VM_DOMAIN(domain);
3098	count = vmd->vmd_free_count;
3099	if (count < npages + vmd->vmd_free_reserved || (count < npages +
3100	    vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) ||
3101	    (count < npages && req_class == VM_ALLOC_INTERRUPT))
3102		goto done;
3103
3104	/*
3105	 * Scan up to three times, relaxing the restrictions ("options") on
3106	 * the reclamation of reservations and superpages each time.
3107	 */
3108	for (options = VPSC_NORESERV;;) {
3109		bool phys_range_exists = false;
3110
3111		/*
3112		 * Find the highest runs that satisfy the given constraints
3113		 * and restrictions, and record them in "m_runs".
3114		 */
3115		count = 0;
3116		segind = start_segind;
3117		while ((segind = vm_phys_find_range(bounds, segind, domain,
3118		    npages, low, high)) != -1) {
3119			phys_range_exists = true;
			while ((m_run = vm_page_scan_contig(npages, bounds[0],
			    bounds[1], alignment, boundary, options)) != NULL) {
3122				bounds[0] = m_run + npages;
3123				m_runs[RUN_INDEX(count, nruns)] = m_run;
3124				count++;
3125			}
3126			segind++;
3127		}
3128
3129		if (!phys_range_exists) {
3130			ret = ERANGE;
3131			goto done;
3132		}
3133
3134		/*
3135		 * Reclaim the highest runs in LIFO (descending) order until
3136		 * the number of reclaimed pages, "reclaimed", is at least
3137		 * "min_reclaim".  Reset "reclaimed" each time because each
3138		 * reclamation is idempotent, and runs will (likely) recur
3139		 * from one scan to the next as restrictions are relaxed.
3140		 */
3141		reclaimed = 0;
3142		for (i = 0; count > 0 && i < nruns; i++) {
3143			count--;
3144			m_run = m_runs[RUN_INDEX(count, nruns)];
3145			error = vm_page_reclaim_run(req_class, domain, npages,
3146			    m_run, high);
3147			if (error == 0) {
3148				reclaimed += npages;
3149				if (reclaimed >= min_reclaim) {
3150					ret = 0;
3151					goto done;
3152				}
3153			}
3154		}
3155
3156		/*
3157		 * Either relax the restrictions on the next scan or return if
3158		 * the last scan had no restrictions.
3159		 */
3160		if (options == VPSC_NORESERV)
3161			options = VPSC_NOSUPER;
3162		else if (options == VPSC_NOSUPER)
3163			options = VPSC_ANY;
3164		else if (options == VPSC_ANY) {
3165			if (reclaimed != 0)
3166				ret = 0;
3167			goto done;
3168		}
3169	}
3170done:
3171	if (m_runs != _m_runs)
3172		free(m_runs, M_TEMP);
3173	return (ret);
3174}
3175
3176int
3177vm_page_reclaim_contig_domain(int domain, int req, u_long npages,
3178    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
3179{
3180	return (vm_page_reclaim_contig_domain_ext(domain, req, npages, low, high,
3181	    alignment, boundary, 1));
3182}
3183
3184int
3185vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
3186    u_long alignment, vm_paddr_t boundary)
3187{
3188	struct vm_domainset_iter di;
3189	int domain, ret, status;
3190
3191	ret = ERANGE;
3192
3193	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
3194	do {
3195		status = vm_page_reclaim_contig_domain(domain, req, npages, low,
3196		    high, alignment, boundary);
3197		if (status == 0)
3198			return (0);
3199		else if (status == ERANGE)
3200			vm_domainset_iter_ignore(&di, domain);
3201		else {
3202			KASSERT(status == ENOMEM, ("Unrecognized error %d "
3203			    "from vm_page_reclaim_contig_domain()", status));
3204			ret = ENOMEM;
3205		}
3206	} while (vm_domainset_iter_page(&di, NULL, &domain) == 0);
3207
3208	return (ret);
3209}
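
/*
 * Example (an illustrative sketch of the retry protocol described above;
 * locking is elided and the names are hypothetical):
 *
 *	for (;;) {
 *		m = vm_page_alloc_contig(obj, pidx, req, npages, low, high,
 *		    alignment, boundary, VM_MEMATTR_DEFAULT);
 *		if (m != NULL)
 *			break;
 *		error = vm_page_reclaim_contig(req, npages, low, high,
 *		    alignment, boundary);
 *		if (error == ERANGE)
 *			break;
 *		if (error == ENOMEM)
 *			vm_wait(obj);
 *	}
 */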
3210
3211/*
3212 * Set the domain in the appropriate page level domainset.
3213 */
3214void
3215vm_domain_set(struct vm_domain *vmd)
3216{
3217
3218	mtx_lock(&vm_domainset_lock);
3219	if (!vmd->vmd_minset && vm_paging_min(vmd)) {
3220		vmd->vmd_minset = 1;
3221		DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains);
3222	}
3223	if (!vmd->vmd_severeset && vm_paging_severe(vmd)) {
3224		vmd->vmd_severeset = 1;
3225		DOMAINSET_SET(vmd->vmd_domain, &vm_severe_domains);
3226	}
3227	mtx_unlock(&vm_domainset_lock);
3228}
3229
3230/*
3231 * Clear the domain from the appropriate page level domainset.
3232 */
3233void
3234vm_domain_clear(struct vm_domain *vmd)
3235{
3236
3237	mtx_lock(&vm_domainset_lock);
3238	if (vmd->vmd_minset && !vm_paging_min(vmd)) {
3239		vmd->vmd_minset = 0;
3240		DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains);
3241		if (vm_min_waiters != 0) {
3242			vm_min_waiters = 0;
3243			wakeup(&vm_min_domains);
3244		}
3245	}
3246	if (vmd->vmd_severeset && !vm_paging_severe(vmd)) {
3247		vmd->vmd_severeset = 0;
3248		DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains);
3249		if (vm_severe_waiters != 0) {
3250			vm_severe_waiters = 0;
3251			wakeup(&vm_severe_domains);
3252		}
3253	}
3254
3255	/*
3256	 * If pageout daemon needs pages, then tell it that there are
3257	 * some free.
3258	 */
3259	if (vmd->vmd_pageout_pages_needed &&
3260	    vmd->vmd_free_count >= vmd->vmd_pageout_free_min) {
3261		wakeup(&vmd->vmd_pageout_pages_needed);
3262		vmd->vmd_pageout_pages_needed = 0;
3263	}
3264
3265	/* See comments in vm_wait_doms(). */
3266	if (vm_pageproc_waiters) {
3267		vm_pageproc_waiters = 0;
3268		wakeup(&vm_pageproc_waiters);
3269	}
3270	mtx_unlock(&vm_domainset_lock);
3271}
3272
3273/*
3274 * Wait for free pages to exceed the min threshold globally.
3275 */
3276void
3277vm_wait_min(void)
3278{
3279
3280	mtx_lock(&vm_domainset_lock);
3281	while (vm_page_count_min()) {
3282		vm_min_waiters++;
3283		msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0);
3284	}
3285	mtx_unlock(&vm_domainset_lock);
3286}
3287
3288/*
3289 * Wait for free pages to exceed the severe threshold globally.
3290 */
3291void
3292vm_wait_severe(void)
3293{
3294
3295	mtx_lock(&vm_domainset_lock);
3296	while (vm_page_count_severe()) {
3297		vm_severe_waiters++;
3298		msleep(&vm_severe_domains, &vm_domainset_lock, PVM,
3299		    "vmwait", 0);
3300	}
3301	mtx_unlock(&vm_domainset_lock);
3302}
3303
3304u_int
3305vm_wait_count(void)
3306{
3307
3308	return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters);
3309}
3310
3311int
3312vm_wait_doms(const domainset_t *wdoms, int mflags)
3313{
3314	int error;
3315
3316	error = 0;
3317
3318	/*
	 * We use racy wakeup synchronization to avoid expensive global
3320	 * locking for the pageproc when sleeping with a non-specific vm_wait.
3321	 * To handle this, we only sleep for one tick in this instance.  It
3322	 * is expected that most allocations for the pageproc will come from
3323	 * kmem or vm_page_grab* which will use the more specific and
3324	 * race-free vm_wait_domain().
3325	 */
3326	if (curproc == pageproc) {
3327		mtx_lock(&vm_domainset_lock);
3328		vm_pageproc_waiters++;
3329		error = msleep(&vm_pageproc_waiters, &vm_domainset_lock,
3330		    PVM | PDROP | mflags, "pageprocwait", 1);
3331	} else {
3332		/*
3333		 * XXX Ideally we would wait only until the allocation could
3334		 * be satisfied.  This condition can cause new allocators to
3335		 * consume all freed pages while old allocators wait.
3336		 */
3337		mtx_lock(&vm_domainset_lock);
3338		if (vm_page_count_min_set(wdoms)) {
3339			if (pageproc == NULL)
3340				panic("vm_wait in early boot");
3341			vm_min_waiters++;
3342			error = msleep(&vm_min_domains, &vm_domainset_lock,
3343			    PVM | PDROP | mflags, "vmwait", 0);
3344		} else
3345			mtx_unlock(&vm_domainset_lock);
3346	}
3347	return (error);
3348}
3349
3350/*
3351 *	vm_wait_domain:
3352 *
3353 *	Sleep until free pages are available for allocation.
3354 *	- Called in various places after failed memory allocations.
3355 */
3356void
3357vm_wait_domain(int domain)
3358{
3359	struct vm_domain *vmd;
3360	domainset_t wdom;
3361
3362	vmd = VM_DOMAIN(domain);
3363	vm_domain_free_assert_unlocked(vmd);
3364
3365	if (curproc == pageproc) {
3366		mtx_lock(&vm_domainset_lock);
3367		if (vmd->vmd_free_count < vmd->vmd_pageout_free_min) {
3368			vmd->vmd_pageout_pages_needed = 1;
3369			msleep(&vmd->vmd_pageout_pages_needed,
3370			    &vm_domainset_lock, PDROP | PSWP, "VMWait", 0);
3371		} else
3372			mtx_unlock(&vm_domainset_lock);
3373	} else {
3374		DOMAINSET_ZERO(&wdom);
3375		DOMAINSET_SET(vmd->vmd_domain, &wdom);
3376		vm_wait_doms(&wdom, 0);
3377	}
3378}
3379
3380static int
3381vm_wait_flags(vm_object_t obj, int mflags)
3382{
3383	struct domainset *d;
3384
3385	d = NULL;
3386
3387	/*
3388	 * Carefully fetch pointers only once: the struct domainset
	 * itself is immutable but the pointer might change.
3390	 */
3391	if (obj != NULL)
3392		d = obj->domain.dr_policy;
3393	if (d == NULL)
3394		d = curthread->td_domain.dr_policy;
3395
3396	return (vm_wait_doms(&d->ds_mask, mflags));
3397}
3398
3399/*
3400 *	vm_wait:
3401 *
3402 *	Sleep until free pages are available for allocation in the
3403 *	affinity domains of the obj.  If obj is NULL, the domain set
3404 *	for the calling thread is used.
3405 *	Called in various places after failed memory allocations.
3406 */
3407void
3408vm_wait(vm_object_t obj)
3409{
3410	(void)vm_wait_flags(obj, 0);
3411}
3412
3413int
3414vm_wait_intr(vm_object_t obj)
3415{
3416	return (vm_wait_flags(obj, PCATCH));
3417}
3418
/*
 *	vm_domain_alloc_fail:
 *
 *	Called when a page allocation function fails.  Informs the
 *	pagedaemon and performs the requested wait.  The domain free lock
 *	must be unlocked on entry; the object lock, if any, is held on
 *	entry and on return, but is dropped around the wait.  Returns
 *	EAGAIN when a retry is necessary.
 */
3429static int
3430vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req)
3431{
3432
3433	vm_domain_free_assert_unlocked(vmd);
3434
3435	atomic_add_int(&vmd->vmd_pageout_deficit,
3436	    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
3437	if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) {
3438		if (object != NULL)
3439			VM_OBJECT_WUNLOCK(object);
3440		vm_wait_domain(vmd->vmd_domain);
3441		if (object != NULL)
3442			VM_OBJECT_WLOCK(object);
3443		if (req & VM_ALLOC_WAITOK)
3444			return (EAGAIN);
3445	}
3446
3447	return (0);
3448}
3449
3450/*
3451 *	vm_waitpfault:
3452 *
3453 *	Sleep until free pages are available for allocation.
3454 *	- Called only in vm_fault so that processes page faulting
3455 *	  can be easily tracked.
3456 *	- Sleeps at a lower priority than vm_wait() so that vm_wait()ing
3457 *	  processes will be able to grab memory first.  Do not change
3458 *	  this balance without careful testing first.
3459 */
3460void
3461vm_waitpfault(struct domainset *dset, int timo)
3462{
3463
3464	/*
3465	 * XXX Ideally we would wait only until the allocation could
3466	 * be satisfied.  This condition can cause new allocators to
3467	 * consume all freed pages while old allocators wait.
3468	 */
3469	mtx_lock(&vm_domainset_lock);
3470	if (vm_page_count_min_set(&dset->ds_mask)) {
3471		vm_min_waiters++;
3472		msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP,
3473		    "pfault", timo);
3474	} else
3475		mtx_unlock(&vm_domainset_lock);
3476}
3477
3478static struct vm_pagequeue *
3479_vm_page_pagequeue(vm_page_t m, uint8_t queue)
3480{
3481
3482	return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]);
3483}
3484
3485#ifdef INVARIANTS
3486static struct vm_pagequeue *
3487vm_page_pagequeue(vm_page_t m)
3488{
3489
3490	return (_vm_page_pagequeue(m, vm_page_astate_load(m).queue));
3491}
3492#endif
3493
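/*
 * Attempt to atomically update the page's atomic ("astate") queue state
 * from "*old" to "new", retrying after spurious compare-and-set failures,
 * i.e., those in which the observed state still matches the caller's
 * snapshot.  Returns false, with "*old" updated to the current state, once
 * the state is seen to have genuinely changed.
 */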
3494static __always_inline bool
3495vm_page_pqstate_fcmpset(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new)
3496{
3497	vm_page_astate_t tmp;
3498
3499	tmp = *old;
3500	do {
3501		if (__predict_true(vm_page_astate_fcmpset(m, old, new)))
3502			return (true);
3503		counter_u64_add(pqstate_commit_retries, 1);
3504	} while (old->_bits == tmp._bits);
3505
3506	return (false);
3507}
3508
3509/*
3510 * Do the work of committing a queue state update that moves the page out of
3511 * its current queue.
3512 */
3513static bool
3514_vm_page_pqstate_commit_dequeue(struct vm_pagequeue *pq, vm_page_t m,
3515    vm_page_astate_t *old, vm_page_astate_t new)
3516{
3517	vm_page_t next;
3518
3519	vm_pagequeue_assert_locked(pq);
3520	KASSERT(vm_page_pagequeue(m) == pq,
3521	    ("%s: queue %p does not match page %p", __func__, pq, m));
3522	KASSERT(old->queue != PQ_NONE && new.queue != old->queue,
3523	    ("%s: invalid queue indices %d %d",
3524	    __func__, old->queue, new.queue));
3525
3526	/*
3527	 * Once the queue index of the page changes there is nothing
3528	 * synchronizing with further updates to the page's physical
3529	 * queue state.  Therefore we must speculatively remove the page
3530	 * from the queue now and be prepared to roll back if the queue
3531	 * state update fails.  If the page is not physically enqueued then
3532	 * we just update its queue index.
3533	 */
3534	if ((old->flags & PGA_ENQUEUED) != 0) {
3535		new.flags &= ~PGA_ENQUEUED;
3536		next = TAILQ_NEXT(m, plinks.q);
3537		TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
3538		vm_pagequeue_cnt_dec(pq);
3539		if (!vm_page_pqstate_fcmpset(m, old, new)) {
3540			if (next == NULL)
3541				TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
3542			else
3543				TAILQ_INSERT_BEFORE(next, m, plinks.q);
3544			vm_pagequeue_cnt_inc(pq);
3545			return (false);
3546		} else {
3547			return (true);
3548		}
3549	} else {
3550		return (vm_page_pqstate_fcmpset(m, old, new));
3551	}
3552}
3553
3554static bool
3555vm_page_pqstate_commit_dequeue(vm_page_t m, vm_page_astate_t *old,
3556    vm_page_astate_t new)
3557{
3558	struct vm_pagequeue *pq;
3559	vm_page_astate_t as;
3560	bool ret;
3561
3562	pq = _vm_page_pagequeue(m, old->queue);
3563
3564	/*
3565	 * The queue field and PGA_ENQUEUED flag are stable only so long as the
3566	 * corresponding page queue lock is held.
3567	 */
3568	vm_pagequeue_lock(pq);
3569	as = vm_page_astate_load(m);
3570	if (__predict_false(as._bits != old->_bits)) {
3571		*old = as;
3572		ret = false;
3573	} else {
3574		ret = _vm_page_pqstate_commit_dequeue(pq, m, old, new);
3575	}
3576	vm_pagequeue_unlock(pq);
3577	return (ret);
3578}
3579
3580/*
3581 * Commit a queue state update that enqueues or requeues a page.
3582 */
3583static bool
3584_vm_page_pqstate_commit_requeue(struct vm_pagequeue *pq, vm_page_t m,
3585    vm_page_astate_t *old, vm_page_astate_t new)
3586{
3587	struct vm_domain *vmd;
3588
3589	vm_pagequeue_assert_locked(pq);
3590	KASSERT(old->queue != PQ_NONE && new.queue == old->queue,
3591	    ("%s: invalid queue indices %d %d",
3592	    __func__, old->queue, new.queue));
3593
3594	new.flags |= PGA_ENQUEUED;
3595	if (!vm_page_pqstate_fcmpset(m, old, new))
3596		return (false);
3597
3598	if ((old->flags & PGA_ENQUEUED) != 0)
3599		TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
3600	else
3601		vm_pagequeue_cnt_inc(pq);
3602
3603	/*
3604	 * Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE.  In particular, if
3605	 * both flags are set in close succession, only PGA_REQUEUE_HEAD will be
3606	 * applied, even if it was set first.
3607	 */
3608	if ((old->flags & PGA_REQUEUE_HEAD) != 0) {
3609		vmd = vm_pagequeue_domain(m);
3610		KASSERT(pq == &vmd->vmd_pagequeues[PQ_INACTIVE],
3611		    ("%s: invalid page queue for page %p", __func__, m));
3612		TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q);
3613	} else {
3614		TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
3615	}
3616	return (true);
3617}
3618
3619/*
3620 * Commit a queue state update that encodes a request for a deferred queue
3621 * operation.
3622 */
3623static bool
3624vm_page_pqstate_commit_request(vm_page_t m, vm_page_astate_t *old,
3625    vm_page_astate_t new)
3626{
3627
3628	KASSERT(old->queue == new.queue || new.queue != PQ_NONE,
3629	    ("%s: invalid state, queue %d flags %x",
3630	    __func__, new.queue, new.flags));
3631
3632	if (old->_bits != new._bits &&
3633	    !vm_page_pqstate_fcmpset(m, old, new))
3634		return (false);
3635	vm_page_pqbatch_submit(m, new.queue);
3636	return (true);
3637}
3638
3639/*
3640 * A generic queue state update function.  This handles more cases than the
3641 * specialized functions above.
3642 */
3643bool
3644vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new)
3645{
3646
3647	if (old->_bits == new._bits)
3648		return (true);
3649
3650	if (old->queue != PQ_NONE && new.queue != old->queue) {
3651		if (!vm_page_pqstate_commit_dequeue(m, old, new))
3652			return (false);
3653		if (new.queue != PQ_NONE)
3654			vm_page_pqbatch_submit(m, new.queue);
3655	} else {
3656		if (!vm_page_pqstate_fcmpset(m, old, new))
3657			return (false);
3658		if (new.queue != PQ_NONE &&
3659		    ((new.flags & ~old->flags) & PGA_QUEUE_OP_MASK) != 0)
3660			vm_page_pqbatch_submit(m, new.queue);
3661	}
3662	return (true);
3663}
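
/*
 * Example (an illustrative sketch): callers drive vm_page_pqstate_commit()
 * from a load-modify-commit retry loop, as in the pattern below; compare
 * vm_page_dequeue_deferred() later in this file:
 *
 *	old = vm_page_astate_load(m);
 *	do {
 *		if (old.queue == PQ_NONE)
 *			break;
 *		new = old;
 *		new.flags |= PGA_REQUEUE;
 *	} while (!vm_page_pqstate_commit(m, &old, new));
 */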
3664
3665/*
3666 * Apply deferred queue state updates to a page.
3667 */
3668static inline void
3669vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m, uint8_t queue)
3670{
3671	vm_page_astate_t new, old;
3672
3673	CRITICAL_ASSERT(curthread);
3674	vm_pagequeue_assert_locked(pq);
3675	KASSERT(queue < PQ_COUNT,
3676	    ("%s: invalid queue index %d", __func__, queue));
3677	KASSERT(pq == _vm_page_pagequeue(m, queue),
3678	    ("%s: page %p does not belong to queue %p", __func__, m, pq));
3679
3680	for (old = vm_page_astate_load(m);;) {
3681		if (__predict_false(old.queue != queue ||
3682		    (old.flags & PGA_QUEUE_OP_MASK) == 0)) {
3683			counter_u64_add(queue_nops, 1);
3684			break;
3685		}
3686		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3687		    ("%s: page %p is unmanaged", __func__, m));
3688
3689		new = old;
3690		if ((old.flags & PGA_DEQUEUE) != 0) {
3691			new.flags &= ~PGA_QUEUE_OP_MASK;
3692			new.queue = PQ_NONE;
3693			if (__predict_true(_vm_page_pqstate_commit_dequeue(pq,
3694			    m, &old, new))) {
3695				counter_u64_add(queue_ops, 1);
3696				break;
3697			}
3698		} else {
3699			new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD);
3700			if (__predict_true(_vm_page_pqstate_commit_requeue(pq,
3701			    m, &old, new))) {
3702				counter_u64_add(queue_ops, 1);
3703				break;
3704			}
3705		}
3706	}
3707}
3708
3709static void
3710vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq,
3711    uint8_t queue)
3712{
3713	int i;
3714
3715	for (i = 0; i < bq->bq_cnt; i++)
3716		vm_pqbatch_process_page(pq, bq->bq_pa[i], queue);
3717	vm_batchqueue_init(bq);
3718}
3719
3720/*
3721 *	vm_page_pqbatch_submit:		[ internal use only ]
3722 *
3723 *	Enqueue a page in the specified page queue's batched work queue.
3724 *	The caller must have encoded the requested operation in the page
3725 *	structure's a.flags field.
3726 */
3727void
3728vm_page_pqbatch_submit(vm_page_t m, uint8_t queue)
3729{
3730	struct vm_batchqueue *bq;
3731	struct vm_pagequeue *pq;
3732	int domain, slots_remaining;
3733
3734	KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue));
3735
3736	domain = vm_page_domain(m);
3737	critical_enter();
3738	bq = DPCPU_PTR(pqbatch[domain][queue]);
3739	slots_remaining = vm_batchqueue_insert(bq, m);
	if (slots_remaining > (VM_BATCHQUEUE_SIZE >> 1)) {
		/* Keep building the batch queue. */
		critical_exit();
		return;
	} else if (slots_remaining > 0) {
		/* Try to process the batch queue if we can get the lock. */
3746		pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue];
3747		if (vm_pagequeue_trylock(pq)) {
3748			vm_pqbatch_process(pq, bq, queue);
3749			vm_pagequeue_unlock(pq);
3750		}
3751		critical_exit();
3752		return;
3753	}
3754	critical_exit();
3755
	/* If we make it here, the batch queue is full, so wait for the lock. */
3757
3758	pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue];
3759	vm_pagequeue_lock(pq);
3760	critical_enter();
3761	bq = DPCPU_PTR(pqbatch[domain][queue]);
3762	vm_pqbatch_process(pq, bq, queue);
3763	vm_pqbatch_process_page(pq, m, queue);
3764	vm_pagequeue_unlock(pq);
3765	critical_exit();
3766}
3767
3768/*
3769 *	vm_page_pqbatch_drain:		[ internal use only ]
3770 *
3771 *	Force all per-CPU page queue batch queues to be drained.  This is
3772 *	intended for use in severe memory shortages, to ensure that pages
3773 *	do not remain stuck in the batch queues.
3774 */
3775void
3776vm_page_pqbatch_drain(void)
3777{
3778	struct thread *td;
3779	struct vm_domain *vmd;
3780	struct vm_pagequeue *pq;
3781	int cpu, domain, queue;
3782
3783	td = curthread;
3784	CPU_FOREACH(cpu) {
3785		thread_lock(td);
3786		sched_bind(td, cpu);
3787		thread_unlock(td);
3788
3789		for (domain = 0; domain < vm_ndomains; domain++) {
3790			vmd = VM_DOMAIN(domain);
3791			for (queue = 0; queue < PQ_COUNT; queue++) {
3792				pq = &vmd->vmd_pagequeues[queue];
3793				vm_pagequeue_lock(pq);
3794				critical_enter();
3795				vm_pqbatch_process(pq,
3796				    DPCPU_PTR(pqbatch[domain][queue]), queue);
3797				critical_exit();
3798				vm_pagequeue_unlock(pq);
3799			}
3800		}
3801	}
3802	thread_lock(td);
3803	sched_unbind(td);
3804	thread_unlock(td);
3805}
3806
3807/*
3808 *	vm_page_dequeue_deferred:	[ internal use only ]
3809 *
3810 *	Request removal of the given page from its current page
3811 *	queue.  Physical removal from the queue may be deferred
3812 *	indefinitely.
3813 */
3814void
3815vm_page_dequeue_deferred(vm_page_t m)
3816{
3817	vm_page_astate_t new, old;
3818
3819	old = vm_page_astate_load(m);
3820	do {
3821		if (old.queue == PQ_NONE) {
3822			KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0,
3823			    ("%s: page %p has unexpected queue state",
3824			    __func__, m));
3825			break;
3826		}
3827		new = old;
3828		new.flags |= PGA_DEQUEUE;
3829	} while (!vm_page_pqstate_commit_request(m, &old, new));
3830}
3831
3832/*
3833 *	vm_page_dequeue:
3834 *
3835 *	Remove the page from whichever page queue it's in, if any, before
3836 *	returning.
3837 */
3838void
3839vm_page_dequeue(vm_page_t m)
3840{
3841	vm_page_astate_t new, old;
3842
3843	old = vm_page_astate_load(m);
3844	do {
3845		if (old.queue == PQ_NONE) {
3846			KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0,
3847			    ("%s: page %p has unexpected queue state",
3848			    __func__, m));
3849			break;
3850		}
3851		new = old;
3852		new.flags &= ~PGA_QUEUE_OP_MASK;
3853		new.queue = PQ_NONE;
	} while (!vm_page_pqstate_commit_dequeue(m, &old, new));
}
3857
3858/*
3859 * Schedule the given page for insertion into the specified page queue.
3860 * Physical insertion of the page may be deferred indefinitely.
3861 */
3862static void
3863vm_page_enqueue(vm_page_t m, uint8_t queue)
3864{
3865
3866	KASSERT(m->a.queue == PQ_NONE &&
3867	    (m->a.flags & PGA_QUEUE_STATE_MASK) == 0,
3868	    ("%s: page %p is already enqueued", __func__, m));
3869	KASSERT(m->ref_count > 0,
3870	    ("%s: page %p does not carry any references", __func__, m));
3871
3872	m->a.queue = queue;
3873	if ((m->a.flags & PGA_REQUEUE) == 0)
3874		vm_page_aflag_set(m, PGA_REQUEUE);
3875	vm_page_pqbatch_submit(m, queue);
3876}
3877
3878/*
3879 *	vm_page_free_prep:
3880 *
3881 *	Prepares the given page to be put on the free list,
3882 *	disassociating it from any VM object. The caller may return
3883 *	the page to the free list only if this function returns true.
3884 *
 *	The object, if it exists, must be locked, and then the page must
 *	be exclusively busied.  Otherwise the page must not be busied.  A
 *	managed page must be unmapped.
3888 */
3889static bool
3890vm_page_free_prep(vm_page_t m)
3891{
3892
3893	/*
3894	 * Synchronize with threads that have dropped a reference to this
3895	 * page.
3896	 */
3897	atomic_thread_fence_acq();
3898
3899#if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP)
3900	if (PMAP_HAS_DMAP && (m->flags & PG_ZERO) != 0) {
3901		uint64_t *p;
3902		int i;
3903		p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3904		for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++)
3905			KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx",
3906			    m, i, (uintmax_t)*p));
3907	}
3908#endif
3909	if ((m->oflags & VPO_UNMANAGED) == 0) {
3910		KASSERT(!pmap_page_is_mapped(m),
3911		    ("vm_page_free_prep: freeing mapped page %p", m));
3912		KASSERT((m->a.flags & (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0,
3913		    ("vm_page_free_prep: mapping flags set in page %p", m));
3914	} else {
3915		KASSERT(m->a.queue == PQ_NONE,
3916		    ("vm_page_free_prep: unmanaged page %p is queued", m));
3917	}
3918	VM_CNT_INC(v_tfree);
3919
3920	if (m->object != NULL) {
3921		KASSERT(((m->oflags & VPO_UNMANAGED) != 0) ==
3922		    ((m->object->flags & OBJ_UNMANAGED) != 0),
3923		    ("vm_page_free_prep: managed flag mismatch for page %p",
3924		    m));
3925		vm_page_assert_xbusied(m);
3926
3927		/*
3928		 * The object reference can be released without an atomic
3929		 * operation.
3930		 */
3931		KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
3932		    m->ref_count == VPRC_OBJREF,
3933		    ("vm_page_free_prep: page %p has unexpected ref_count %u",
3934		    m, m->ref_count));
3935		vm_page_object_remove(m);
3936		m->ref_count -= VPRC_OBJREF;
3937	} else
3938		vm_page_assert_unbusied(m);
3939
3940	vm_page_busy_free(m);
3941
3942	/*
3943	 * If fictitious remove object association and
3944	 * return.
3945	 */
3946	if ((m->flags & PG_FICTITIOUS) != 0) {
3947		KASSERT(m->ref_count == 1,
3948		    ("fictitious page %p is referenced", m));
3949		KASSERT(m->a.queue == PQ_NONE,
3950		    ("fictitious page %p is queued", m));
3951		return (false);
3952	}
3953
3954	/*
3955	 * Pages need not be dequeued before they are returned to the physical
3956	 * memory allocator, but they must at least be marked for a deferred
3957	 * dequeue.
3958	 */
3959	if ((m->oflags & VPO_UNMANAGED) == 0)
3960		vm_page_dequeue_deferred(m);
3961
3962	m->valid = 0;
3963	vm_page_undirty(m);
3964
3965	if (m->ref_count != 0)
3966		panic("vm_page_free_prep: page %p has references", m);
3967
3968	/*
3969	 * Restore the default memory attribute to the page.
3970	 */
3971	if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
3972		pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
3973
3974#if VM_NRESERVLEVEL > 0
3975	/*
3976	 * Determine whether the page belongs to a reservation.  If the page was
3977	 * allocated from a per-CPU cache, it cannot belong to a reservation, so
3978	 * as an optimization, we avoid the check in that case.
3979	 */
3980	if ((m->flags & PG_PCPU_CACHE) == 0 && vm_reserv_free_page(m))
3981		return (false);
3982#endif
3983
3984	return (true);
3985}
3986
3987/*
3988 *	vm_page_free_toq:
3989 *
3990 *	Returns the given page to the free list, disassociating it
3991 *	from any VM object.
3992 *
3993 *	The object must be locked.  The page must be exclusively busied if it
3994 *	belongs to an object.
3995 */
3996static void
3997vm_page_free_toq(vm_page_t m)
3998{
3999	struct vm_domain *vmd;
4000	uma_zone_t zone;
4001
4002	if (!vm_page_free_prep(m))
4003		return;
4004
4005	vmd = vm_pagequeue_domain(m);
4006	zone = vmd->vmd_pgcache[m->pool].zone;
4007	if ((m->flags & PG_PCPU_CACHE) != 0 && zone != NULL) {
4008		uma_zfree(zone, m);
4009		return;
4010	}
4011	vm_domain_free_lock(vmd);
4012	vm_phys_free_pages(m, 0);
4013	vm_domain_free_unlock(vmd);
4014	vm_domain_freecnt_inc(vmd, 1);
4015}
4016
4017/*
4018 *	vm_page_free_pages_toq:
4019 *
 *	Returns a list of pages to the free list, disassociating each
 *	page from any VM object.  In other words, this is equivalent to
 *	calling vm_page_free_toq() for each page in the list.
4023 */
4024void
4025vm_page_free_pages_toq(struct spglist *free, bool update_wire_count)
4026{
4027	vm_page_t m;
4028	int count;
4029
4030	if (SLIST_EMPTY(free))
4031		return;
4032
4033	count = 0;
4034	while ((m = SLIST_FIRST(free)) != NULL) {
4035		count++;
4036		SLIST_REMOVE_HEAD(free, plinks.s.ss);
4037		vm_page_free_toq(m);
4038	}
4039
4040	if (update_wire_count)
4041		vm_wire_sub(count);
4042}
4043
4044/*
 * Mark this page as wired down.  For managed pages, this prevents reclamation
 * by the page daemon and prevents the page from being freed when the
 * containing object, if any, is destroyed.
4047 */
4048void
4049vm_page_wire(vm_page_t m)
4050{
4051	u_int old;
4052
4053#ifdef INVARIANTS
4054	if (m->object != NULL && !vm_page_busied(m) &&
4055	    !vm_object_busied(m->object))
4056		VM_OBJECT_ASSERT_LOCKED(m->object);
4057#endif
4058	KASSERT((m->flags & PG_FICTITIOUS) == 0 ||
4059	    VPRC_WIRE_COUNT(m->ref_count) >= 1,
4060	    ("vm_page_wire: fictitious page %p has zero wirings", m));
4061
4062	old = atomic_fetchadd_int(&m->ref_count, 1);
4063	KASSERT(VPRC_WIRE_COUNT(old) != VPRC_WIRE_COUNT_MAX,
4064	    ("vm_page_wire: counter overflow for page %p", m));
4065	if (VPRC_WIRE_COUNT(old) == 0) {
4066		if ((m->oflags & VPO_UNMANAGED) == 0)
4067			vm_page_aflag_set(m, PGA_DEQUEUE);
4068		vm_wire_add(1);
4069	}
4070}
4071
4072/*
4073 * Attempt to wire a mapped page following a pmap lookup of that page.
4074 * This may fail if a thread is concurrently tearing down mappings of the page.
 * The transient failure is acceptable because it translates to the
 * failure of the caller, pmap_extract_and_hold(), which should then be
 * followed by the vm_fault() fallback, see e.g. vm_fault_quick_hold_pages().
4078 */
4079bool
4080vm_page_wire_mapped(vm_page_t m)
4081{
4082	u_int old;
4083
4084	old = m->ref_count;
4085	do {
4086		KASSERT(old > 0,
4087		    ("vm_page_wire_mapped: wiring unreferenced page %p", m));
4088		if ((old & VPRC_BLOCKED) != 0)
4089			return (false);
4090	} while (!atomic_fcmpset_int(&m->ref_count, &old, old + 1));
4091
4092	if (VPRC_WIRE_COUNT(old) == 0) {
4093		if ((m->oflags & VPO_UNMANAGED) == 0)
4094			vm_page_aflag_set(m, PGA_DEQUEUE);
4095		vm_wire_add(1);
4096	}
4097	return (true);
4098}
4099
4100/*
4101 * Release a wiring reference to a managed page.  If the page still belongs to
4102 * an object, update its position in the page queues to reflect the reference.
4103 * If the wiring was the last reference to the page, free the page.
4104 */
4105static void
4106vm_page_unwire_managed(vm_page_t m, uint8_t nqueue, bool noreuse)
4107{
4108	u_int old;
4109
4110	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4111	    ("%s: page %p is unmanaged", __func__, m));
4112
4113	/*
4114	 * Update LRU state before releasing the wiring reference.
4115	 * Use a release store when updating the reference count to
4116	 * synchronize with vm_page_free_prep().
4117	 */
4118	old = m->ref_count;
4119	do {
4120		KASSERT(VPRC_WIRE_COUNT(old) > 0,
4121		    ("vm_page_unwire: wire count underflow for page %p", m));
4122
4123		if (old > VPRC_OBJREF + 1) {
4124			/*
4125			 * The page has at least one other wiring reference.  An
4126			 * earlier iteration of this loop may have called
4127			 * vm_page_release_toq() and cleared PGA_DEQUEUE, so
4128			 * re-set it if necessary.
4129			 */
4130			if ((vm_page_astate_load(m).flags & PGA_DEQUEUE) == 0)
4131				vm_page_aflag_set(m, PGA_DEQUEUE);
4132		} else if (old == VPRC_OBJREF + 1) {
4133			/*
4134			 * This is the last wiring.  Clear PGA_DEQUEUE and
4135			 * update the page's queue state to reflect the
4136			 * reference.  If the page does not belong to an object
4137			 * (i.e., the VPRC_OBJREF bit is clear), we only need to
4138			 * clear leftover queue state.
4139			 */
4140			vm_page_release_toq(m, nqueue, noreuse);
4141		} else if (old == 1) {
4142			vm_page_aflag_clear(m, PGA_DEQUEUE);
4143		}
4144	} while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1));
4145
4146	if (VPRC_WIRE_COUNT(old) == 1) {
4147		vm_wire_sub(1);
4148		if (old == 1)
4149			vm_page_free(m);
4150	}
4151}
4152
4153/*
4154 * Release one wiring of the specified page, potentially allowing it to be
4155 * paged out.
4156 *
4157 * Only managed pages belonging to an object can be paged out.  If the number
4158 * of wirings transitions to zero and the page is eligible for page out, then
4159 * the page is added to the specified paging queue.  If the released wiring
4160 * represented the last reference to the page, the page is freed.
4161 */
4162void
4163vm_page_unwire(vm_page_t m, uint8_t nqueue)
4164{
4165
4166	KASSERT(nqueue < PQ_COUNT,
4167	    ("vm_page_unwire: invalid queue %u request for page %p",
4168	    nqueue, m));
4169
4170	if ((m->oflags & VPO_UNMANAGED) != 0) {
4171		if (vm_page_unwire_noq(m) && m->ref_count == 0)
4172			vm_page_free(m);
4173		return;
4174	}
4175	vm_page_unwire_managed(m, nqueue, false);
4176}
4177
4178/*
4179 * Unwire a page without (re-)inserting it into a page queue.  It is up
4180 * to the caller to enqueue, requeue, or free the page as appropriate.
4181 * In most cases involving managed pages, vm_page_unwire() should be used
4182 * instead.
4183 */
4184bool
4185vm_page_unwire_noq(vm_page_t m)
4186{
4187	u_int old;
4188
4189	old = vm_page_drop(m, 1);
4190	KASSERT(VPRC_WIRE_COUNT(old) != 0,
4191	    ("%s: counter underflow for page %p", __func__,  m));
4192	KASSERT((m->flags & PG_FICTITIOUS) == 0 || VPRC_WIRE_COUNT(old) > 1,
4193	    ("%s: missing ref on fictitious page %p", __func__, m));
4194
4195	if (VPRC_WIRE_COUNT(old) > 1)
4196		return (false);
4197	if ((m->oflags & VPO_UNMANAGED) == 0)
4198		vm_page_aflag_clear(m, PGA_DEQUEUE);
4199	vm_wire_sub(1);
4200	return (true);
4201}
4202
4203/*
4204 * Ensure that the page ends up in the specified page queue.  If the page is
4205 * active or being moved to the active queue, ensure that its act_count is
4206 * at least ACT_INIT but do not otherwise mess with it.
4207 */
4208static __always_inline void
4209vm_page_mvqueue(vm_page_t m, const uint8_t nqueue, const uint16_t nflag)
4210{
4211	vm_page_astate_t old, new;
4212
4213	KASSERT(m->ref_count > 0,
4214	    ("%s: page %p does not carry any references", __func__, m));
4215	KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD,
4216	    ("%s: invalid flags %x", __func__, nflag));
4217
4218	if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m))
4219		return;
4220
4221	old = vm_page_astate_load(m);
4222	do {
4223		if ((old.flags & PGA_DEQUEUE) != 0)
4224			break;
4225		new = old;
4226		new.flags &= ~PGA_QUEUE_OP_MASK;
4227		if (nqueue == PQ_ACTIVE)
4228			new.act_count = max(old.act_count, ACT_INIT);
4229		if (old.queue == nqueue) {
4230			/*
4231			 * There is no need to requeue pages already in the
4232			 * active queue.
4233			 */
4234			if (nqueue != PQ_ACTIVE ||
4235			    (old.flags & PGA_ENQUEUED) == 0)
4236				new.flags |= nflag;
4237		} else {
4238			new.flags |= nflag;
4239			new.queue = nqueue;
4240		}
4241	} while (!vm_page_pqstate_commit(m, &old, new));
4242}
4243
4244/*
4245 * Put the specified page on the active list (if appropriate).
4246 */
4247void
4248vm_page_activate(vm_page_t m)
4249{
4250
4251	vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE);
4252}
4253
4254/*
4255 * Move the specified page to the tail of the inactive queue, or requeue
4256 * the page if it is already in the inactive queue.
4257 */
4258void
4259vm_page_deactivate(vm_page_t m)
4260{
4261
4262	vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE);
4263}
4264
4265void
4266vm_page_deactivate_noreuse(vm_page_t m)
4267{
4268
4269	vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD);
4270}
4271
4272/*
4273 * Put a page in the laundry, or requeue it if it is already there.
4274 */
4275void
4276vm_page_launder(vm_page_t m)
4277{
4278
4279	vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE);
4280}
4281
4282/*
4283 * Put a page in the PQ_UNSWAPPABLE holding queue.
4284 */
4285void
4286vm_page_unswappable(vm_page_t m)
4287{
4288
4289	VM_OBJECT_ASSERT_LOCKED(m->object);
4290	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4291	    ("page %p already unswappable", m));
4292
4293	vm_page_dequeue(m);
4294	vm_page_enqueue(m, PQ_UNSWAPPABLE);
4295}
4296
4297/*
4298 * Release a page back to the page queues in preparation for unwiring.
4299 */
4300static void
4301vm_page_release_toq(vm_page_t m, uint8_t nqueue, const bool noreuse)
4302{
4303	vm_page_astate_t old, new;
4304	uint16_t nflag;
4305
4306	/*
4307	 * Use a check of the valid bits to determine whether we should
4308	 * accelerate reclamation of the page.  The object lock might not be
4309	 * held here, in which case the check is racy.  At worst we will either
4310	 * accelerate reclamation of a valid page and violate LRU, or
4311	 * unnecessarily defer reclamation of an invalid page.
4312	 *
	 * If we were asked not to cache the page, place it near the head of
	 * the inactive queue so that it is reclaimed sooner.
4315	 */
4316	if (noreuse || vm_page_none_valid(m)) {
4317		nqueue = PQ_INACTIVE;
4318		nflag = PGA_REQUEUE_HEAD;
4319	} else {
4320		nflag = PGA_REQUEUE;
4321	}
4322
4323	old = vm_page_astate_load(m);
4324	do {
4325		new = old;
4326
4327		/*
4328		 * If the page is already in the active queue and we are not
4329		 * trying to accelerate reclamation, simply mark it as
4330		 * referenced and avoid any queue operations.
4331		 */
4332		new.flags &= ~PGA_QUEUE_OP_MASK;
4333		if (nflag != PGA_REQUEUE_HEAD && old.queue == PQ_ACTIVE &&
4334		    (old.flags & PGA_ENQUEUED) != 0)
4335			new.flags |= PGA_REFERENCED;
4336		else {
4337			new.flags |= nflag;
4338			new.queue = nqueue;
4339		}
4340	} while (!vm_page_pqstate_commit(m, &old, new));
4341}
4342
4343/*
4344 * Unwire a page and either attempt to free it or re-add it to the page queues.
4345 */
4346void
4347vm_page_release(vm_page_t m, int flags)
4348{
4349	vm_object_t object;
4350
4351	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4352	    ("vm_page_release: page %p is unmanaged", m));
4353
4354	if ((flags & VPR_TRYFREE) != 0) {
4355		for (;;) {
4356			object = atomic_load_ptr(&m->object);
4357			if (object == NULL)
4358				break;
4359			/* Depends on type-stability. */
4360			if (vm_page_busied(m) || !VM_OBJECT_TRYWLOCK(object))
4361				break;
4362			if (object == m->object) {
4363				vm_page_release_locked(m, flags);
4364				VM_OBJECT_WUNLOCK(object);
4365				return;
4366			}
4367			VM_OBJECT_WUNLOCK(object);
4368		}
4369	}
4370	vm_page_unwire_managed(m, PQ_INACTIVE, flags != 0);
4371}
4372
4373/* See vm_page_release(). */
4374void
4375vm_page_release_locked(vm_page_t m, int flags)
4376{
4377
4378	VM_OBJECT_ASSERT_WLOCKED(m->object);
4379	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4380	    ("vm_page_release_locked: page %p is unmanaged", m));
4381
4382	if (vm_page_unwire_noq(m)) {
4383		if ((flags & VPR_TRYFREE) != 0 &&
4384		    (m->object->ref_count == 0 || !pmap_page_is_mapped(m)) &&
4385		    m->dirty == 0 && vm_page_tryxbusy(m)) {
4386			/*
4387			 * An unlocked lookup may have wired the page before the
4388			 * busy lock was acquired, in which case the page must
4389			 * not be freed.
4390			 */
4391			if (__predict_true(!vm_page_wired(m))) {
4392				vm_page_free(m);
4393				return;
4394			}
4395			vm_page_xunbusy(m);
4396		} else {
4397			vm_page_release_toq(m, PQ_INACTIVE, flags != 0);
4398		}
4399	}
4400}
4401
4402static bool
4403vm_page_try_blocked_op(vm_page_t m, void (*op)(vm_page_t))
4404{
4405	u_int old;
4406
4407	KASSERT(m->object != NULL && (m->oflags & VPO_UNMANAGED) == 0,
4408	    ("vm_page_try_blocked_op: page %p has no object", m));
4409	KASSERT(vm_page_busied(m),
4410	    ("vm_page_try_blocked_op: page %p is not busy", m));
4411	VM_OBJECT_ASSERT_LOCKED(m->object);
4412
4413	old = m->ref_count;
4414	do {
4415		KASSERT(old != 0,
4416		    ("vm_page_try_blocked_op: page %p has no references", m));
4417		if (VPRC_WIRE_COUNT(old) != 0)
4418			return (false);
4419	} while (!atomic_fcmpset_int(&m->ref_count, &old, old | VPRC_BLOCKED));
4420
4421	(op)(m);
4422
4423	/*
4424	 * If the object is read-locked, new wirings may be created via an
4425	 * object lookup.
4426	 */
4427	old = vm_page_drop(m, VPRC_BLOCKED);
4428	KASSERT(!VM_OBJECT_WOWNED(m->object) ||
4429	    old == (VPRC_BLOCKED | VPRC_OBJREF),
4430	    ("vm_page_try_blocked_op: unexpected refcount value %u for %p",
4431	    old, m));
4432	return (true);
4433}
4434
4435/*
4436 * Atomically check for wirings and remove all mappings of the page.
4437 */
4438bool
4439vm_page_try_remove_all(vm_page_t m)
4440{
4441
4442	return (vm_page_try_blocked_op(m, pmap_remove_all));
4443}
4444
4445/*
4446 * Atomically check for wirings and remove all writeable mappings of the page.
4447 */
4448bool
4449vm_page_try_remove_write(vm_page_t m)
4450{
4451
4452	return (vm_page_try_blocked_op(m, pmap_remove_write));
4453}
4454
4455/*
4456 * vm_page_advise
4457 *
4458 * 	Apply the specified advice to the given page.
4459 */
4460void
4461vm_page_advise(vm_page_t m, int advice)
4462{
4463
4464	VM_OBJECT_ASSERT_WLOCKED(m->object);
4465	vm_page_assert_xbusied(m);
4466
4467	if (advice == MADV_FREE)
4468		/*
4469		 * Mark the page clean.  This will allow the page to be freed
4470		 * without first paging it out.  MADV_FREE pages are often
4471		 * quickly reused by malloc(3), so we do not do anything that
4472		 * would result in a page fault on a later access.
4473		 */
4474		vm_page_undirty(m);
4475	else if (advice != MADV_DONTNEED) {
4476		if (advice == MADV_WILLNEED)
4477			vm_page_activate(m);
4478		return;
4479	}
4480
4481	if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m))
4482		vm_page_dirty(m);
4483
4484	/*
4485	 * Clear any references to the page.  Otherwise, the page daemon will
4486	 * immediately reactivate the page.
4487	 */
4488	vm_page_aflag_clear(m, PGA_REFERENCED);
4489
4490	/*
4491	 * Place clean pages near the head of the inactive queue rather than
4492	 * the tail, thus defeating the queue's LRU operation and ensuring that
4493	 * the page will be reused quickly.  Dirty pages not already in the
4494	 * laundry are moved there.
4495	 */
4496	if (m->dirty == 0)
4497		vm_page_deactivate_noreuse(m);
4498	else if (!vm_page_in_laundry(m))
4499		vm_page_launder(m);
4500}
4501
4502/*
4503 *	vm_page_grab_release
4504 *
4505 *	Helper routine for grab functions to release busy on return.
4506 */
4507static inline void
4508vm_page_grab_release(vm_page_t m, int allocflags)
4509{
4510
4511	if ((allocflags & VM_ALLOC_NOBUSY) != 0) {
4512		if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0)
4513			vm_page_sunbusy(m);
4514		else
4515			vm_page_xunbusy(m);
4516	}
4517}
4518
4519/*
4520 *	vm_page_grab_sleep
4521 *
4522 *	Sleep for busy according to VM_ALLOC_ parameters.  Returns true
4523 *	if the caller should retry and false otherwise.
4524 *
 *	If the object is locked on entry, it will be unlocked on a false
 *	return, and still locked, though possibly having been dropped and
 *	reacquired, on a true return.
4528 */
4529static bool
4530vm_page_grab_sleep(vm_object_t object, vm_page_t m, vm_pindex_t pindex,
4531    const char *wmesg, int allocflags, bool locked)
4532{
4533
4534	if ((allocflags & VM_ALLOC_NOWAIT) != 0)
4535		return (false);
4536
4537	/*
4538	 * Reference the page before unlocking and sleeping so that
4539	 * the page daemon is less likely to reclaim it.
4540	 */
4541	if (locked && (allocflags & VM_ALLOC_NOCREAT) == 0)
4542		vm_page_reference(m);
4543
4544	if (_vm_page_busy_sleep(object, m, pindex, wmesg, allocflags, locked) &&
4545	    locked)
4546		VM_OBJECT_WLOCK(object);
4547	if ((allocflags & VM_ALLOC_WAITFAIL) != 0)
4548		return (false);
4549
4550	return (true);
4551}
4552
4553/*
4554 * Assert that the grab flags are valid.
4555 */
4556static inline void
4557vm_page_grab_check(int allocflags)
4558{
4559
4560	KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 ||
4561	    (allocflags & VM_ALLOC_WIRED) != 0,
4562	    ("vm_page_grab*: the pages must be busied or wired"));
4563
4564	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
4565	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
4566	    ("vm_page_grab*: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
4567}
4568
4569/*
4570 * Calculate the page allocation flags for grab.
4571 */
4572static inline int
4573vm_page_grab_pflags(int allocflags)
4574{
4575	int pflags;
4576
4577	pflags = allocflags &
4578	    ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL |
4579	    VM_ALLOC_NOBUSY | VM_ALLOC_IGN_SBUSY);
4580	if ((allocflags & VM_ALLOC_NOWAIT) == 0)
4581		pflags |= VM_ALLOC_WAITFAIL;
4582	if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0)
4583		pflags |= VM_ALLOC_SBUSY;
4584
4585	return (pflags);
4586}
4587
4588/*
 * Grab a page, waiting until we are woken up due to the page
 * changing state.  We keep on waiting as long as the page continues
 * to be in the object.  If the page doesn't exist, first allocate it
4592 * and then conditionally zero it.
4593 *
4594 * This routine may sleep.
4595 *
4596 * The object must be locked on entry.  The lock will, however, be released
4597 * and reacquired if the routine sleeps.
4598 */
4599vm_page_t
4600vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
4601{
4602	vm_page_t m;
4603
4604	VM_OBJECT_ASSERT_WLOCKED(object);
4605	vm_page_grab_check(allocflags);
4606
4607retrylookup:
4608	if ((m = vm_page_lookup(object, pindex)) != NULL) {
4609		if (!vm_page_tryacquire(m, allocflags)) {
4610			if (vm_page_grab_sleep(object, m, pindex, "pgrbwt",
4611			    allocflags, true))
4612				goto retrylookup;
4613			return (NULL);
4614		}
4615		goto out;
4616	}
4617	if ((allocflags & VM_ALLOC_NOCREAT) != 0)
4618		return (NULL);
4619	m = vm_page_alloc(object, pindex, vm_page_grab_pflags(allocflags));
4620	if (m == NULL) {
4621		if ((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0)
4622			return (NULL);
4623		goto retrylookup;
4624	}
4625	if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
4626		pmap_zero_page(m);
4627
4628out:
4629	vm_page_grab_release(m, allocflags);
4630
4631	return (m);
4632}
4633
4634/*
 * Locklessly attempt to acquire a page given an (object, pindex) tuple
 * and an optional previous page to avoid the radix lookup.  The resulting
 * page will be validated against the identity tuple and busied or wired
 * as requested.  A NULL *mp return guarantees that the page was not in
 * the radix tree at the time of the call, but callers must perform higher
 * level synchronization or retry the operation under a lock if they require
 * an atomic answer.  This is the only lock-free validation routine;
 * other routines can depend on the resulting page state.
4644 * The return value indicates whether the operation failed due to caller
4645 * flags.  The return is tri-state with mp:
4646 *
4647 * (true, *mp != NULL) - The operation was successful.
4648 * (true, *mp == NULL) - The page was not found in tree.
4649 * (false, *mp == NULL) - WAITFAIL or NOWAIT prevented acquisition.
4650 */
4651static bool
4652vm_page_acquire_unlocked(vm_object_t object, vm_pindex_t pindex,
4653    vm_page_t prev, vm_page_t *mp, int allocflags)
4654{
4655	vm_page_t m;
4656
4657	vm_page_grab_check(allocflags);
4658	MPASS(prev == NULL || vm_page_busied(prev) || vm_page_wired(prev));
4659
4660	*mp = NULL;
4661	for (;;) {
4662		/*
		 * We may see a false NULL here because the previous page
		 * has been removed or just inserted, and the list is loaded
		 * without barriers.  Switch to a radix lookup to verify.
4666		 */
4667		if (prev == NULL || (m = TAILQ_NEXT(prev, listq)) == NULL ||
4668		    QMD_IS_TRASHED(m) || m->pindex != pindex ||
4669		    atomic_load_ptr(&m->object) != object) {
4670			prev = NULL;
4671			/*
4672			 * This guarantees the result is instantaneously
4673			 * correct.
4674			 */
4675			m = vm_radix_lookup_unlocked(&object->rtree, pindex);
4676		}
4677		if (m == NULL)
4678			return (true);
4679		if (vm_page_trybusy(m, allocflags)) {
4680			if (m->object == object && m->pindex == pindex)
4681				break;
4682			/* relookup. */
4683			vm_page_busy_release(m);
4684			cpu_spinwait();
4685			continue;
4686		}
4687		if (!vm_page_grab_sleep(object, m, pindex, "pgnslp",
4688		    allocflags, false))
4689			return (false);
4690	}
4691	if ((allocflags & VM_ALLOC_WIRED) != 0)
4692		vm_page_wire(m);
4693	vm_page_grab_release(m, allocflags);
4694	*mp = m;
4695	return (true);
4696}
4697
4698/*
4699 * Try to locklessly grab a page and fall back to the object lock if NOCREAT
4700 * is not set.
4701 */
4702vm_page_t
4703vm_page_grab_unlocked(vm_object_t object, vm_pindex_t pindex, int allocflags)
4704{
4705	vm_page_t m;
4706
4707	vm_page_grab_check(allocflags);
4708
4709	if (!vm_page_acquire_unlocked(object, pindex, NULL, &m, allocflags))
4710		return (NULL);
4711	if (m != NULL)
4712		return (m);
4713
4714	/*
	 * The lockless radix lookup should never return false negatives.
	 * If the user specifies NOCREAT, they are guaranteed that there
	 * was no page present at the instant of the call.  A NOCREAT caller
4718	 * must handle create races gracefully.
4719	 */
4720	if ((allocflags & VM_ALLOC_NOCREAT) != 0)
4721		return (NULL);
4722
4723	VM_OBJECT_WLOCK(object);
4724	m = vm_page_grab(object, pindex, allocflags);
4725	VM_OBJECT_WUNLOCK(object);
4726
4727	return (m);
4728}
4729
4730/*
4731 * Grab a page and make it valid, paging in if necessary.  Pages missing from
 * their pager are zero filled and validated.  If a VM_ALLOC_COUNT is supplied
 * and the page is not valid, as many as VM_INITIAL_PAGEIN pages can be brought
4734 * in simultaneously.  Additional pages will be left on a paging queue but
4735 * will neither be wired nor busy regardless of allocflags.
4736 */
4737int
vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex,
    int allocflags)
4739{
4740	vm_page_t m;
4741	vm_page_t ma[VM_INITIAL_PAGEIN];
4742	int after, i, pflags, rv;
4743
4744	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
4745	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
4746	    ("vm_page_grab_valid: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
4747	KASSERT((allocflags &
4748	    (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0,
4749	    ("vm_page_grab_valid: Invalid flags 0x%X", allocflags));
4750	VM_OBJECT_ASSERT_WLOCKED(object);
4751	pflags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY |
4752	    VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY);
4753	pflags |= VM_ALLOC_WAITFAIL;
4754
4755retrylookup:
4756	if ((m = vm_page_lookup(object, pindex)) != NULL) {
4757		/*
4758		 * If the page is fully valid it can only become invalid
4759		 * with the object lock held.  If it is not valid it can
4760		 * become valid with the busy lock held.  Therefore, we
4761		 * may unnecessarily lock the exclusive busy here if we
4762		 * race with I/O completion not using the object lock.
4763		 * However, we will not end up with an invalid page and a
4764		 * shared lock.
4765		 */
4766		if (!vm_page_trybusy(m,
4767		    vm_page_all_valid(m) ? allocflags : 0)) {
4768			(void)vm_page_grab_sleep(object, m, pindex, "pgrbwt",
4769			    allocflags, true);
4770			goto retrylookup;
4771		}
4772		if (vm_page_all_valid(m))
4773			goto out;
4774		if ((allocflags & VM_ALLOC_NOCREAT) != 0) {
4775			vm_page_busy_release(m);
4776			*mp = NULL;
4777			return (VM_PAGER_FAIL);
4778		}
4779	} else if ((allocflags & VM_ALLOC_NOCREAT) != 0) {
4780		*mp = NULL;
4781		return (VM_PAGER_FAIL);
4782	} else if ((m = vm_page_alloc(object, pindex, pflags)) == NULL) {
4783		if (!vm_pager_can_alloc_page(object, pindex)) {
4784			*mp = NULL;
4785			return (VM_PAGER_AGAIN);
4786		}
4787		goto retrylookup;
4788	}
4789
4790	vm_page_assert_xbusied(m);
4791	if (vm_pager_has_page(object, pindex, NULL, &after)) {
4792		after = MIN(after, VM_INITIAL_PAGEIN);
4793		after = MIN(after, allocflags >> VM_ALLOC_COUNT_SHIFT);
4794		after = MAX(after, 1);
4795		ma[0] = m;
4796		for (i = 1; i < after; i++) {
4797			if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) {
4798				if (vm_page_any_valid(ma[i]) ||
4799				    !vm_page_tryxbusy(ma[i]))
4800					break;
4801			} else {
4802				ma[i] = vm_page_alloc(object, m->pindex + i,
4803				    VM_ALLOC_NORMAL);
4804				if (ma[i] == NULL)
4805					break;
4806			}
4807		}
4808		after = i;
4809		vm_object_pip_add(object, after);
4810		VM_OBJECT_WUNLOCK(object);
4811		rv = vm_pager_get_pages(object, ma, after, NULL, NULL);
4812		VM_OBJECT_WLOCK(object);
4813		vm_object_pip_wakeupn(object, after);
4814		/* Pager may have replaced a page. */
4815		m = ma[0];
4816		if (rv != VM_PAGER_OK) {
4817			for (i = 0; i < after; i++) {
4818				if (!vm_page_wired(ma[i]))
4819					vm_page_free(ma[i]);
4820				else
4821					vm_page_xunbusy(ma[i]);
4822			}
4823			*mp = NULL;
4824			return (rv);
4825		}
4826		for (i = 1; i < after; i++)
4827			vm_page_readahead_finish(ma[i]);
4828		MPASS(vm_page_all_valid(m));
4829	} else {
4830		vm_page_zero_invalid(m, TRUE);
4831	}
4832out:
4833	if ((allocflags & VM_ALLOC_WIRED) != 0)
4834		vm_page_wire(m);
4835	if ((allocflags & VM_ALLOC_SBUSY) != 0 && vm_page_xbusied(m))
4836		vm_page_busy_downgrade(m);
4837	else if ((allocflags & VM_ALLOC_NOBUSY) != 0)
4838		vm_page_busy_release(m);
4839	*mp = m;
4840	return (VM_PAGER_OK);
4841}
4842
4843/*
4844 * Locklessly grab a valid page.  If the page is not valid or not yet
4845 * allocated this will fall back to the object lock method.
4846 */
4847int
4848vm_page_grab_valid_unlocked(vm_page_t *mp, vm_object_t object,
4849    vm_pindex_t pindex, int allocflags)
4850{
4851	vm_page_t m;
4852	int flags;
4853	int error;
4854
4855	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
4856	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
4857	    ("vm_page_grab_valid_unlocked: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY "
4858	    "mismatch"));
4859	KASSERT((allocflags &
4860	    (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0,
4861	    ("vm_page_grab_valid_unlocked: Invalid flags 0x%X", allocflags));
4862
4863	/*
4864	 * Attempt a lockless lookup and busy.  We need at least an sbusy
4865	 * before we can inspect the valid field and return a wired page.
4866	 */
4867	flags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_WIRED);
4868	if (!vm_page_acquire_unlocked(object, pindex, NULL, mp, flags))
4869		return (VM_PAGER_FAIL);
4870	if ((m = *mp) != NULL) {
4871		if (vm_page_all_valid(m)) {
4872			if ((allocflags & VM_ALLOC_WIRED) != 0)
4873				vm_page_wire(m);
4874			vm_page_grab_release(m, allocflags);
4875			return (VM_PAGER_OK);
4876		}
4877		vm_page_busy_release(m);
4878	}
4879	if ((allocflags & VM_ALLOC_NOCREAT) != 0) {
4880		*mp = NULL;
4881		return (VM_PAGER_FAIL);
4882	}
4883	VM_OBJECT_WLOCK(object);
4884	error = vm_page_grab_valid(mp, object, pindex, allocflags);
4885	VM_OBJECT_WUNLOCK(object);
4886
4887	return (error);
4888}
4889
4890/*
4891 * Return the specified range of pages from the given object.  For each
4892 * page offset within the range, if a page already exists within the object
4893 * at that offset and it is busy, then wait for it to change state.  If,
4894 * instead, the page doesn't exist, then allocate it.
4895 *
4896 * The caller must always specify an allocation class.
4897 *
4898 * allocation classes:
4899 *	VM_ALLOC_NORMAL		normal process request
4900 *	VM_ALLOC_SYSTEM		system *really* needs the pages
4901 *
4902 * The caller must always specify that the pages are to be busied and/or
4903 * wired.
4904 *
4905 * optional allocation flags:
4906 *	VM_ALLOC_IGN_SBUSY	do not sleep on soft busy pages
 *	VM_ALLOC_NOBUSY		do not exclusively busy the page
4908 *	VM_ALLOC_NOWAIT		do not sleep
4909 *	VM_ALLOC_SBUSY		set page to sbusy state
4910 *	VM_ALLOC_WIRED		wire the pages
4911 *	VM_ALLOC_ZERO		zero and validate any invalid pages
4912 *
4913 * If VM_ALLOC_NOWAIT is not specified, this routine may sleep.  Otherwise, it
4914 * may return a partial prefix of the requested range.
4915 */
4916int
4917vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
4918    vm_page_t *ma, int count)
4919{
4920	vm_page_t m, mpred;
4921	int pflags;
4922	int i;
4923
4924	VM_OBJECT_ASSERT_WLOCKED(object);
4925	KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0,
4926	    ("vm_page_grap_pages: VM_ALLOC_COUNT() is not allowed"));
4927	KASSERT(count > 0,
4928	    ("vm_page_grab_pages: invalid page count %d", count));
4929	vm_page_grab_check(allocflags);
4930
4931	pflags = vm_page_grab_pflags(allocflags);
4932	i = 0;
4933retrylookup:
4934	m = vm_radix_lookup_le(&object->rtree, pindex + i);
4935	if (m == NULL || m->pindex != pindex + i) {
4936		mpred = m;
4937		m = NULL;
4938	} else
4939		mpred = TAILQ_PREV(m, pglist, listq);
4940	for (; i < count; i++) {
4941		if (m != NULL) {
4942			if (!vm_page_tryacquire(m, allocflags)) {
4943				if (vm_page_grab_sleep(object, m, pindex + i,
4944				    "grbmaw", allocflags, true))
4945					goto retrylookup;
4946				break;
4947			}
4948		} else {
4949			if ((allocflags & VM_ALLOC_NOCREAT) != 0)
4950				break;
4951			m = vm_page_alloc_after(object, pindex + i,
4952			    pflags | VM_ALLOC_COUNT(count - i), mpred);
4953			if (m == NULL) {
4954				if ((allocflags & (VM_ALLOC_NOWAIT |
4955				    VM_ALLOC_WAITFAIL)) != 0)
4956					break;
4957				goto retrylookup;
4958			}
4959		}
4960		if (vm_page_none_valid(m) &&
4961		    (allocflags & VM_ALLOC_ZERO) != 0) {
4962			if ((m->flags & PG_ZERO) == 0)
4963				pmap_zero_page(m);
4964			vm_page_valid(m);
4965		}
4966		vm_page_grab_release(m, allocflags);
4967		ma[i] = mpred = m;
4968		m = vm_page_next(m);
4969	}
4970	return (i);
4971}
4972
4973/*
4974 * Unlocked variant of vm_page_grab_pages().  This accepts the same flags
4975 * and will fall back to the locked variant to handle allocation.
4976 */
4977int
4978vm_page_grab_pages_unlocked(vm_object_t object, vm_pindex_t pindex,
4979    int allocflags, vm_page_t *ma, int count)
4980{
4981	vm_page_t m, pred;
4982	int flags;
4983	int i;
4984
4985	KASSERT(count > 0,
4986	    ("vm_page_grab_pages_unlocked: invalid page count %d", count));
4987	vm_page_grab_check(allocflags);
4988
4989	/*
	 * Modify the flags for the lockless acquire so that the page is
	 * held busy until we set it valid, if necessary.
4992	 */
4993	flags = allocflags & ~VM_ALLOC_NOBUSY;
4994	pred = NULL;
4995	for (i = 0; i < count; i++, pindex++) {
4996		if (!vm_page_acquire_unlocked(object, pindex, pred, &m, flags))
4997			return (i);
4998		if (m == NULL)
4999			break;
5000		if ((flags & VM_ALLOC_ZERO) != 0 && vm_page_none_valid(m)) {
5001			if ((m->flags & PG_ZERO) == 0)
5002				pmap_zero_page(m);
5003			vm_page_valid(m);
5004		}
5005		/* m will still be wired or busy according to flags. */
5006		vm_page_grab_release(m, allocflags);
5007		pred = ma[i] = m;
5008	}
5009	if (i == count || (allocflags & VM_ALLOC_NOCREAT) != 0)
5010		return (i);
5011	count -= i;
5012	VM_OBJECT_WLOCK(object);
5013	i += vm_page_grab_pages(object, pindex, allocflags, &ma[i], count);
5014	VM_OBJECT_WUNLOCK(object);
5015
5016	return (i);
5017}
5018
5019/*
5020 * Mapping function for valid or dirty bits in a page.
5021 *
5022 * Inputs are required to range within a page.
5023 */
5024vm_page_bits_t
5025vm_page_bits(int base, int size)
5026{
5027	int first_bit;
5028	int last_bit;
5029
	KASSERT(base + size <= PAGE_SIZE,
	    ("vm_page_bits: illegal base/size %d/%d", base, size));
5034
5035	if (size == 0)		/* handle degenerate case */
5036		return (0);
5037
5038	first_bit = base >> DEV_BSHIFT;
5039	last_bit = (base + size - 1) >> DEV_BSHIFT;
5040
5041	return (((vm_page_bits_t)2 << last_bit) -
5042	    ((vm_page_bits_t)1 << first_bit));
5043}
5044
5045void
5046vm_page_bits_set(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t set)
5047{
5048
5049#if PAGE_SIZE == 32768
5050	atomic_set_64((uint64_t *)bits, set);
5051#elif PAGE_SIZE == 16384
5052	atomic_set_32((uint32_t *)bits, set);
5053#elif (PAGE_SIZE == 8192) && defined(atomic_set_16)
5054	atomic_set_16((uint16_t *)bits, set);
5055#elif (PAGE_SIZE == 4096) && defined(atomic_set_8)
5056	atomic_set_8((uint8_t *)bits, set);
5057#else		/* PAGE_SIZE <= 8192 */
5058	uintptr_t addr;
5059	int shift;
5060
5061	addr = (uintptr_t)bits;
5062	/*
5063	 * Use a trick to perform a 32-bit atomic on the
5064	 * containing aligned word, to not depend on the existence
5065	 * of atomic_{set, clear}_{8, 16}.
5066	 */
5067	shift = addr & (sizeof(uint32_t) - 1);
5068#if BYTE_ORDER == BIG_ENDIAN
5069	shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
5070#else
5071	shift *= NBBY;
5072#endif
5073	addr &= ~(sizeof(uint32_t) - 1);
5074	atomic_set_32((uint32_t *)addr, set << shift);
5075#endif		/* PAGE_SIZE */
5076}
5077
5078static inline void
5079vm_page_bits_clear(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t clear)
5080{
5081
5082#if PAGE_SIZE == 32768
5083	atomic_clear_64((uint64_t *)bits, clear);
5084#elif PAGE_SIZE == 16384
5085	atomic_clear_32((uint32_t *)bits, clear);
5086#elif (PAGE_SIZE == 8192) && defined(atomic_clear_16)
5087	atomic_clear_16((uint16_t *)bits, clear);
5088#elif (PAGE_SIZE == 4096) && defined(atomic_clear_8)
5089	atomic_clear_8((uint8_t *)bits, clear);
5090#else		/* PAGE_SIZE <= 8192 */
5091	uintptr_t addr;
5092	int shift;
5093
5094	addr = (uintptr_t)bits;
5095	/*
5096	 * Use a trick to perform a 32-bit atomic on the
5097	 * containing aligned word, to not depend on the existence
5098	 * of atomic_{set, clear}_{8, 16}.
5099	 */
5100	shift = addr & (sizeof(uint32_t) - 1);
5101#if BYTE_ORDER == BIG_ENDIAN
5102	shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
5103#else
5104	shift *= NBBY;
5105#endif
5106	addr &= ~(sizeof(uint32_t) - 1);
5107	atomic_clear_32((uint32_t *)addr, clear << shift);
5108#endif		/* PAGE_SIZE */
5109}
5110
5111static inline vm_page_bits_t
5112vm_page_bits_swap(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t newbits)
5113{
5114#if PAGE_SIZE == 32768
5115	uint64_t old;
5116
5117	old = *bits;
5118	while (atomic_fcmpset_64(bits, &old, newbits) == 0);
5119	return (old);
5120#elif PAGE_SIZE == 16384
5121	uint32_t old;
5122
5123	old = *bits;
5124	while (atomic_fcmpset_32(bits, &old, newbits) == 0);
5125	return (old);
5126#elif (PAGE_SIZE == 8192) && defined(atomic_fcmpset_16)
5127	uint16_t old;
5128
5129	old = *bits;
5130	while (atomic_fcmpset_16(bits, &old, newbits) == 0);
5131	return (old);
5132#elif (PAGE_SIZE == 4096) && defined(atomic_fcmpset_8)
5133	uint8_t old;
5134
5135	old = *bits;
5136	while (atomic_fcmpset_8(bits, &old, newbits) == 0);
5137	return (old);
#else		/* PAGE_SIZE <= 8192 */
5139	uintptr_t addr;
5140	uint32_t old, new, mask;
5141	int shift;
5142
5143	addr = (uintptr_t)bits;
5144	/*
5145	 * Use a trick to perform a 32-bit atomic on the
5146	 * containing aligned word, to not depend on the existence
5147	 * of atomic_{set, swap, clear}_{8, 16}.
5148	 */
5149	shift = addr & (sizeof(uint32_t) - 1);
5150#if BYTE_ORDER == BIG_ENDIAN
5151	shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
5152#else
5153	shift *= NBBY;
5154#endif
5155	addr &= ~(sizeof(uint32_t) - 1);
5156	mask = VM_PAGE_BITS_ALL << shift;
5157
5158	old = *bits;
5159	do {
5160		new = old & ~mask;
5161		new |= newbits << shift;
5162	} while (atomic_fcmpset_32((uint32_t *)addr, &old, new) == 0);
5163	return (old >> shift);
5164#endif		/* PAGE_SIZE */
5165}
5166
5167/*
5168 *	vm_page_set_valid_range:
5169 *
 *	Sets portions of a page valid.  The arguments are expected
 *	to be DEV_BSIZE aligned, but if they aren't, the bitmap is inclusive
 *	of any partial chunks touched by the range.  The invalid portion of
 *	such chunks will be zeroed.
 *
 *	(base + size) must be less than or equal to PAGE_SIZE.
5176 */
5177void
5178vm_page_set_valid_range(vm_page_t m, int base, int size)
5179{
5180	int endoff, frag;
5181	vm_page_bits_t pagebits;
5182
5183	vm_page_assert_busied(m);
5184	if (size == 0)	/* handle degenerate case */
5185		return;
5186
5187	/*
5188	 * If the base is not DEV_BSIZE aligned and the valid
5189	 * bit is clear, we have to zero out a portion of the
5190	 * first block.
5191	 */
5192	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
5194		pmap_zero_page_area(m, frag, base - frag);
5195
5196	/*
5197	 * If the ending offset is not DEV_BSIZE aligned and the
5198	 * valid bit is clear, we have to zero out a portion of
5199	 * the last block.
5200	 */
5201	endoff = base + size;
5202	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
5204		pmap_zero_page_area(m, endoff,
5205		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
5206
5207	/*
5208	 * Assert that no previously invalid block that is now being validated
5209	 * is already dirty.
5210	 */
5211	KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
5212	    ("vm_page_set_valid_range: page %p is dirty", m));
5213
5214	/*
5215	 * Set valid bits inclusive of any overlap.
5216	 */
5217	pagebits = vm_page_bits(base, size);
5218	if (vm_page_xbusied(m))
5219		m->valid |= pagebits;
5220	else
5221		vm_page_bits_set(m, &m->valid, pagebits);
5222}
5223
5224/*
5225 * Set the page dirty bits and free the invalid swap space if
5226 * present.  Returns the previous dirty bits.
5227 */
5228vm_page_bits_t
5229vm_page_set_dirty(vm_page_t m)
5230{
5231	vm_page_bits_t old;
5232
5233	VM_PAGE_OBJECT_BUSY_ASSERT(m);
5234
5235	if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) {
5236		old = m->dirty;
5237		m->dirty = VM_PAGE_BITS_ALL;
5238	} else
5239		old = vm_page_bits_swap(m, &m->dirty, VM_PAGE_BITS_ALL);
5240	if (old == 0 && (m->a.flags & PGA_SWAP_SPACE) != 0)
5241		vm_pager_page_unswapped(m);
5242
5243	return (old);
5244}
5245
5246/*
5247 * Clear the given bits from the specified page's dirty field.
5248 */
5249static __inline void
5250vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
5251{
5252
5253	vm_page_assert_busied(m);
5254
5255	/*
	 * If the page is xbusied and not write mapped, we are the
	 * only thread that can modify dirty bits.  Otherwise, the pmap
5258	 * layer can call vm_page_dirty() without holding a distinguished
5259	 * lock.  The combination of page busy and atomic operations
5260	 * suffice to guarantee consistency of the page dirty field.
5261	 */
5262	if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m))
5263		m->dirty &= ~pagebits;
5264	else
5265		vm_page_bits_clear(m, &m->dirty, pagebits);
5266}
5267
5268/*
5269 *	vm_page_set_validclean:
5270 *
 *	Sets portions of a page valid and clean.  The arguments are expected
 *	to be DEV_BSIZE aligned, but if they aren't, the bitmap is inclusive
 *	of any partial chunks touched by the range.  The invalid portion of
 *	such chunks will be zeroed.
 *
 *	(base + size) must be less than or equal to PAGE_SIZE.
5277 */
5278void
5279vm_page_set_validclean(vm_page_t m, int base, int size)
5280{
5281	vm_page_bits_t oldvalid, pagebits;
5282	int endoff, frag;
5283
5284	vm_page_assert_busied(m);
5285	if (size == 0)	/* handle degenerate case */
5286		return;
5287
5288	/*
5289	 * If the base is not DEV_BSIZE aligned and the valid
5290	 * bit is clear, we have to zero out a portion of the
5291	 * first block.
5292	 */
5293	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
5294	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
5295		pmap_zero_page_area(m, frag, base - frag);
5296
5297	/*
5298	 * If the ending offset is not DEV_BSIZE aligned and the
5299	 * valid bit is clear, we have to zero out a portion of
5300	 * the last block.
5301	 */
5302	endoff = base + size;
5303	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
5304	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
5305		pmap_zero_page_area(m, endoff,
5306		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
5307
5308	/*
5309	 * Set valid, clear dirty bits.  If validating the entire
5310	 * page we can safely clear the pmap modify bit.  We also
5311	 * use this opportunity to clear the PGA_NOSYNC flag.  If a process
5312	 * takes a write fault on a MAP_NOSYNC memory area the flag will
5313	 * be set again.
5314	 *
5315	 * We set valid bits inclusive of any overlap, but we can only
5316	 * clear dirty bits for DEV_BSIZE chunks that are fully within
5317	 * the range.
5318	 */
5319	oldvalid = m->valid;
5320	pagebits = vm_page_bits(base, size);
5321	if (vm_page_xbusied(m))
5322		m->valid |= pagebits;
5323	else
5324		vm_page_bits_set(m, &m->valid, pagebits);
5325#if 0	/* NOT YET */
5326	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
5327		frag = DEV_BSIZE - frag;
5328		base += frag;
5329		size -= frag;
5330		if (size < 0)
5331			size = 0;
5332	}
5333	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
5334#endif
5335	if (base == 0 && size == PAGE_SIZE) {
5336		/*
5337		 * The page can only be modified within the pmap if it is
5338		 * mapped, and it can only be mapped if it was previously
5339		 * fully valid.
5340		 */
5341		if (oldvalid == VM_PAGE_BITS_ALL)
5342			/*
5343			 * Perform the pmap_clear_modify() first.  Otherwise,
5344			 * a concurrent pmap operation, such as
5345			 * pmap_protect(), could clear a modification in the
5346			 * pmap and set the dirty field on the page before
5347			 * pmap_clear_modify() had begun and after the dirty
5348			 * field was cleared here.
5349			 */
5350			pmap_clear_modify(m);
5351		m->dirty = 0;
5352		vm_page_aflag_clear(m, PGA_NOSYNC);
5353	} else if (oldvalid != VM_PAGE_BITS_ALL && vm_page_xbusied(m))
5354		m->dirty &= ~pagebits;
5355	else
5356		vm_page_clear_dirty_mask(m, pagebits);
5357}
5358
5359void
5360vm_page_clear_dirty(vm_page_t m, int base, int size)
5361{
5362
5363	vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
5364}
5365
5366/*
5367 *	vm_page_set_invalid:
5368 *
 *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
 *	valid and dirty bits for the affected areas are cleared.
5371 */
5372void
5373vm_page_set_invalid(vm_page_t m, int base, int size)
5374{
5375	vm_page_bits_t bits;
5376	vm_object_t object;
5377
5378	/*
5379	 * The object lock is required so that pages can't be mapped
5380	 * read-only while we're in the process of invalidating them.
5381	 */
5382	object = m->object;
5383	VM_OBJECT_ASSERT_WLOCKED(object);
5384	vm_page_assert_busied(m);
5385
5386	if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) +
5387	    size >= object->un_pager.vnp.vnp_size)
5388		bits = VM_PAGE_BITS_ALL;
5389	else
5390		bits = vm_page_bits(base, size);
5391	if (object->ref_count != 0 && vm_page_all_valid(m) && bits != 0)
5392		pmap_remove_all(m);
5393	KASSERT((bits == 0 && vm_page_all_valid(m)) ||
5394	    !pmap_page_is_mapped(m),
5395	    ("vm_page_set_invalid: page %p is mapped", m));
5396	if (vm_page_xbusied(m)) {
5397		m->valid &= ~bits;
5398		m->dirty &= ~bits;
5399	} else {
5400		vm_page_bits_clear(m, &m->valid, bits);
5401		vm_page_bits_clear(m, &m->dirty, bits);
5402	}
5403}
5404
5405/*
5406 *	vm_page_invalid:
5407 *
 *	Invalidates the entire page.  The page must be busy, unmapped, and
 *	the enclosing object must be locked.  The object lock protects
 *	against a concurrent read-only pmap enter, which is done without
 *	busy.
5412 */
5413void
5414vm_page_invalid(vm_page_t m)
5415{
5416
5417	vm_page_assert_busied(m);
5418	VM_OBJECT_ASSERT_WLOCKED(m->object);
5419	MPASS(!pmap_page_is_mapped(m));
5420
5421	if (vm_page_xbusied(m))
5422		m->valid = 0;
5423	else
5424		vm_page_bits_clear(m, &m->valid, VM_PAGE_BITS_ALL);
5425}
5426
5427/*
5428 * vm_page_zero_invalid()
5429 *
5430 *	The kernel assumes that the invalid portions of a page contain
5431 *	garbage, but such pages can be mapped into memory by user code.
5432 *	When this occurs, we must zero out the non-valid portions of the
5433 *	page so user code sees what it expects.
5434 *
5435 *	Pages are most often semi-valid when the end of a file is mapped
5436 *	into memory and the file's size is not page aligned.
5437 */
5438void
5439vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
5440{
5441	int b;
5442	int i;
5443
5444	/*
5445	 * Scan the valid bits looking for invalid sections that
	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
	 * valid bit may be set) have already been zeroed by
5448	 * vm_page_set_validclean().
5449	 */
5450	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
5451		if (i == (PAGE_SIZE / DEV_BSIZE) ||
5452		    (m->valid & ((vm_page_bits_t)1 << i))) {
5453			if (i > b) {
5454				pmap_zero_page_area(m,
5455				    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
5456			}
5457			b = i + 1;
5458		}
5459	}
5460
5461	/*
	 * setvalid is TRUE when we can safely set the zeroed areas
	 * as being valid.  We can do this if there are no cache consistency
	 * issues, e.g., it is OK to do with UFS, but not OK to do with NFS.
5465	 */
5466	if (setvalid)
5467		vm_page_valid(m);
5468}
5469
5470/*
5471 *	vm_page_is_valid:
5472 *
 *	Is (partial) page valid?  Note that in the degenerate case where
 *	size == 0, this routine returns FALSE if the page is entirely
 *	invalid, and TRUE otherwise.
5476 *
 *	Some callers invoke this routine without the busy lock held and
5478 *	handle races via higher level locks.  Typical callers should
5479 *	hold a busy lock to prevent invalidation.
5480 */
5481int
5482vm_page_is_valid(vm_page_t m, int base, int size)
5483{
5484	vm_page_bits_t bits;
5485
5486	bits = vm_page_bits(base, size);
5487	return (vm_page_any_valid(m) && (m->valid & bits) == bits);
5488}
5489
5490/*
5491 * Returns true if all of the specified predicates are true for the entire
5492 * (super)page and false otherwise.
5493 */
5494bool
5495vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m)
5496{
5497	vm_object_t object;
5498	int i, npages;
5499
5500	object = m->object;
5501	if (skip_m != NULL && skip_m->object != object)
5502		return (false);
5503	VM_OBJECT_ASSERT_LOCKED(object);
5504	npages = atop(pagesizes[m->psind]);
5505
5506	/*
5507	 * The physically contiguous pages that make up a superpage, i.e., a
5508	 * page with a page size index ("psind") greater than zero, will
5509	 * occupy adjacent entries in vm_page_array[].
5510	 */
5511	for (i = 0; i < npages; i++) {
5512		/* Always test object consistency, including "skip_m". */
5513		if (m[i].object != object)
5514			return (false);
5515		if (&m[i] == skip_m)
5516			continue;
5517		if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i]))
5518			return (false);
5519		if ((flags & PS_ALL_DIRTY) != 0) {
5520			/*
5521			 * Calling vm_page_test_dirty() or pmap_is_modified()
5522			 * might stop this case from spuriously returning
5523			 * "false".  However, that would require a write lock
5524			 * on the object containing "m[i]".
5525			 */
5526			if (m[i].dirty != VM_PAGE_BITS_ALL)
5527				return (false);
5528		}
5529		if ((flags & PS_ALL_VALID) != 0 &&
5530		    m[i].valid != VM_PAGE_BITS_ALL)
5531			return (false);
5532	}
5533	return (true);
5534}
5535
5536/*
5537 * Set the page's dirty bits if the page is modified.
5538 */
5539void
5540vm_page_test_dirty(vm_page_t m)
5541{
5542
5543	vm_page_assert_busied(m);
5544	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
5545		vm_page_dirty(m);
5546}
5547
5548void
5549vm_page_valid(vm_page_t m)
5550{
5551
5552	vm_page_assert_busied(m);
5553	if (vm_page_xbusied(m))
5554		m->valid = VM_PAGE_BITS_ALL;
5555	else
5556		vm_page_bits_set(m, &m->valid, VM_PAGE_BITS_ALL);
5557}
5558
5559void
5560vm_page_lock_KBI(vm_page_t m, const char *file, int line)
5561{
5562
5563	mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
5564}
5565
5566void
5567vm_page_unlock_KBI(vm_page_t m, const char *file, int line)
5568{
5569
5570	mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
5571}
5572
5573int
5574vm_page_trylock_KBI(vm_page_t m, const char *file, int line)
5575{
5576
5577	return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
5578}
5579
5580#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
5581void
5582vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line)
5583{
5584
5585	vm_page_lock_assert_KBI(m, MA_OWNED, file, line);
5586}
5587
5588void
5589vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
5590{
5591
5592	mtx_assert_(vm_page_lockptr(m), a, file, line);
5593}
5594#endif
5595
5596#ifdef INVARIANTS
5597void
5598vm_page_object_busy_assert(vm_page_t m)
5599{
5600
5601	/*
5602	 * Certain of the page's fields may only be modified by the
5603	 * holder of a page or object busy.
5604	 */
5605	if (m->object != NULL && !vm_page_busied(m))
5606		VM_OBJECT_ASSERT_BUSY(m->object);
5607}
5608
5609void
5610vm_page_assert_pga_writeable(vm_page_t m, uint16_t bits)
5611{
5612
5613	if ((bits & PGA_WRITEABLE) == 0)
5614		return;
5615
5616	/*
5617	 * The PGA_WRITEABLE flag can only be set if the page is
5618	 * managed, is exclusively busied or the object is locked.
5619	 * Currently, this flag is only set by pmap_enter().
5620	 */
5621	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5622	    ("PGA_WRITEABLE on unmanaged page"));
5623	if (!vm_page_xbusied(m))
5624		VM_OBJECT_ASSERT_BUSY(m->object);
5625}
5626#endif
5627
5628#include "opt_ddb.h"
5629#ifdef DDB
5630#include <sys/kernel.h>
5631
5632#include <ddb/ddb.h>
5633
5634DB_SHOW_COMMAND_FLAGS(page, vm_page_print_page_info, DB_CMD_MEMSAFE)
5635{
5636
5637	db_printf("vm_cnt.v_free_count: %d\n", vm_free_count());
5638	db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count());
5639	db_printf("vm_cnt.v_active_count: %d\n", vm_active_count());
5640	db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count());
5641	db_printf("vm_cnt.v_wire_count: %d\n", vm_wire_count());
5642	db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
5643	db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
5644	db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target);
5645	db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target);
5646}
5647
5648DB_SHOW_COMMAND_FLAGS(pageq, vm_page_print_pageq_info, DB_CMD_MEMSAFE)
5649{
5650	int dom;
5651
5652	db_printf("pq_free %d\n", vm_free_count());
5653	for (dom = 0; dom < vm_ndomains; dom++) {
5654		db_printf(
5655    "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n",
5656		    dom,
5657		    vm_dom[dom].vmd_page_count,
5658		    vm_dom[dom].vmd_free_count,
5659		    vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
5660		    vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
5661		    vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt,
5662		    vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt);
5663	}
5664}
5665
5666DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
5667{
5668	vm_page_t m;
5669	boolean_t phys, virt;
5670
5671	if (!have_addr) {
5672		db_printf("show pginfo addr\n");
5673		return;
5674	}
5675
5676	phys = strchr(modif, 'p') != NULL;
5677	virt = strchr(modif, 'v') != NULL;
5678	if (virt)
5679		m = PHYS_TO_VM_PAGE(pmap_kextract(addr));
5680	else if (phys)
5681		m = PHYS_TO_VM_PAGE(addr);
5682	else
5683		m = (vm_page_t)addr;
5684	db_printf(
5685    "page %p obj %p pidx 0x%jx phys 0x%jx q %d ref 0x%x\n"
5686    "  af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n",
5687	    m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
5688	    m->a.queue, m->ref_count, m->a.flags, m->oflags,
5689	    m->flags, m->a.act_count, m->busy_lock, m->valid, m->dirty);
5690}
5691#endif /* DDB */
5692