pmap.c revision 270439
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * the Systems Programming Group of the University of Utah Computer
15 * Science Department and William Jolitz of UUNET Technologies Inc.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 *    must display the following acknowledgement:
27 *	This product includes software developed by the University of
28 *	California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 *    may be used to endorse or promote products derived from this software
31 *    without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * SUCH DAMAGE.
44 *
45 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
46 */
47/*-
48 * Copyright (c) 2003 Networks Associates Technology, Inc.
49 * All rights reserved.
50 *
51 * This software was developed for the FreeBSD Project by Jake Burkholder,
52 * Safeport Network Services, and Network Associates Laboratories, the
53 * Security Research Division of Network Associates, Inc. under
54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55 * CHATS research program.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 *    notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 *    notice, this list of conditions and the following disclaimer in the
64 *    documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 */
78
79#define	AMD64_NPT_AWARE
80
81#include <sys/cdefs.h>
82__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/pmap.c 270439 2014-08-24 07:53:15Z kib $");
83
84/*
85 *	Manages physical address maps.
86 *
87 *	Since the information managed by this module is
88 *	also stored by the logical address mapping module,
89 *	this module may throw away valid virtual-to-physical
90 *	mappings at almost any time.  However, invalidations
91 *	of virtual-to-physical mappings must be done as
92 *	requested.
93 *
94 *	In order to cope with hardware architectures which
95 *	make virtual-to-physical map invalidates expensive,
96 *	this module may delay invalidate or reduced protection
97 *	operations until such time as they are actually
98 *	necessary.  This module is given full information as
99 *	to which processors are currently using which maps,
100 *	and to when physical maps must be made correct.
101 */
102
103#include "opt_pmap.h"
104#include "opt_vm.h"
105
106#include <sys/param.h>
107#include <sys/bus.h>
108#include <sys/systm.h>
109#include <sys/kernel.h>
110#include <sys/ktr.h>
111#include <sys/lock.h>
112#include <sys/malloc.h>
113#include <sys/mman.h>
114#include <sys/mutex.h>
115#include <sys/proc.h>
116#include <sys/rwlock.h>
117#include <sys/sx.h>
118#include <sys/vmmeter.h>
119#include <sys/sched.h>
120#include <sys/sysctl.h>
121#include <sys/_unrhdr.h>
122#include <sys/smp.h>
123
124#include <vm/vm.h>
125#include <vm/vm_param.h>
126#include <vm/vm_kern.h>
127#include <vm/vm_page.h>
128#include <vm/vm_map.h>
129#include <vm/vm_object.h>
130#include <vm/vm_extern.h>
131#include <vm/vm_pageout.h>
132#include <vm/vm_pager.h>
133#include <vm/vm_radix.h>
134#include <vm/vm_reserv.h>
135#include <vm/uma.h>
136
137#include <machine/intr_machdep.h>
138#include <machine/apicvar.h>
139#include <machine/cpu.h>
140#include <machine/cputypes.h>
141#include <machine/md_var.h>
142#include <machine/pcb.h>
143#include <machine/specialreg.h>
144#ifdef SMP
145#include <machine/smp.h>
146#endif
147
148static __inline boolean_t
149pmap_emulate_ad_bits(pmap_t pmap)
150{
151
152	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
153}
154
155static __inline pt_entry_t
156pmap_valid_bit(pmap_t pmap)
157{
158	pt_entry_t mask;
159
160	switch (pmap->pm_type) {
161	case PT_X86:
162		mask = X86_PG_V;
163		break;
164	case PT_EPT:
165		if (pmap_emulate_ad_bits(pmap))
166			mask = EPT_PG_EMUL_V;
167		else
168			mask = EPT_PG_READ;
169		break;
170	default:
171		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
172	}
173
174	return (mask);
175}
176
177static __inline pt_entry_t
178pmap_rw_bit(pmap_t pmap)
179{
180	pt_entry_t mask;
181
182	switch (pmap->pm_type) {
183	case PT_X86:
184		mask = X86_PG_RW;
185		break;
186	case PT_EPT:
187		if (pmap_emulate_ad_bits(pmap))
188			mask = EPT_PG_EMUL_RW;
189		else
190			mask = EPT_PG_WRITE;
191		break;
192	default:
193		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
194	}
195
196	return (mask);
197}
198
199static __inline pt_entry_t
200pmap_global_bit(pmap_t pmap)
201{
202	pt_entry_t mask;
203
204	switch (pmap->pm_type) {
205	case PT_X86:
206		mask = X86_PG_G;
207		break;
208	case PT_EPT:
209		mask = 0;
210		break;
211	default:
212		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
213	}
214
215	return (mask);
216}
217
218static __inline pt_entry_t
219pmap_accessed_bit(pmap_t pmap)
220{
221	pt_entry_t mask;
222
223	switch (pmap->pm_type) {
224	case PT_X86:
225		mask = X86_PG_A;
226		break;
227	case PT_EPT:
228		if (pmap_emulate_ad_bits(pmap))
229			mask = EPT_PG_READ;
230		else
231			mask = EPT_PG_A;
232		break;
233	default:
234		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
235	}
236
237	return (mask);
238}
239
240static __inline pt_entry_t
241pmap_modified_bit(pmap_t pmap)
242{
243	pt_entry_t mask;
244
245	switch (pmap->pm_type) {
246	case PT_X86:
247		mask = X86_PG_M;
248		break;
249	case PT_EPT:
250		if (pmap_emulate_ad_bits(pmap))
251			mask = EPT_PG_WRITE;
252		else
253			mask = EPT_PG_M;
254		break;
255	default:
256		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
257	}
258
259	return (mask);
260}
261
262#if !defined(DIAGNOSTIC)
263#ifdef __GNUC_GNU_INLINE__
264#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
265#else
266#define PMAP_INLINE	extern inline
267#endif
268#else
269#define PMAP_INLINE
270#endif
271
272#ifdef PV_STATS
273#define PV_STAT(x)	do { x ; } while (0)
274#else
275#define PV_STAT(x)	do { } while (0)
276#endif
277
278#define	pa_index(pa)	((pa) >> PDRSHIFT)
279#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
280
281#define	NPV_LIST_LOCKS	MAXCPU
282
283#define	PHYS_TO_PV_LIST_LOCK(pa)	\
284			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
285
286#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
287	struct rwlock **_lockp = (lockp);		\
288	struct rwlock *_new_lock;			\
289							\
290	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
291	if (_new_lock != *_lockp) {			\
292		if (*_lockp != NULL)			\
293			rw_wunlock(*_lockp);		\
294		*_lockp = _new_lock;			\
295		rw_wlock(*_lockp);			\
296	}						\
297} while (0)
298
299#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
300			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
301
302#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
303	struct rwlock **_lockp = (lockp);		\
304							\
305	if (*_lockp != NULL) {				\
306		rw_wunlock(*_lockp);			\
307		*_lockp = NULL;				\
308	}						\
309} while (0)
310
311#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
312			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
313
314struct pmap kernel_pmap_store;
315
316vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
317vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
318
319int nkpt;
320SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
321    "Number of kernel page table pages allocated on bootup");
322
323static int ndmpdp;
324vm_paddr_t dmaplimit;
325vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
326pt_entry_t pg_nx;
327
328static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
329
330static int pat_works = 1;
331SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
332    "Is page attribute table fully functional?");
333
334static int pg_ps_enabled = 1;
335SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
336    "Are large page mappings enabled?");
337
338#define	PAT_INDEX_SIZE	8
339static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
340
341static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
342static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
343u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
344u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
345
346static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
347static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
348static int		ndmpdpphys;	/* number of DMPDPphys pages */
349
350static struct rwlock_padalign pvh_global_lock;
351
352/*
353 * Data for the pv entry allocation mechanism
354 */
355static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
356static struct mtx pv_chunks_mutex;
357static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
358static struct md_page *pv_table;
359
360/*
361 * All those kernel PT submaps that BSD is so fond of
362 */
363pt_entry_t *CMAP1 = 0;
364caddr_t CADDR1 = 0;
365
366static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
367
368static struct unrhdr pcid_unr;
369static struct mtx pcid_mtx;
370int pmap_pcid_enabled = 0;
371SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
372    0, "Is TLB Context ID enabled ?");
373int invpcid_works = 0;
374SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
375    "Is the invpcid instruction available ?");
376
377static int
378pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
379{
380	int i;
381	uint64_t res;
382
383	res = 0;
384	CPU_FOREACH(i) {
385		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
386	}
387	return (sysctl_handle_64(oidp, &res, 0, req));
388}
389SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
390    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
391    "Count of saved TLB context on switch");
392
393/* pmap_copy_pages() over non-DMAP */
394static struct mtx cpage_lock;
395static vm_offset_t cpage_a;
396static vm_offset_t cpage_b;
397
398/*
399 * Crashdump maps.
400 */
401static caddr_t crashdumpmap;
402
403static void	free_pv_chunk(struct pv_chunk *pc);
404static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
405static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
406static int	popcnt_pc_map_elem(uint64_t elem);
407static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
408static void	reserve_pv_entries(pmap_t pmap, int needed,
409		    struct rwlock **lockp);
410static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
411		    struct rwlock **lockp);
412static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
413		    struct rwlock **lockp);
414static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
415		    struct rwlock **lockp);
416static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
417static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
418		    vm_offset_t va);
419
420static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
421static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
422static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
423    vm_offset_t va, struct rwlock **lockp);
424static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
425    vm_offset_t va);
426static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
427    vm_prot_t prot, struct rwlock **lockp);
428static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
429    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
430static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
431static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
432static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
433static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
434static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
435static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
436    struct rwlock **lockp);
437static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
438    vm_prot_t prot);
439static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
440static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
441    struct spglist *free, struct rwlock **lockp);
442static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
443    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
444static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
445static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
446    struct spglist *free);
447static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
448    vm_page_t m, struct rwlock **lockp);
449static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
450    pd_entry_t newpde);
451static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
452
453static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
454		struct rwlock **lockp);
455static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
456		struct rwlock **lockp);
457static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
458		struct rwlock **lockp);
459
460static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
461    struct spglist *free);
462static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
463static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
464
465/*
466 * Move the kernel virtual free pointer to the next
467 * 2MB.  This is used to help improve performance
468 * by using a large (2MB) page for much of the kernel
469 * (.text, .data, .bss)
470 */
471static vm_offset_t
472pmap_kmem_choose(vm_offset_t addr)
473{
474	vm_offset_t newaddr = addr;
475
476	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
477	return (newaddr);
478}
479
480/********************/
481/* Inline functions */
482/********************/
483
484/* Return a non-clipped PD index for a given VA */
485static __inline vm_pindex_t
486pmap_pde_pindex(vm_offset_t va)
487{
488	return (va >> PDRSHIFT);
489}
490
491
492/* Return various clipped indexes for a given VA */
493static __inline vm_pindex_t
494pmap_pte_index(vm_offset_t va)
495{
496
497	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
498}
499
500static __inline vm_pindex_t
501pmap_pde_index(vm_offset_t va)
502{
503
504	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
505}
506
507static __inline vm_pindex_t
508pmap_pdpe_index(vm_offset_t va)
509{
510
511	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
512}
513
514static __inline vm_pindex_t
515pmap_pml4e_index(vm_offset_t va)
516{
517
518	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
519}
520
521/* Return a pointer to the PML4 slot that corresponds to a VA */
522static __inline pml4_entry_t *
523pmap_pml4e(pmap_t pmap, vm_offset_t va)
524{
525
526	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
527}
528
529/* Return a pointer to the PDP slot that corresponds to a VA */
530static __inline pdp_entry_t *
531pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
532{
533	pdp_entry_t *pdpe;
534
535	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
536	return (&pdpe[pmap_pdpe_index(va)]);
537}
538
539/* Return a pointer to the PDP slot that corresponds to a VA */
540static __inline pdp_entry_t *
541pmap_pdpe(pmap_t pmap, vm_offset_t va)
542{
543	pml4_entry_t *pml4e;
544	pt_entry_t PG_V;
545
546	PG_V = pmap_valid_bit(pmap);
547	pml4e = pmap_pml4e(pmap, va);
548	if ((*pml4e & PG_V) == 0)
549		return (NULL);
550	return (pmap_pml4e_to_pdpe(pml4e, va));
551}
552
553/* Return a pointer to the PD slot that corresponds to a VA */
554static __inline pd_entry_t *
555pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
556{
557	pd_entry_t *pde;
558
559	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
560	return (&pde[pmap_pde_index(va)]);
561}
562
563/* Return a pointer to the PD slot that corresponds to a VA */
564static __inline pd_entry_t *
565pmap_pde(pmap_t pmap, vm_offset_t va)
566{
567	pdp_entry_t *pdpe;
568	pt_entry_t PG_V;
569
570	PG_V = pmap_valid_bit(pmap);
571	pdpe = pmap_pdpe(pmap, va);
572	if (pdpe == NULL || (*pdpe & PG_V) == 0)
573		return (NULL);
574	return (pmap_pdpe_to_pde(pdpe, va));
575}
576
577/* Return a pointer to the PT slot that corresponds to a VA */
578static __inline pt_entry_t *
579pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
580{
581	pt_entry_t *pte;
582
583	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
584	return (&pte[pmap_pte_index(va)]);
585}
586
587/* Return a pointer to the PT slot that corresponds to a VA */
588static __inline pt_entry_t *
589pmap_pte(pmap_t pmap, vm_offset_t va)
590{
591	pd_entry_t *pde;
592	pt_entry_t PG_V;
593
594	PG_V = pmap_valid_bit(pmap);
595	pde = pmap_pde(pmap, va);
596	if (pde == NULL || (*pde & PG_V) == 0)
597		return (NULL);
598	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
599		return ((pt_entry_t *)pde);
600	return (pmap_pde_to_pte(pde, va));
601}
602
603static __inline void
604pmap_resident_count_inc(pmap_t pmap, int count)
605{
606
607	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
608	pmap->pm_stats.resident_count += count;
609}
610
611static __inline void
612pmap_resident_count_dec(pmap_t pmap, int count)
613{
614
615	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
616	KASSERT(pmap->pm_stats.resident_count >= count,
617	    ("pmap %p resident count underflow %ld %d", pmap,
618	    pmap->pm_stats.resident_count, count));
619	pmap->pm_stats.resident_count -= count;
620}
621
622PMAP_INLINE pt_entry_t *
623vtopte(vm_offset_t va)
624{
625	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
626
627	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
628
629	return (PTmap + ((va >> PAGE_SHIFT) & mask));
630}
631
632static __inline pd_entry_t *
633vtopde(vm_offset_t va)
634{
635	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
636
637	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
638
639	return (PDmap + ((va >> PDRSHIFT) & mask));
640}
641
642static u_int64_t
643allocpages(vm_paddr_t *firstaddr, int n)
644{
645	u_int64_t ret;
646
647	ret = *firstaddr;
648	bzero((void *)ret, n * PAGE_SIZE);
649	*firstaddr += n * PAGE_SIZE;
650	return (ret);
651}
652
653CTASSERT(powerof2(NDMPML4E));
654
655/* number of kernel PDP slots */
656#define	NKPDPE(ptpgs)		howmany((ptpgs), NPDEPG)
657
658static void
659nkpt_init(vm_paddr_t addr)
660{
661	int pt_pages;
662
663#ifdef NKPT
664	pt_pages = NKPT;
665#else
666	pt_pages = howmany(addr, 1 << PDRSHIFT);
667	pt_pages += NKPDPE(pt_pages);
668
669	/*
670	 * Add some slop beyond the bare minimum required for bootstrapping
671	 * the kernel.
672	 *
673	 * This is quite important when allocating KVA for kernel modules.
674	 * The modules are required to be linked in the negative 2GB of
675	 * the address space.  If we run out of KVA in this region then
676	 * pmap_growkernel() will need to allocate page table pages to map
677	 * the entire 512GB of KVA space which is an unnecessary tax on
678	 * physical memory.
679	 */
680	pt_pages += 8;		/* 16MB additional slop for kernel modules */
681#endif
682	nkpt = pt_pages;
683}
684
685static void
686create_pagetables(vm_paddr_t *firstaddr)
687{
688	int i, j, ndm1g, nkpdpe;
689	pt_entry_t *pt_p;
690	pd_entry_t *pd_p;
691	pdp_entry_t *pdp_p;
692	pml4_entry_t *p4_p;
693
694	/* Allocate page table pages for the direct map */
695	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
696	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
697		ndmpdp = 4;
698	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
699	if (ndmpdpphys > NDMPML4E) {
700		/*
701		 * Each NDMPML4E allows 512 GB, so limit to that,
702		 * and then readjust ndmpdp and ndmpdpphys.
703		 */
704		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
705		Maxmem = atop(NDMPML4E * NBPML4);
706		ndmpdpphys = NDMPML4E;
707		ndmpdp = NDMPML4E * NPDEPG;
708	}
709	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
710	ndm1g = 0;
711	if ((amd_feature & AMDID_PAGE1GB) != 0)
712		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
713	if (ndm1g < ndmpdp)
714		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
715	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
716
717	/* Allocate pages */
718	KPML4phys = allocpages(firstaddr, 1);
719	KPDPphys = allocpages(firstaddr, NKPML4E);
720
721	/*
722	 * Allocate the initial number of kernel page table pages required to
723	 * bootstrap.  We defer this until after all memory-size dependent
724	 * allocations are done (e.g. direct map), so that we don't have to
725	 * build in too much slop in our estimate.
726	 *
727	 * Note that when NKPML4E > 1, we have an empty page underneath
728	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
729	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
730	 */
731	nkpt_init(*firstaddr);
732	nkpdpe = NKPDPE(nkpt);
733
734	KPTphys = allocpages(firstaddr, nkpt);
735	KPDphys = allocpages(firstaddr, nkpdpe);
736
737	/* Fill in the underlying page table pages */
738	/* Nominally read-only (but really R/W) from zero to physfree */
739	/* XXX not fully used, underneath 2M pages */
740	pt_p = (pt_entry_t *)KPTphys;
741	for (i = 0; ptoa(i) < *firstaddr; i++)
742		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
743
744	/* Now map the page tables at their location within PTmap */
745	pd_p = (pd_entry_t *)KPDphys;
746	for (i = 0; i < nkpt; i++)
747		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
748
749	/* Map from zero to end of allocations under 2M pages */
750	/* This replaces some of the KPTphys entries above */
751	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
752		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
753		    X86_PG_G;
754
755	/* And connect up the PD to the PDP (leaving room for L4 pages) */
756	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
757	for (i = 0; i < nkpdpe; i++)
758		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
759		    PG_U;
760
761	/*
762	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
763	 * the end of physical memory is not aligned to a 1GB page boundary,
764	 * then the residual physical memory is mapped with 2MB pages.  Later,
765	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
766	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
767	 * that are partially used.
768	 */
769	pd_p = (pd_entry_t *)DMPDphys;
770	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
771		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
772		/* Preset PG_M and PG_A because demotion expects it. */
773		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
774		    X86_PG_M | X86_PG_A;
775	}
776	pdp_p = (pdp_entry_t *)DMPDPphys;
777	for (i = 0; i < ndm1g; i++) {
778		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
779		/* Preset PG_M and PG_A because demotion expects it. */
780		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
781		    X86_PG_M | X86_PG_A;
782	}
783	for (j = 0; i < ndmpdp; i++, j++) {
784		pdp_p[i] = DMPDphys + ptoa(j);
785		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
786	}
787
788	/* And recursively map PML4 to itself in order to get PTmap */
789	p4_p = (pml4_entry_t *)KPML4phys;
790	p4_p[PML4PML4I] = KPML4phys;
791	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;
792
793	/* Connect the Direct Map slot(s) up to the PML4. */
794	for (i = 0; i < ndmpdpphys; i++) {
795		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
796		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
797	}
798
799	/* Connect the KVA slots up to the PML4 */
800	for (i = 0; i < NKPML4E; i++) {
801		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
802		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
803	}
804}
805
806/*
807 *	Bootstrap the system enough to run with virtual memory.
808 *
809 *	On amd64 this is called after mapping has already been enabled
810 *	and just syncs the pmap module with what has already been done.
811 *	[We can't call it easily with mapping off since the kernel is not
812 *	mapped with PA == VA, hence we would have to relocate every address
813 *	from the linked base (virtual) address "KERNBASE" to the actual
814 *	(physical) address starting relative to 0]
815 */
816void
817pmap_bootstrap(vm_paddr_t *firstaddr)
818{
819	vm_offset_t va;
820	pt_entry_t *pte;
821
822	/*
823	 * Create an initial set of page tables to run the kernel in.
824	 */
825	create_pagetables(firstaddr);
826
827	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
828	virtual_avail = pmap_kmem_choose(virtual_avail);
829
830	virtual_end = VM_MAX_KERNEL_ADDRESS;
831
832
833	/* XXX do %cr0 as well */
834	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
835	load_cr3(KPML4phys);
836	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
837		load_cr4(rcr4() | CR4_SMEP);
838
839	/*
840	 * Initialize the kernel pmap (which is statically allocated).
841	 */
842	PMAP_LOCK_INIT(kernel_pmap);
843	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
844	kernel_pmap->pm_cr3 = KPML4phys;
845	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
846	CPU_FILL(&kernel_pmap->pm_save);	/* always superset of pm_active */
847	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
848	kernel_pmap->pm_flags = pmap_flags;
849
850 	/*
851	 * Initialize the global pv list lock.
852	 */
853	rw_init(&pvh_global_lock, "pmap pv global");
854
855	/*
856	 * Reserve some special page table entries/VA space for temporary
857	 * mapping of pages.
858	 */
859#define	SYSMAP(c, p, v, n)	\
860	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
861
862	va = virtual_avail;
863	pte = vtopte(va);
864
865	/*
866	 * Crashdump maps.  The first page is reused as CMAP1 for the
867	 * memory test.
868	 */
869	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
870	CADDR1 = crashdumpmap;
871
872	virtual_avail = va;
873
874	/* Initialize the PAT MSR. */
875	pmap_init_pat();
876
877	/* Initialize TLB Context Id. */
878	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
879	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
880		load_cr4(rcr4() | CR4_PCIDE);
881		mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF);
882		init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx);
883		/* Check for INVPCID support */
884		invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
885		    != 0;
886		kernel_pmap->pm_pcid = 0;
887#ifndef SMP
888		pmap_pcid_enabled = 0;
889#endif
890	} else
891		pmap_pcid_enabled = 0;
892}
893
894/*
895 * Setup the PAT MSR.
896 */
897void
898pmap_init_pat(void)
899{
900	int pat_table[PAT_INDEX_SIZE];
901	uint64_t pat_msr;
902	u_long cr0, cr4;
903	int i;
904
905	/* Bail if this CPU doesn't implement PAT. */
906	if ((cpu_feature & CPUID_PAT) == 0)
907		panic("no PAT??");
908
909	/* Set default PAT index table. */
910	for (i = 0; i < PAT_INDEX_SIZE; i++)
911		pat_table[i] = -1;
912	pat_table[PAT_WRITE_BACK] = 0;
913	pat_table[PAT_WRITE_THROUGH] = 1;
914	pat_table[PAT_UNCACHEABLE] = 3;
915	pat_table[PAT_WRITE_COMBINING] = 3;
916	pat_table[PAT_WRITE_PROTECTED] = 3;
917	pat_table[PAT_UNCACHED] = 3;
918
919	/* Initialize default PAT entries. */
920	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
921	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
922	    PAT_VALUE(2, PAT_UNCACHED) |
923	    PAT_VALUE(3, PAT_UNCACHEABLE) |
924	    PAT_VALUE(4, PAT_WRITE_BACK) |
925	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
926	    PAT_VALUE(6, PAT_UNCACHED) |
927	    PAT_VALUE(7, PAT_UNCACHEABLE);
928
929	if (pat_works) {
930		/*
931		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
932		 * Program 5 and 6 as WP and WC.
933		 * Leave 4 and 7 as WB and UC.
934		 */
935		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
936		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
937		    PAT_VALUE(6, PAT_WRITE_COMBINING);
938		pat_table[PAT_UNCACHED] = 2;
939		pat_table[PAT_WRITE_PROTECTED] = 5;
940		pat_table[PAT_WRITE_COMBINING] = 6;
941	} else {
942		/*
943		 * Just replace PAT Index 2 with WC instead of UC-.
944		 */
945		pat_msr &= ~PAT_MASK(2);
946		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
947		pat_table[PAT_WRITE_COMBINING] = 2;
948	}
949
950	/* Disable PGE. */
951	cr4 = rcr4();
952	load_cr4(cr4 & ~CR4_PGE);
953
954	/* Disable caches (CD = 1, NW = 0). */
955	cr0 = rcr0();
956	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
957
958	/* Flushes caches and TLBs. */
959	wbinvd();
960	invltlb();
961
962	/* Update PAT and index table. */
963	wrmsr(MSR_PAT, pat_msr);
964	for (i = 0; i < PAT_INDEX_SIZE; i++)
965		pat_index[i] = pat_table[i];
966
967	/* Flush caches and TLBs again. */
968	wbinvd();
969	invltlb();
970
971	/* Restore caches and PGE. */
972	load_cr0(cr0);
973	load_cr4(cr4);
974}
975
976/*
977 *	Initialize a vm_page's machine-dependent fields.
978 */
979void
980pmap_page_init(vm_page_t m)
981{
982
983	TAILQ_INIT(&m->md.pv_list);
984	m->md.pat_mode = PAT_WRITE_BACK;
985}
986
987/*
988 *	Initialize the pmap module.
989 *	Called by vm_init, to initialize any structures that the pmap
990 *	system needs to map virtual memory.
991 */
992void
993pmap_init(void)
994{
995	vm_page_t mpte;
996	vm_size_t s;
997	int i, pv_npg;
998
999	/*
1000	 * Initialize the vm page array entries for the kernel pmap's
1001	 * page table pages.
1002	 */
1003	for (i = 0; i < nkpt; i++) {
1004		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
1005		KASSERT(mpte >= vm_page_array &&
1006		    mpte < &vm_page_array[vm_page_array_size],
1007		    ("pmap_init: page table page is out of range"));
1008		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
1009		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
1010	}
1011
1012	/*
1013	 * If the kernel is running on a virtual machine, then it must assume
1014	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
1015	 * be prepared for the hypervisor changing the vendor and family that
1016	 * are reported by CPUID.  Consequently, the workaround for AMD Family
1017	 * 10h Erratum 383 is enabled if the processor's feature set does not
1018	 * include at least one feature that is only supported by older Intel
1019	 * or newer AMD processors.
1020	 */
1021	if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
1022	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
1023	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
1024	    AMDID2_FMA4)) == 0)
1025		workaround_erratum383 = 1;
1026
1027	/*
1028	 * Are large page mappings enabled?
1029	 */
1030	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
1031	if (pg_ps_enabled) {
1032		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1033		    ("pmap_init: can't assign to pagesizes[1]"));
1034		pagesizes[1] = NBPDR;
1035	}
1036
1037	/*
1038	 * Initialize the pv chunk list mutex.
1039	 */
1040	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1041
1042	/*
1043	 * Initialize the pool of pv list locks.
1044	 */
1045	for (i = 0; i < NPV_LIST_LOCKS; i++)
1046		rw_init(&pv_list_locks[i], "pmap pv list");
1047
1048	/*
1049	 * Calculate the size of the pv head table for superpages.
1050	 */
1051	for (i = 0; phys_avail[i + 1]; i += 2);
1052	pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
1053
1054	/*
1055	 * Allocate memory for the pv head table for superpages.
1056	 */
1057	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1058	s = round_page(s);
1059	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
1060	    M_WAITOK | M_ZERO);
1061	for (i = 0; i < pv_npg; i++)
1062		TAILQ_INIT(&pv_table[i].pv_list);
1063
1064	mtx_init(&cpage_lock, "cpage", NULL, MTX_DEF);
1065	cpage_a = kva_alloc(PAGE_SIZE);
1066	cpage_b = kva_alloc(PAGE_SIZE);
1067}
1068
1069static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
1070    "2MB page mapping counters");
1071
1072static u_long pmap_pde_demotions;
1073SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
1074    &pmap_pde_demotions, 0, "2MB page demotions");
1075
1076static u_long pmap_pde_mappings;
1077SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
1078    &pmap_pde_mappings, 0, "2MB page mappings");
1079
1080static u_long pmap_pde_p_failures;
1081SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
1082    &pmap_pde_p_failures, 0, "2MB page promotion failures");
1083
1084static u_long pmap_pde_promotions;
1085SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
1086    &pmap_pde_promotions, 0, "2MB page promotions");
1087
1088static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
1089    "1GB page mapping counters");
1090
1091static u_long pmap_pdpe_demotions;
1092SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
1093    &pmap_pdpe_demotions, 0, "1GB page demotions");
1094
1095/***************************************************
1096 * Low level helper routines.....
1097 ***************************************************/
1098
1099static pt_entry_t
1100pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
1101{
1102	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
1103
1104	switch (pmap->pm_type) {
1105	case PT_X86:
1106		/* Verify that both PAT bits are not set at the same time */
1107		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
1108		    ("Invalid PAT bits in entry %#lx", entry));
1109
1110		/* Swap the PAT bits if one of them is set */
1111		if ((entry & x86_pat_bits) != 0)
1112			entry ^= x86_pat_bits;
1113		break;
1114	case PT_EPT:
1115		/*
1116		 * Nothing to do - the memory attributes are represented
1117		 * the same way for regular pages and superpages.
1118		 */
1119		break;
1120	default:
1121		panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
1122	}
1123
1124	return (entry);
1125}
1126
1127/*
1128 * Determine the appropriate bits to set in a PTE or PDE for a specified
1129 * caching mode.
1130 */
1131static int
1132pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
1133{
1134	int cache_bits, pat_flag, pat_idx;
1135
1136	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
1137		panic("Unknown caching mode %d\n", mode);
1138
1139	switch (pmap->pm_type) {
1140	case PT_X86:
1141		/* The PAT bit is different for PTE's and PDE's. */
1142		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
1143
1144		/* Map the caching mode to a PAT index. */
1145		pat_idx = pat_index[mode];
1146
1147		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
1148		cache_bits = 0;
1149		if (pat_idx & 0x4)
1150			cache_bits |= pat_flag;
1151		if (pat_idx & 0x2)
1152			cache_bits |= PG_NC_PCD;
1153		if (pat_idx & 0x1)
1154			cache_bits |= PG_NC_PWT;
1155		break;
1156
1157	case PT_EPT:
1158		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
1159		break;
1160
1161	default:
1162		panic("unsupported pmap type %d", pmap->pm_type);
1163	}
1164
1165	return (cache_bits);
1166}
1167
1168static int
1169pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
1170{
1171	int mask;
1172
1173	switch (pmap->pm_type) {
1174	case PT_X86:
1175		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
1176		break;
1177	case PT_EPT:
1178		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
1179		break;
1180	default:
1181		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
1182	}
1183
1184	return (mask);
1185}
1186
1187static __inline boolean_t
1188pmap_ps_enabled(pmap_t pmap)
1189{
1190
1191	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
1192}
1193
1194static void
1195pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
1196{
1197
1198	switch (pmap->pm_type) {
1199	case PT_X86:
1200		break;
1201	case PT_EPT:
1202		/*
1203		 * XXX
1204		 * This is a little bogus since the generation number is
1205		 * supposed to be bumped up when a region of the address
1206		 * space is invalidated in the page tables.
1207		 *
1208		 * In this case the old PDE entry is valid but yet we want
1209		 * to make sure that any mappings using the old entry are
1210		 * invalidated in the TLB.
1211		 *
1212		 * The reason this works as expected is because we rendezvous
1213		 * "all" host cpus and force any vcpu context to exit as a
1214		 * side-effect.
1215		 */
1216		atomic_add_acq_long(&pmap->pm_eptgen, 1);
1217		break;
1218	default:
1219		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
1220	}
1221	pde_store(pde, newpde);
1222}
1223
1224/*
1225 * After changing the page size for the specified virtual address in the page
1226 * table, flush the corresponding entries from the processor's TLB.  Only the
1227 * calling processor's TLB is affected.
1228 *
1229 * The calling thread must be pinned to a processor.
1230 */
1231static void
1232pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
1233{
1234	pt_entry_t PG_G;
1235
1236	if (pmap->pm_type == PT_EPT)
1237		return;
1238
1239	KASSERT(pmap->pm_type == PT_X86,
1240	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
1241
1242	PG_G = pmap_global_bit(pmap);
1243
1244	if ((newpde & PG_PS) == 0)
1245		/* Demotion: flush a specific 2MB page mapping. */
1246		invlpg(va);
1247	else if ((newpde & PG_G) == 0)
1248		/*
1249		 * Promotion: flush every 4KB page mapping from the TLB
1250		 * because there are too many to flush individually.
1251		 */
1252		invltlb();
1253	else {
1254		/*
1255		 * Promotion: flush every 4KB page mapping from the TLB,
1256		 * including any global (PG_G) mappings.
1257		 */
1258		invltlb_globpcid();
1259	}
1260}
1261#ifdef SMP
1262
1263static void
1264pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
1265{
1266	struct invpcid_descr d;
1267	uint64_t cr3;
1268
1269	if (invpcid_works) {
1270		d.pcid = pmap->pm_pcid;
1271		d.pad = 0;
1272		d.addr = va;
1273		invpcid(&d, INVPCID_ADDR);
1274		return;
1275	}
1276
1277	cr3 = rcr3();
1278	critical_enter();
1279	load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
1280	invlpg(va);
1281	load_cr3(cr3 | CR3_PCID_SAVE);
1282	critical_exit();
1283}
1284
1285/*
1286 * For SMP, these functions have to use the IPI mechanism for coherence.
1287 *
1288 * N.B.: Before calling any of the following TLB invalidation functions,
1289 * the calling processor must ensure that all stores updating a non-
1290 * kernel page table are globally performed.  Otherwise, another
1291 * processor could cache an old, pre-update entry without being
1292 * invalidated.  This can happen one of two ways: (1) The pmap becomes
1293 * active on another processor after its pm_active field is checked by
1294 * one of the following functions but before a store updating the page
1295 * table is globally performed. (2) The pmap becomes active on another
1296 * processor before its pm_active field is checked but due to
1297 * speculative loads one of the following functions stills reads the
1298 * pmap as inactive on the other processor.
1299 *
1300 * The kernel page table is exempt because its pm_active field is
1301 * immutable.  The kernel page table is always active on every
1302 * processor.
1303 */
1304
1305/*
1306 * Interrupt the cpus that are executing in the guest context.
1307 * This will force the vcpu to exit and the cached EPT mappings
1308 * will be invalidated by the host before the next vmresume.
1309 */
1310static __inline void
1311pmap_invalidate_ept(pmap_t pmap)
1312{
1313	int ipinum;
1314
1315	sched_pin();
1316	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1317	    ("pmap_invalidate_ept: absurd pm_active"));
1318
1319	/*
1320	 * The TLB mappings associated with a vcpu context are not
1321	 * flushed each time a different vcpu is chosen to execute.
1322	 *
1323	 * This is in contrast with a process's vtop mappings that
1324	 * are flushed from the TLB on each context switch.
1325	 *
1326	 * Therefore we need to do more than just a TLB shootdown on
1327	 * the active cpus in 'pmap->pm_active'. To do this we keep
1328	 * track of the number of invalidations performed on this pmap.
1329	 *
1330	 * Each vcpu keeps a cache of this counter and compares it
1331	 * just before a vmresume. If the counter is out-of-date an
1332	 * invept will be done to flush stale mappings from the TLB.
1333	 */
1334	atomic_add_acq_long(&pmap->pm_eptgen, 1);
1335
1336	/*
1337	 * Force the vcpu to exit and trap back into the hypervisor.
1338	 */
1339	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
1340	ipi_selected(pmap->pm_active, ipinum);
1341	sched_unpin();
1342}
1343
1344void
1345pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1346{
1347	cpuset_t other_cpus;
1348	u_int cpuid;
1349
1350	if (pmap->pm_type == PT_EPT) {
1351		pmap_invalidate_ept(pmap);
1352		return;
1353	}
1354
1355	KASSERT(pmap->pm_type == PT_X86,
1356	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
1357
1358	sched_pin();
1359	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1360		if (!pmap_pcid_enabled) {
1361			invlpg(va);
1362		} else {
1363			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1364				if (pmap == PCPU_GET(curpmap))
1365					invlpg(va);
1366				else
1367					pmap_invalidate_page_pcid(pmap, va);
1368			} else {
1369				invltlb_globpcid();
1370			}
1371		}
1372		smp_invlpg(pmap, va);
1373	} else {
1374		cpuid = PCPU_GET(cpuid);
1375		other_cpus = all_cpus;
1376		CPU_CLR(cpuid, &other_cpus);
1377		if (CPU_ISSET(cpuid, &pmap->pm_active))
1378			invlpg(va);
1379		else if (pmap_pcid_enabled) {
1380			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
1381				pmap_invalidate_page_pcid(pmap, va);
1382			else
1383				invltlb_globpcid();
1384		}
1385		if (pmap_pcid_enabled)
1386			CPU_AND(&other_cpus, &pmap->pm_save);
1387		else
1388			CPU_AND(&other_cpus, &pmap->pm_active);
1389		if (!CPU_EMPTY(&other_cpus))
1390			smp_masked_invlpg(other_cpus, pmap, va);
1391	}
1392	sched_unpin();
1393}
1394
1395static void
1396pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1397{
1398	struct invpcid_descr d;
1399	uint64_t cr3;
1400	vm_offset_t addr;
1401
1402	if (invpcid_works) {
1403		d.pcid = pmap->pm_pcid;
1404		d.pad = 0;
1405		for (addr = sva; addr < eva; addr += PAGE_SIZE) {
1406			d.addr = addr;
1407			invpcid(&d, INVPCID_ADDR);
1408		}
1409		return;
1410	}
1411
1412	cr3 = rcr3();
1413	critical_enter();
1414	load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
1415	for (addr = sva; addr < eva; addr += PAGE_SIZE)
1416		invlpg(addr);
1417	load_cr3(cr3 | CR3_PCID_SAVE);
1418	critical_exit();
1419}
1420
1421void
1422pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1423{
1424	cpuset_t other_cpus;
1425	vm_offset_t addr;
1426	u_int cpuid;
1427
1428	if (pmap->pm_type == PT_EPT) {
1429		pmap_invalidate_ept(pmap);
1430		return;
1431	}
1432
1433	KASSERT(pmap->pm_type == PT_X86,
1434	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
1435
1436	sched_pin();
1437	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1438		if (!pmap_pcid_enabled) {
1439			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1440				invlpg(addr);
1441		} else {
1442			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1443				if (pmap == PCPU_GET(curpmap)) {
1444					for (addr = sva; addr < eva;
1445					    addr += PAGE_SIZE)
1446						invlpg(addr);
1447				} else {
1448					pmap_invalidate_range_pcid(pmap,
1449					    sva, eva);
1450				}
1451			} else {
1452				invltlb_globpcid();
1453			}
1454		}
1455		smp_invlpg_range(pmap, sva, eva);
1456	} else {
1457		cpuid = PCPU_GET(cpuid);
1458		other_cpus = all_cpus;
1459		CPU_CLR(cpuid, &other_cpus);
1460		if (CPU_ISSET(cpuid, &pmap->pm_active)) {
1461			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1462				invlpg(addr);
1463		} else if (pmap_pcid_enabled) {
1464			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
1465				pmap_invalidate_range_pcid(pmap, sva, eva);
1466			else
1467				invltlb_globpcid();
1468		}
1469		if (pmap_pcid_enabled)
1470			CPU_AND(&other_cpus, &pmap->pm_save);
1471		else
1472			CPU_AND(&other_cpus, &pmap->pm_active);
1473		if (!CPU_EMPTY(&other_cpus))
1474			smp_masked_invlpg_range(other_cpus, pmap, sva, eva);
1475	}
1476	sched_unpin();
1477}
1478
1479void
1480pmap_invalidate_all(pmap_t pmap)
1481{
1482	cpuset_t other_cpus;
1483	struct invpcid_descr d;
1484	uint64_t cr3;
1485	u_int cpuid;
1486
1487	if (pmap->pm_type == PT_EPT) {
1488		pmap_invalidate_ept(pmap);
1489		return;
1490	}
1491
1492	KASSERT(pmap->pm_type == PT_X86,
1493	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
1494
1495	sched_pin();
1496	cpuid = PCPU_GET(cpuid);
1497	if (pmap == kernel_pmap ||
1498	    (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) ||
1499	    !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1500		if (invpcid_works) {
1501			bzero(&d, sizeof(d));
1502			invpcid(&d, INVPCID_CTXGLOB);
1503		} else {
1504			invltlb_globpcid();
1505		}
1506		if (!CPU_ISSET(cpuid, &pmap->pm_active))
1507			CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
1508		smp_invltlb(pmap);
1509	} else {
1510		other_cpus = all_cpus;
1511		CPU_CLR(cpuid, &other_cpus);
1512
1513		/*
1514		 * This logic is duplicated in the Xinvltlb shootdown
1515		 * IPI handler.
1516		 */
1517		if (pmap_pcid_enabled) {
1518			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1519				if (invpcid_works) {
1520					d.pcid = pmap->pm_pcid;
1521					d.pad = 0;
1522					d.addr = 0;
1523					invpcid(&d, INVPCID_CTX);
1524				} else {
1525					cr3 = rcr3();
1526					critical_enter();
1527
1528					/*
1529					 * Bit 63 is clear, pcid TLB
1530					 * entries are invalidated.
1531					 */
1532					load_cr3(pmap->pm_cr3);
1533					load_cr3(cr3 | CR3_PCID_SAVE);
1534					critical_exit();
1535				}
1536			} else {
1537				invltlb_globpcid();
1538			}
1539		} else if (CPU_ISSET(cpuid, &pmap->pm_active))
1540			invltlb();
1541		if (!CPU_ISSET(cpuid, &pmap->pm_active))
1542			CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
1543		if (pmap_pcid_enabled)
1544			CPU_AND(&other_cpus, &pmap->pm_save);
1545		else
1546			CPU_AND(&other_cpus, &pmap->pm_active);
1547		if (!CPU_EMPTY(&other_cpus))
1548			smp_masked_invltlb(other_cpus, pmap);
1549	}
1550	sched_unpin();
1551}
1552
1553void
1554pmap_invalidate_cache(void)
1555{
1556
1557	sched_pin();
1558	wbinvd();
1559	smp_cache_flush();
1560	sched_unpin();
1561}
1562
1563struct pde_action {
1564	cpuset_t invalidate;	/* processors that invalidate their TLB */
1565	pmap_t pmap;
1566	vm_offset_t va;
1567	pd_entry_t *pde;
1568	pd_entry_t newpde;
1569	u_int store;		/* processor that updates the PDE */
1570};
1571
1572static void
1573pmap_update_pde_action(void *arg)
1574{
1575	struct pde_action *act = arg;
1576
1577	if (act->store == PCPU_GET(cpuid))
1578		pmap_update_pde_store(act->pmap, act->pde, act->newpde);
1579}
1580
1581static void
1582pmap_update_pde_teardown(void *arg)
1583{
1584	struct pde_action *act = arg;
1585
1586	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1587		pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
1588}
1589
1590/*
1591 * Change the page size for the specified virtual address in a way that
1592 * prevents any possibility of the TLB ever having two entries that map the
1593 * same virtual address using different page sizes.  This is the recommended
1594 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1595 * machine check exception for a TLB state that is improperly diagnosed as a
1596 * hardware error.
1597 */
1598static void
1599pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1600{
1601	struct pde_action act;
1602	cpuset_t active, other_cpus;
1603	u_int cpuid;
1604
1605	sched_pin();
1606	cpuid = PCPU_GET(cpuid);
1607	other_cpus = all_cpus;
1608	CPU_CLR(cpuid, &other_cpus);
1609	if (pmap == kernel_pmap || pmap->pm_type == PT_EPT)
1610		active = all_cpus;
1611	else {
1612		active = pmap->pm_active;
1613		CPU_AND_ATOMIC(&pmap->pm_save, &active);
1614	}
1615	if (CPU_OVERLAP(&active, &other_cpus)) {
1616		act.store = cpuid;
1617		act.invalidate = active;
1618		act.va = va;
1619		act.pmap = pmap;
1620		act.pde = pde;
1621		act.newpde = newpde;
1622		CPU_SET(cpuid, &active);
1623		smp_rendezvous_cpus(active,
1624		    smp_no_rendevous_barrier, pmap_update_pde_action,
1625		    pmap_update_pde_teardown, &act);
1626	} else {
1627		pmap_update_pde_store(pmap, pde, newpde);
1628		if (CPU_ISSET(cpuid, &active))
1629			pmap_update_pde_invalidate(pmap, va, newpde);
1630	}
1631	sched_unpin();
1632}
1633#else /* !SMP */
1634/*
1635 * Normal, non-SMP, invalidation functions.
1636 * We inline these within pmap.c for speed.
1637 */
1638PMAP_INLINE void
1639pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1640{
1641
1642	switch (pmap->pm_type) {
1643	case PT_X86:
1644		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1645			invlpg(va);
1646		break;
1647	case PT_EPT:
1648		pmap->pm_eptgen++;
1649		break;
1650	default:
1651		panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type);
1652	}
1653}
1654
1655PMAP_INLINE void
1656pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1657{
1658	vm_offset_t addr;
1659
1660	switch (pmap->pm_type) {
1661	case PT_X86:
1662		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1663			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1664				invlpg(addr);
1665		break;
1666	case PT_EPT:
1667		pmap->pm_eptgen++;
1668		break;
1669	default:
1670		panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
1671	}
1672}
1673
1674PMAP_INLINE void
1675pmap_invalidate_all(pmap_t pmap)
1676{
1677
1678	switch (pmap->pm_type) {
1679	case PT_X86:
1680		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1681			invltlb();
1682		break;
1683	case PT_EPT:
1684		pmap->pm_eptgen++;
1685		break;
1686	default:
1687		panic("pmap_invalidate_all: unknown type %d", pmap->pm_type);
1688	}
1689}
1690
1691PMAP_INLINE void
1692pmap_invalidate_cache(void)
1693{
1694
1695	wbinvd();
1696}
1697
1698static void
1699pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1700{
1701
1702	pmap_update_pde_store(pmap, pde, newpde);
1703	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1704		pmap_update_pde_invalidate(pmap, va, newpde);
1705	else
1706		CPU_ZERO(&pmap->pm_save);
1707}
1708#endif /* !SMP */
1709
1710#define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
1711
1712void
1713pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1714{
1715
1716	KASSERT((sva & PAGE_MASK) == 0,
1717	    ("pmap_invalidate_cache_range: sva not page-aligned"));
1718	KASSERT((eva & PAGE_MASK) == 0,
1719	    ("pmap_invalidate_cache_range: eva not page-aligned"));
1720
1721	if (cpu_feature & CPUID_SS)
1722		; /* If "Self Snoop" is supported, do nothing. */
1723	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1724	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1725
1726		/*
1727		 * XXX: Some CPUs fault, hang, or trash the local APIC
1728		 * registers if we use CLFLUSH on the local APIC
1729		 * range.  The local APIC is always uncached, so we
1730		 * don't need to flush for that range anyway.
1731		 */
1732		if (pmap_kextract(sva) == lapic_paddr)
1733			return;
1734
1735		/*
1736		 * Otherwise, do per-cache line flush.  Use the mfence
1737		 * instruction to insure that previous stores are
1738		 * included in the write-back.  The processor
1739		 * propagates flush to other processors in the cache
1740		 * coherence domain.
1741		 */
1742		mfence();
1743		for (; sva < eva; sva += cpu_clflush_line_size)
1744			clflush(sva);
1745		mfence();
1746	} else {
1747
1748		/*
1749		 * No targeted cache flush methods are supported by CPU,
1750		 * or the supplied range is bigger than 2MB.
1751		 * Globally invalidate cache.
1752		 */
1753		pmap_invalidate_cache();
1754	}
1755}
1756
1757/*
1758 * Remove the specified set of pages from the data and instruction caches.
1759 *
1760 * In contrast to pmap_invalidate_cache_range(), this function does not
1761 * rely on the CPU's self-snoop feature, because it is intended for use
1762 * when moving pages into a different cache domain.
1763 */
1764void
1765pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1766{
1767	vm_offset_t daddr, eva;
1768	int i;
1769
1770	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1771	    (cpu_feature & CPUID_CLFSH) == 0)
1772		pmap_invalidate_cache();
1773	else {
1774		mfence();
1775		for (i = 0; i < count; i++) {
1776			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
1777			eva = daddr + PAGE_SIZE;
1778			for (; daddr < eva; daddr += cpu_clflush_line_size)
1779				clflush(daddr);
1780		}
1781		mfence();
1782	}
1783}
1784
1785/*
1786 *	Routine:	pmap_extract
1787 *	Function:
1788 *		Extract the physical page address associated
1789 *		with the given map/virtual_address pair.
1790 */
1791vm_paddr_t
1792pmap_extract(pmap_t pmap, vm_offset_t va)
1793{
1794	pdp_entry_t *pdpe;
1795	pd_entry_t *pde;
1796	pt_entry_t *pte, PG_V;
1797	vm_paddr_t pa;
1798
1799	pa = 0;
1800	PG_V = pmap_valid_bit(pmap);
1801	PMAP_LOCK(pmap);
1802	pdpe = pmap_pdpe(pmap, va);
1803	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1804		if ((*pdpe & PG_PS) != 0)
1805			pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
1806		else {
1807			pde = pmap_pdpe_to_pde(pdpe, va);
1808			if ((*pde & PG_V) != 0) {
1809				if ((*pde & PG_PS) != 0) {
1810					pa = (*pde & PG_PS_FRAME) |
1811					    (va & PDRMASK);
1812				} else {
1813					pte = pmap_pde_to_pte(pde, va);
1814					pa = (*pte & PG_FRAME) |
1815					    (va & PAGE_MASK);
1816				}
1817			}
1818		}
1819	}
1820	PMAP_UNLOCK(pmap);
1821	return (pa);
1822}
1823
1824/*
1825 *	Routine:	pmap_extract_and_hold
1826 *	Function:
1827 *		Atomically extract and hold the physical page
1828 *		with the given pmap and virtual address pair
1829 *		if that mapping permits the given protection.
1830 */
1831vm_page_t
1832pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1833{
1834	pd_entry_t pde, *pdep;
1835	pt_entry_t pte, PG_RW, PG_V;
1836	vm_paddr_t pa;
1837	vm_page_t m;
1838
1839	pa = 0;
1840	m = NULL;
1841	PG_RW = pmap_rw_bit(pmap);
1842	PG_V = pmap_valid_bit(pmap);
1843	PMAP_LOCK(pmap);
1844retry:
1845	pdep = pmap_pde(pmap, va);
1846	if (pdep != NULL && (pde = *pdep)) {
1847		if (pde & PG_PS) {
1848			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1849				if (vm_page_pa_tryrelock(pmap, (pde &
1850				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1851					goto retry;
1852				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1853				    (va & PDRMASK));
1854				vm_page_hold(m);
1855			}
1856		} else {
1857			pte = *pmap_pde_to_pte(pdep, va);
1858			if ((pte & PG_V) &&
1859			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1860				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1861				    &pa))
1862					goto retry;
1863				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1864				vm_page_hold(m);
1865			}
1866		}
1867	}
1868	PA_UNLOCK_COND(pa);
1869	PMAP_UNLOCK(pmap);
1870	return (m);
1871}
1872
1873vm_paddr_t
1874pmap_kextract(vm_offset_t va)
1875{
1876	pd_entry_t pde;
1877	vm_paddr_t pa;
1878
1879	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1880		pa = DMAP_TO_PHYS(va);
1881	} else {
1882		pde = *vtopde(va);
1883		if (pde & PG_PS) {
1884			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
1885		} else {
1886			/*
1887			 * Beware of a concurrent promotion that changes the
1888			 * PDE at this point!  For example, vtopte() must not
1889			 * be used to access the PTE because it would use the
1890			 * new PDE.  It is, however, safe to use the old PDE
1891			 * because the page table page is preserved by the
1892			 * promotion.
1893			 */
1894			pa = *pmap_pde_to_pte(&pde, va);
1895			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
1896		}
1897	}
1898	return (pa);
1899}
1900
1901/***************************************************
1902 * Low level mapping routines.....
1903 ***************************************************/
1904
1905/*
1906 * Add a wired page to the kva.
1907 * Note: not SMP coherent.
1908 */
1909PMAP_INLINE void
1910pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1911{
1912	pt_entry_t *pte;
1913
1914	pte = vtopte(va);
1915	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
1916}
1917
1918static __inline void
1919pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1920{
1921	pt_entry_t *pte;
1922	int cache_bits;
1923
1924	pte = vtopte(va);
1925	cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
1926	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
1927}
1928
1929/*
1930 * Remove a page from the kernel pagetables.
1931 * Note: not SMP coherent.
1932 */
1933PMAP_INLINE void
1934pmap_kremove(vm_offset_t va)
1935{
1936	pt_entry_t *pte;
1937
1938	pte = vtopte(va);
1939	pte_clear(pte);
1940}
1941
1942/*
1943 *	Used to map a range of physical addresses into kernel
1944 *	virtual address space.
1945 *
1946 *	The value passed in '*virt' is a suggested virtual address for
1947 *	the mapping. Architectures which can support a direct-mapped
1948 *	physical to virtual region can return the appropriate address
1949 *	within that region, leaving '*virt' unchanged. Other
1950 *	architectures should map the pages starting at '*virt' and
1951 *	update '*virt' with the first usable address after the mapped
1952 *	region.
1953 */
1954vm_offset_t
1955pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1956{
1957	return PHYS_TO_DMAP(start);
1958}
1959
1960
1961/*
1962 * Add a list of wired pages to the kva
1963 * this routine is only used for temporary
1964 * kernel mappings that do not need to have
1965 * page modification or references recorded.
1966 * Note that old mappings are simply written
1967 * over.  The page *must* be wired.
1968 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1969 */
1970void
1971pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1972{
1973	pt_entry_t *endpte, oldpte, pa, *pte;
1974	vm_page_t m;
1975	int cache_bits;
1976
1977	oldpte = 0;
1978	pte = vtopte(sva);
1979	endpte = pte + count;
1980	while (pte < endpte) {
1981		m = *ma++;
1982		cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
1983		pa = VM_PAGE_TO_PHYS(m) | cache_bits;
1984		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
1985			oldpte |= *pte;
1986			pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
1987		}
1988		pte++;
1989	}
1990	if (__predict_false((oldpte & X86_PG_V) != 0))
1991		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1992		    PAGE_SIZE);
1993}
1994
1995/*
1996 * This routine tears out page mappings from the
1997 * kernel -- it is meant only for temporary mappings.
1998 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1999 */
2000void
2001pmap_qremove(vm_offset_t sva, int count)
2002{
2003	vm_offset_t va;
2004
2005	va = sva;
2006	while (count-- > 0) {
2007		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
2008		pmap_kremove(va);
2009		va += PAGE_SIZE;
2010	}
2011	pmap_invalidate_range(kernel_pmap, sva, va);
2012}
2013
2014/***************************************************
2015 * Page table page management routines.....
2016 ***************************************************/
2017static __inline void
2018pmap_free_zero_pages(struct spglist *free)
2019{
2020	vm_page_t m;
2021
2022	while ((m = SLIST_FIRST(free)) != NULL) {
2023		SLIST_REMOVE_HEAD(free, plinks.s.ss);
2024		/* Preserve the page's PG_ZERO setting. */
2025		vm_page_free_toq(m);
2026	}
2027}
2028
2029/*
2030 * Schedule the specified unused page table page to be freed.  Specifically,
2031 * add the page to the specified list of pages that will be released to the
2032 * physical memory manager after the TLB has been updated.
2033 */
2034static __inline void
2035pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
2036    boolean_t set_PG_ZERO)
2037{
2038
2039	if (set_PG_ZERO)
2040		m->flags |= PG_ZERO;
2041	else
2042		m->flags &= ~PG_ZERO;
2043	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2044}
2045
2046/*
2047 * Inserts the specified page table page into the specified pmap's collection
2048 * of idle page table pages.  Each of a pmap's page table pages is responsible
2049 * for mapping a distinct range of virtual addresses.  The pmap's collection is
2050 * ordered by this virtual address range.
2051 */
2052static __inline int
2053pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
2054{
2055
2056	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2057	return (vm_radix_insert(&pmap->pm_root, mpte));
2058}
2059
2060/*
2061 * Looks for a page table page mapping the specified virtual address in the
2062 * specified pmap's collection of idle page table pages.  Returns NULL if there
2063 * is no page table page corresponding to the specified virtual address.
2064 */
2065static __inline vm_page_t
2066pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
2067{
2068
2069	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2070	return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
2071}
2072
2073/*
2074 * Removes the specified page table page from the specified pmap's collection
2075 * of idle page table pages.  The specified page table page must be a member of
2076 * the pmap's collection.
2077 */
2078static __inline void
2079pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
2080{
2081
2082	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2083	vm_radix_remove(&pmap->pm_root, mpte->pindex);
2084}
2085
2086/*
2087 * Decrements a page table page's wire count, which is used to record the
2088 * number of valid page table entries within the page.  If the wire count
2089 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
2090 * page table page was unmapped and FALSE otherwise.
2091 */
2092static inline boolean_t
2093pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2094{
2095
2096	--m->wire_count;
2097	if (m->wire_count == 0) {
2098		_pmap_unwire_ptp(pmap, va, m, free);
2099		return (TRUE);
2100	} else
2101		return (FALSE);
2102}
2103
2104static void
2105_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2106{
2107
2108	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2109	/*
2110	 * unmap the page table page
2111	 */
2112	if (m->pindex >= (NUPDE + NUPDPE)) {
2113		/* PDP page */
2114		pml4_entry_t *pml4;
2115		pml4 = pmap_pml4e(pmap, va);
2116		*pml4 = 0;
2117	} else if (m->pindex >= NUPDE) {
2118		/* PD page */
2119		pdp_entry_t *pdp;
2120		pdp = pmap_pdpe(pmap, va);
2121		*pdp = 0;
2122	} else {
2123		/* PTE page */
2124		pd_entry_t *pd;
2125		pd = pmap_pde(pmap, va);
2126		*pd = 0;
2127	}
2128	pmap_resident_count_dec(pmap, 1);
2129	if (m->pindex < NUPDE) {
2130		/* We just released a PT, unhold the matching PD */
2131		vm_page_t pdpg;
2132
2133		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
2134		pmap_unwire_ptp(pmap, va, pdpg, free);
2135	}
2136	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
2137		/* We just released a PD, unhold the matching PDP */
2138		vm_page_t pdppg;
2139
2140		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
2141		pmap_unwire_ptp(pmap, va, pdppg, free);
2142	}
2143
2144	/*
2145	 * This is a release store so that the ordinary store unmapping
2146	 * the page table page is globally performed before TLB shoot-
2147	 * down is begun.
2148	 */
2149	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
2150
2151	/*
2152	 * Put page on a list so that it is released after
2153	 * *ALL* TLB shootdown is done
2154	 */
2155	pmap_add_delayed_free_list(m, free, TRUE);
2156}
2157
2158/*
2159 * After removing a page table entry, this routine is used to
2160 * conditionally free the page, and manage the hold/wire counts.
2161 */
2162static int
2163pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2164    struct spglist *free)
2165{
2166	vm_page_t mpte;
2167
2168	if (va >= VM_MAXUSER_ADDRESS)
2169		return (0);
2170	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2171	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
2172	return (pmap_unwire_ptp(pmap, va, mpte, free));
2173}
2174
2175void
2176pmap_pinit0(pmap_t pmap)
2177{
2178
2179	PMAP_LOCK_INIT(pmap);
2180	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
2181	pmap->pm_cr3 = KPML4phys;
2182	pmap->pm_root.rt_root = 0;
2183	CPU_ZERO(&pmap->pm_active);
2184	CPU_ZERO(&pmap->pm_save);
2185	PCPU_SET(curpmap, pmap);
2186	TAILQ_INIT(&pmap->pm_pvchunk);
2187	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2188	pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
2189	pmap->pm_flags = pmap_flags;
2190}
2191
2192/*
2193 * Initialize a preallocated and zeroed pmap structure,
2194 * such as one in a vmspace structure.
2195 */
2196int
2197pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
2198{
2199	vm_page_t pml4pg;
2200	vm_paddr_t pml4phys;
2201	int i;
2202
2203	/*
2204	 * allocate the page directory page
2205	 */
2206	while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2207	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
2208		VM_WAIT;
2209
2210	pml4phys = VM_PAGE_TO_PHYS(pml4pg);
2211	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
2212	pmap->pm_pcid = -1;
2213	pmap->pm_cr3 = ~0;	/* initialize to an invalid value */
2214
2215	if ((pml4pg->flags & PG_ZERO) == 0)
2216		pagezero(pmap->pm_pml4);
2217
2218	/*
2219	 * Do not install the host kernel mappings in the nested page
2220	 * tables. These mappings are meaningless in the guest physical
2221	 * address space.
2222	 */
2223	if ((pmap->pm_type = pm_type) == PT_X86) {
2224		pmap->pm_cr3 = pml4phys;
2225
2226		/* Wire in kernel global address entries. */
2227		for (i = 0; i < NKPML4E; i++) {
2228			pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) |
2229			    X86_PG_RW | X86_PG_V | PG_U;
2230		}
2231		for (i = 0; i < ndmpdpphys; i++) {
2232			pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) |
2233			    X86_PG_RW | X86_PG_V | PG_U;
2234		}
2235
2236		/* install self-referential address mapping entry(s) */
2237		pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
2238		    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2239
2240		if (pmap_pcid_enabled) {
2241			pmap->pm_pcid = alloc_unr(&pcid_unr);
2242			if (pmap->pm_pcid != -1)
2243				pmap->pm_cr3 |= pmap->pm_pcid;
2244		}
2245	}
2246
2247	pmap->pm_root.rt_root = 0;
2248	CPU_ZERO(&pmap->pm_active);
2249	TAILQ_INIT(&pmap->pm_pvchunk);
2250	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2251	pmap->pm_flags = flags;
2252	pmap->pm_eptgen = 0;
2253	CPU_ZERO(&pmap->pm_save);
2254
2255	return (1);
2256}
2257
2258int
2259pmap_pinit(pmap_t pmap)
2260{
2261
2262	return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
2263}
2264
2265/*
2266 * This routine is called if the desired page table page does not exist.
2267 *
2268 * If page table page allocation fails, this routine may sleep before
2269 * returning NULL.  It sleeps only if a lock pointer was given.
2270 *
2271 * Note: If a page allocation fails at page table level two or three,
2272 * one or two pages may be held during the wait, only to be released
2273 * afterwards.  This conservative approach is easily argued to avoid
2274 * race conditions.
2275 */
2276static vm_page_t
2277_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2278{
2279	vm_page_t m, pdppg, pdpg;
2280	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
2281
2282	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2283
2284	PG_A = pmap_accessed_bit(pmap);
2285	PG_M = pmap_modified_bit(pmap);
2286	PG_V = pmap_valid_bit(pmap);
2287	PG_RW = pmap_rw_bit(pmap);
2288
2289	/*
2290	 * Allocate a page table page.
2291	 */
2292	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
2293	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2294		if (lockp != NULL) {
2295			RELEASE_PV_LIST_LOCK(lockp);
2296			PMAP_UNLOCK(pmap);
2297			rw_runlock(&pvh_global_lock);
2298			VM_WAIT;
2299			rw_rlock(&pvh_global_lock);
2300			PMAP_LOCK(pmap);
2301		}
2302
2303		/*
2304		 * Indicate the need to retry.  While waiting, the page table
2305		 * page may have been allocated.
2306		 */
2307		return (NULL);
2308	}
2309	if ((m->flags & PG_ZERO) == 0)
2310		pmap_zero_page(m);
2311
2312	/*
2313	 * Map the pagetable page into the process address space, if
2314	 * it isn't already there.
2315	 */
2316
2317	if (ptepindex >= (NUPDE + NUPDPE)) {
2318		pml4_entry_t *pml4;
2319		vm_pindex_t pml4index;
2320
2321		/* Wire up a new PDPE page */
2322		pml4index = ptepindex - (NUPDE + NUPDPE);
2323		pml4 = &pmap->pm_pml4[pml4index];
2324		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2325
2326	} else if (ptepindex >= NUPDE) {
2327		vm_pindex_t pml4index;
2328		vm_pindex_t pdpindex;
2329		pml4_entry_t *pml4;
2330		pdp_entry_t *pdp;
2331
2332		/* Wire up a new PDE page */
2333		pdpindex = ptepindex - NUPDE;
2334		pml4index = pdpindex >> NPML4EPGSHIFT;
2335
2336		pml4 = &pmap->pm_pml4[pml4index];
2337		if ((*pml4 & PG_V) == 0) {
2338			/* Have to allocate a new pdp, recurse */
2339			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
2340			    lockp) == NULL) {
2341				--m->wire_count;
2342				atomic_subtract_int(&cnt.v_wire_count, 1);
2343				vm_page_free_zero(m);
2344				return (NULL);
2345			}
2346		} else {
2347			/* Add reference to pdp page */
2348			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
2349			pdppg->wire_count++;
2350		}
2351		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2352
2353		/* Now find the pdp page */
2354		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2355		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2356
2357	} else {
2358		vm_pindex_t pml4index;
2359		vm_pindex_t pdpindex;
2360		pml4_entry_t *pml4;
2361		pdp_entry_t *pdp;
2362		pd_entry_t *pd;
2363
2364		/* Wire up a new PTE page */
2365		pdpindex = ptepindex >> NPDPEPGSHIFT;
2366		pml4index = pdpindex >> NPML4EPGSHIFT;
2367
2368		/* First, find the pdp and check that its valid. */
2369		pml4 = &pmap->pm_pml4[pml4index];
2370		if ((*pml4 & PG_V) == 0) {
2371			/* Have to allocate a new pd, recurse */
2372			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2373			    lockp) == NULL) {
2374				--m->wire_count;
2375				atomic_subtract_int(&cnt.v_wire_count, 1);
2376				vm_page_free_zero(m);
2377				return (NULL);
2378			}
2379			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2380			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2381		} else {
2382			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2383			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2384			if ((*pdp & PG_V) == 0) {
2385				/* Have to allocate a new pd, recurse */
2386				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2387				    lockp) == NULL) {
2388					--m->wire_count;
2389					atomic_subtract_int(&cnt.v_wire_count,
2390					    1);
2391					vm_page_free_zero(m);
2392					return (NULL);
2393				}
2394			} else {
2395				/* Add reference to the pd page */
2396				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
2397				pdpg->wire_count++;
2398			}
2399		}
2400		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
2401
2402		/* Now we know where the page directory page is */
2403		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
2404		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2405	}
2406
2407	pmap_resident_count_inc(pmap, 1);
2408
2409	return (m);
2410}
2411
2412static vm_page_t
2413pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2414{
2415	vm_pindex_t pdpindex, ptepindex;
2416	pdp_entry_t *pdpe, PG_V;
2417	vm_page_t pdpg;
2418
2419	PG_V = pmap_valid_bit(pmap);
2420
2421retry:
2422	pdpe = pmap_pdpe(pmap, va);
2423	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
2424		/* Add a reference to the pd page. */
2425		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
2426		pdpg->wire_count++;
2427	} else {
2428		/* Allocate a pd page. */
2429		ptepindex = pmap_pde_pindex(va);
2430		pdpindex = ptepindex >> NPDPEPGSHIFT;
2431		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
2432		if (pdpg == NULL && lockp != NULL)
2433			goto retry;
2434	}
2435	return (pdpg);
2436}
2437
2438static vm_page_t
2439pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2440{
2441	vm_pindex_t ptepindex;
2442	pd_entry_t *pd, PG_V;
2443	vm_page_t m;
2444
2445	PG_V = pmap_valid_bit(pmap);
2446
2447	/*
2448	 * Calculate pagetable page index
2449	 */
2450	ptepindex = pmap_pde_pindex(va);
2451retry:
2452	/*
2453	 * Get the page directory entry
2454	 */
2455	pd = pmap_pde(pmap, va);
2456
2457	/*
2458	 * This supports switching from a 2MB page to a
2459	 * normal 4K page.
2460	 */
2461	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
2462		if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
2463			/*
2464			 * Invalidation of the 2MB page mapping may have caused
2465			 * the deallocation of the underlying PD page.
2466			 */
2467			pd = NULL;
2468		}
2469	}
2470
2471	/*
2472	 * If the page table page is mapped, we just increment the
2473	 * hold count, and activate it.
2474	 */
2475	if (pd != NULL && (*pd & PG_V) != 0) {
2476		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
2477		m->wire_count++;
2478	} else {
2479		/*
2480		 * Here if the pte page isn't mapped, or if it has been
2481		 * deallocated.
2482		 */
2483		m = _pmap_allocpte(pmap, ptepindex, lockp);
2484		if (m == NULL && lockp != NULL)
2485			goto retry;
2486	}
2487	return (m);
2488}
2489
2490
2491/***************************************************
2492 * Pmap allocation/deallocation routines.
2493 ***************************************************/
2494
2495/*
2496 * Release any resources held by the given physical map.
2497 * Called when a pmap initialized by pmap_pinit is being released.
2498 * Should only be called if the map contains no valid mappings.
2499 */
2500void
2501pmap_release(pmap_t pmap)
2502{
2503	vm_page_t m;
2504	int i;
2505
2506	KASSERT(pmap->pm_stats.resident_count == 0,
2507	    ("pmap_release: pmap resident count %ld != 0",
2508	    pmap->pm_stats.resident_count));
2509	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2510	    ("pmap_release: pmap has reserved page table page(s)"));
2511
2512	if (pmap_pcid_enabled) {
2513		/*
2514		 * Invalidate any left TLB entries, to allow the reuse
2515		 * of the pcid.
2516		 */
2517		pmap_invalidate_all(pmap);
2518	}
2519
2520	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
2521
2522	for (i = 0; i < NKPML4E; i++)	/* KVA */
2523		pmap->pm_pml4[KPML4BASE + i] = 0;
2524	for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
2525		pmap->pm_pml4[DMPML4I + i] = 0;
2526	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
2527
2528	m->wire_count--;
2529	atomic_subtract_int(&cnt.v_wire_count, 1);
2530	vm_page_free_zero(m);
2531	if (pmap->pm_pcid != -1)
2532		free_unr(&pcid_unr, pmap->pm_pcid);
2533}
2534
2535static int
2536kvm_size(SYSCTL_HANDLER_ARGS)
2537{
2538	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2539
2540	return sysctl_handle_long(oidp, &ksize, 0, req);
2541}
2542SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2543    0, 0, kvm_size, "LU", "Size of KVM");
2544
2545static int
2546kvm_free(SYSCTL_HANDLER_ARGS)
2547{
2548	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2549
2550	return sysctl_handle_long(oidp, &kfree, 0, req);
2551}
2552SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2553    0, 0, kvm_free, "LU", "Amount of KVM free");
2554
2555/*
2556 * grow the number of kernel page table entries, if needed
2557 */
2558void
2559pmap_growkernel(vm_offset_t addr)
2560{
2561	vm_paddr_t paddr;
2562	vm_page_t nkpg;
2563	pd_entry_t *pde, newpdir;
2564	pdp_entry_t *pdpe;
2565
2566	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2567
2568	/*
2569	 * Return if "addr" is within the range of kernel page table pages
2570	 * that were preallocated during pmap bootstrap.  Moreover, leave
2571	 * "kernel_vm_end" and the kernel page table as they were.
2572	 *
2573	 * The correctness of this action is based on the following
2574	 * argument: vm_map_findspace() allocates contiguous ranges of the
2575	 * kernel virtual address space.  It calls this function if a range
2576	 * ends after "kernel_vm_end".  If the kernel is mapped between
2577	 * "kernel_vm_end" and "addr", then the range cannot begin at
2578	 * "kernel_vm_end".  In fact, its beginning address cannot be less
2579	 * than the kernel.  Thus, there is no immediate need to allocate
2580	 * any new kernel page table pages between "kernel_vm_end" and
2581	 * "KERNBASE".
2582	 */
2583	if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
2584		return;
2585
2586	addr = roundup2(addr, NBPDR);
2587	if (addr - 1 >= kernel_map->max_offset)
2588		addr = kernel_map->max_offset;
2589	while (kernel_vm_end < addr) {
2590		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
2591		if ((*pdpe & X86_PG_V) == 0) {
2592			/* We need a new PDP entry */
2593			nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
2594			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
2595			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2596			if (nkpg == NULL)
2597				panic("pmap_growkernel: no memory to grow kernel");
2598			if ((nkpg->flags & PG_ZERO) == 0)
2599				pmap_zero_page(nkpg);
2600			paddr = VM_PAGE_TO_PHYS(nkpg);
2601			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
2602			    X86_PG_A | X86_PG_M);
2603			continue; /* try again */
2604		}
2605		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
2606		if ((*pde & X86_PG_V) != 0) {
2607			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2608			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2609				kernel_vm_end = kernel_map->max_offset;
2610				break;
2611			}
2612			continue;
2613		}
2614
2615		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
2616		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2617		    VM_ALLOC_ZERO);
2618		if (nkpg == NULL)
2619			panic("pmap_growkernel: no memory to grow kernel");
2620		if ((nkpg->flags & PG_ZERO) == 0)
2621			pmap_zero_page(nkpg);
2622		paddr = VM_PAGE_TO_PHYS(nkpg);
2623		newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2624		pde_store(pde, newpdir);
2625
2626		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2627		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2628			kernel_vm_end = kernel_map->max_offset;
2629			break;
2630		}
2631	}
2632}
2633
2634
2635/***************************************************
2636 * page management routines.
2637 ***************************************************/
2638
2639CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2640CTASSERT(_NPCM == 3);
2641CTASSERT(_NPCPV == 168);
2642
2643static __inline struct pv_chunk *
2644pv_to_chunk(pv_entry_t pv)
2645{
2646
2647	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2648}
2649
2650#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2651
2652#define	PC_FREE0	0xfffffffffffffffful
2653#define	PC_FREE1	0xfffffffffffffffful
2654#define	PC_FREE2	0x000000fffffffffful
2655
2656static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
2657
2658#ifdef PV_STATS
2659static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2660
2661SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2662	"Current number of pv entry chunks");
2663SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2664	"Current number of pv entry chunks allocated");
2665SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2666	"Current number of pv entry chunks frees");
2667SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2668	"Number of times tried to get a chunk page but failed.");
2669
2670static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2671static int pv_entry_spare;
2672
2673SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2674	"Current number of pv entry frees");
2675SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2676	"Current number of pv entry allocs");
2677SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2678	"Current number of pv entries");
2679SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2680	"Current number of spare pv entries");
2681#endif
2682
2683/*
2684 * We are in a serious low memory condition.  Resort to
2685 * drastic measures to free some pages so we can allocate
2686 * another pv entry chunk.
2687 *
2688 * Returns NULL if PV entries were reclaimed from the specified pmap.
2689 *
2690 * We do not, however, unmap 2mpages because subsequent accesses will
2691 * allocate per-page pv entries until repromotion occurs, thereby
2692 * exacerbating the shortage of free pv entries.
2693 */
2694static vm_page_t
2695reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2696{
2697	struct pch new_tail;
2698	struct pv_chunk *pc;
2699	struct md_page *pvh;
2700	pd_entry_t *pde;
2701	pmap_t pmap;
2702	pt_entry_t *pte, tpte;
2703	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
2704	pv_entry_t pv;
2705	vm_offset_t va;
2706	vm_page_t m, m_pc;
2707	struct spglist free;
2708	uint64_t inuse;
2709	int bit, field, freed;
2710
2711	rw_assert(&pvh_global_lock, RA_LOCKED);
2712	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2713	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
2714	pmap = NULL;
2715	m_pc = NULL;
2716	PG_G = PG_A = PG_M = PG_RW = 0;
2717	SLIST_INIT(&free);
2718	TAILQ_INIT(&new_tail);
2719	mtx_lock(&pv_chunks_mutex);
2720	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
2721		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2722		mtx_unlock(&pv_chunks_mutex);
2723		if (pmap != pc->pc_pmap) {
2724			if (pmap != NULL) {
2725				pmap_invalidate_all(pmap);
2726				if (pmap != locked_pmap)
2727					PMAP_UNLOCK(pmap);
2728			}
2729			pmap = pc->pc_pmap;
2730			/* Avoid deadlock and lock recursion. */
2731			if (pmap > locked_pmap) {
2732				RELEASE_PV_LIST_LOCK(lockp);
2733				PMAP_LOCK(pmap);
2734			} else if (pmap != locked_pmap &&
2735			    !PMAP_TRYLOCK(pmap)) {
2736				pmap = NULL;
2737				TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2738				mtx_lock(&pv_chunks_mutex);
2739				continue;
2740			}
2741			PG_G = pmap_global_bit(pmap);
2742			PG_A = pmap_accessed_bit(pmap);
2743			PG_M = pmap_modified_bit(pmap);
2744			PG_RW = pmap_rw_bit(pmap);
2745		}
2746
2747		/*
2748		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2749		 */
2750		freed = 0;
2751		for (field = 0; field < _NPCM; field++) {
2752			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2753			    inuse != 0; inuse &= ~(1UL << bit)) {
2754				bit = bsfq(inuse);
2755				pv = &pc->pc_pventry[field * 64 + bit];
2756				va = pv->pv_va;
2757				pde = pmap_pde(pmap, va);
2758				if ((*pde & PG_PS) != 0)
2759					continue;
2760				pte = pmap_pde_to_pte(pde, va);
2761				if ((*pte & PG_W) != 0)
2762					continue;
2763				tpte = pte_load_clear(pte);
2764				if ((tpte & PG_G) != 0)
2765					pmap_invalidate_page(pmap, va);
2766				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2767				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2768					vm_page_dirty(m);
2769				if ((tpte & PG_A) != 0)
2770					vm_page_aflag_set(m, PGA_REFERENCED);
2771				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2772				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2773				m->md.pv_gen++;
2774				if (TAILQ_EMPTY(&m->md.pv_list) &&
2775				    (m->flags & PG_FICTITIOUS) == 0) {
2776					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2777					if (TAILQ_EMPTY(&pvh->pv_list)) {
2778						vm_page_aflag_clear(m,
2779						    PGA_WRITEABLE);
2780					}
2781				}
2782				pc->pc_map[field] |= 1UL << bit;
2783				pmap_unuse_pt(pmap, va, *pde, &free);
2784				freed++;
2785			}
2786		}
2787		if (freed == 0) {
2788			TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2789			mtx_lock(&pv_chunks_mutex);
2790			continue;
2791		}
2792		/* Every freed mapping is for a 4 KB page. */
2793		pmap_resident_count_dec(pmap, freed);
2794		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
2795		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
2796		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
2797		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2798		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
2799		    pc->pc_map[2] == PC_FREE2) {
2800			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2801			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2802			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2803			/* Entire chunk is free; return it. */
2804			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2805			dump_drop_page(m_pc->phys_addr);
2806			mtx_lock(&pv_chunks_mutex);
2807			break;
2808		}
2809		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2810		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2811		mtx_lock(&pv_chunks_mutex);
2812		/* One freed pv entry in locked_pmap is sufficient. */
2813		if (pmap == locked_pmap)
2814			break;
2815	}
2816	TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2817	mtx_unlock(&pv_chunks_mutex);
2818	if (pmap != NULL) {
2819		pmap_invalidate_all(pmap);
2820		if (pmap != locked_pmap)
2821			PMAP_UNLOCK(pmap);
2822	}
2823	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
2824		m_pc = SLIST_FIRST(&free);
2825		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2826		/* Recycle a freed page table page. */
2827		m_pc->wire_count = 1;
2828		atomic_add_int(&cnt.v_wire_count, 1);
2829	}
2830	pmap_free_zero_pages(&free);
2831	return (m_pc);
2832}
2833
2834/*
2835 * free the pv_entry back to the free list
2836 */
2837static void
2838free_pv_entry(pmap_t pmap, pv_entry_t pv)
2839{
2840	struct pv_chunk *pc;
2841	int idx, field, bit;
2842
2843	rw_assert(&pvh_global_lock, RA_LOCKED);
2844	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2845	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2846	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2847	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2848	pc = pv_to_chunk(pv);
2849	idx = pv - &pc->pc_pventry[0];
2850	field = idx / 64;
2851	bit = idx % 64;
2852	pc->pc_map[field] |= 1ul << bit;
2853	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
2854	    pc->pc_map[2] != PC_FREE2) {
2855		/* 98% of the time, pc is already at the head of the list. */
2856		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2857			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2858			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2859		}
2860		return;
2861	}
2862	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2863	free_pv_chunk(pc);
2864}
2865
2866static void
2867free_pv_chunk(struct pv_chunk *pc)
2868{
2869	vm_page_t m;
2870
2871	mtx_lock(&pv_chunks_mutex);
2872 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2873	mtx_unlock(&pv_chunks_mutex);
2874	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2875	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2876	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2877	/* entire chunk is free, return it */
2878	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2879	dump_drop_page(m->phys_addr);
2880	vm_page_unwire(m, 0);
2881	vm_page_free(m);
2882}
2883
2884/*
2885 * Returns a new PV entry, allocating a new PV chunk from the system when
2886 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
2887 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
2888 * returned.
2889 *
2890 * The given PV list lock may be released.
2891 */
2892static pv_entry_t
2893get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2894{
2895	int bit, field;
2896	pv_entry_t pv;
2897	struct pv_chunk *pc;
2898	vm_page_t m;
2899
2900	rw_assert(&pvh_global_lock, RA_LOCKED);
2901	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2902	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2903retry:
2904	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2905	if (pc != NULL) {
2906		for (field = 0; field < _NPCM; field++) {
2907			if (pc->pc_map[field]) {
2908				bit = bsfq(pc->pc_map[field]);
2909				break;
2910			}
2911		}
2912		if (field < _NPCM) {
2913			pv = &pc->pc_pventry[field * 64 + bit];
2914			pc->pc_map[field] &= ~(1ul << bit);
2915			/* If this was the last item, move it to tail */
2916			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
2917			    pc->pc_map[2] == 0) {
2918				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2919				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2920				    pc_list);
2921			}
2922			PV_STAT(atomic_add_long(&pv_entry_count, 1));
2923			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2924			return (pv);
2925		}
2926	}
2927	/* No free items, allocate another chunk */
2928	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2929	    VM_ALLOC_WIRED);
2930	if (m == NULL) {
2931		if (lockp == NULL) {
2932			PV_STAT(pc_chunk_tryfail++);
2933			return (NULL);
2934		}
2935		m = reclaim_pv_chunk(pmap, lockp);
2936		if (m == NULL)
2937			goto retry;
2938	}
2939	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2940	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2941	dump_add_page(m->phys_addr);
2942	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2943	pc->pc_pmap = pmap;
2944	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
2945	pc->pc_map[1] = PC_FREE1;
2946	pc->pc_map[2] = PC_FREE2;
2947	mtx_lock(&pv_chunks_mutex);
2948	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2949	mtx_unlock(&pv_chunks_mutex);
2950	pv = &pc->pc_pventry[0];
2951	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2952	PV_STAT(atomic_add_long(&pv_entry_count, 1));
2953	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2954	return (pv);
2955}
2956
2957/*
2958 * Returns the number of one bits within the given PV chunk map element.
2959 */
2960static int
2961popcnt_pc_map_elem(uint64_t elem)
2962{
2963	int count;
2964
2965	/*
2966	 * This simple method of counting the one bits performs well because
2967	 * the given element typically contains more zero bits than one bits.
2968	 */
2969	count = 0;
2970	for (; elem != 0; elem &= elem - 1)
2971		count++;
2972	return (count);
2973}
2974
2975/*
2976 * Ensure that the number of spare PV entries in the specified pmap meets or
2977 * exceeds the given count, "needed".
2978 *
2979 * The given PV list lock may be released.
2980 */
2981static void
2982reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2983{
2984	struct pch new_tail;
2985	struct pv_chunk *pc;
2986	int avail, free;
2987	vm_page_t m;
2988
2989	rw_assert(&pvh_global_lock, RA_LOCKED);
2990	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2991	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2992
2993	/*
2994	 * Newly allocated PV chunks must be stored in a private list until
2995	 * the required number of PV chunks have been allocated.  Otherwise,
2996	 * reclaim_pv_chunk() could recycle one of these chunks.  In
2997	 * contrast, these chunks must be added to the pmap upon allocation.
2998	 */
2999	TAILQ_INIT(&new_tail);
3000retry:
3001	avail = 0;
3002	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3003		if ((cpu_feature2 & CPUID2_POPCNT) == 0) {
3004			free = popcnt_pc_map_elem(pc->pc_map[0]);
3005			free += popcnt_pc_map_elem(pc->pc_map[1]);
3006			free += popcnt_pc_map_elem(pc->pc_map[2]);
3007		} else {
3008			free = popcntq(pc->pc_map[0]);
3009			free += popcntq(pc->pc_map[1]);
3010			free += popcntq(pc->pc_map[2]);
3011		}
3012		if (free == 0)
3013			break;
3014		avail += free;
3015		if (avail >= needed)
3016			break;
3017	}
3018	for (; avail < needed; avail += _NPCPV) {
3019		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3020		    VM_ALLOC_WIRED);
3021		if (m == NULL) {
3022			m = reclaim_pv_chunk(pmap, lockp);
3023			if (m == NULL)
3024				goto retry;
3025		}
3026		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3027		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3028		dump_add_page(m->phys_addr);
3029		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3030		pc->pc_pmap = pmap;
3031		pc->pc_map[0] = PC_FREE0;
3032		pc->pc_map[1] = PC_FREE1;
3033		pc->pc_map[2] = PC_FREE2;
3034		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3035		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
3036		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3037	}
3038	if (!TAILQ_EMPTY(&new_tail)) {
3039		mtx_lock(&pv_chunks_mutex);
3040		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
3041		mtx_unlock(&pv_chunks_mutex);
3042	}
3043}
3044
3045/*
3046 * First find and then remove the pv entry for the specified pmap and virtual
3047 * address from the specified pv list.  Returns the pv entry if found and NULL
3048 * otherwise.  This operation can be performed on pv lists for either 4KB or
3049 * 2MB page mappings.
3050 */
3051static __inline pv_entry_t
3052pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3053{
3054	pv_entry_t pv;
3055
3056	rw_assert(&pvh_global_lock, RA_LOCKED);
3057	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3058		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3059			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3060			pvh->pv_gen++;
3061			break;
3062		}
3063	}
3064	return (pv);
3065}
3066
3067/*
3068 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3069 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3070 * entries for each of the 4KB page mappings.
3071 */
3072static void
3073pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3074    struct rwlock **lockp)
3075{
3076	struct md_page *pvh;
3077	struct pv_chunk *pc;
3078	pv_entry_t pv;
3079	vm_offset_t va_last;
3080	vm_page_t m;
3081	int bit, field;
3082
3083	rw_assert(&pvh_global_lock, RA_LOCKED);
3084	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3085	KASSERT((pa & PDRMASK) == 0,
3086	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
3087	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3088
3089	/*
3090	 * Transfer the 2mpage's pv entry for this mapping to the first
3091	 * page's pv list.  Once this transfer begins, the pv list lock
3092	 * must not be released until the last pv entry is reinstantiated.
3093	 */
3094	pvh = pa_to_pvh(pa);
3095	va = trunc_2mpage(va);
3096	pv = pmap_pvh_remove(pvh, pmap, va);
3097	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
3098	m = PHYS_TO_VM_PAGE(pa);
3099	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3100	m->md.pv_gen++;
3101	/* Instantiate the remaining NPTEPG - 1 pv entries. */
3102	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
3103	va_last = va + NBPDR - PAGE_SIZE;
3104	for (;;) {
3105		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3106		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
3107		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
3108		for (field = 0; field < _NPCM; field++) {
3109			while (pc->pc_map[field]) {
3110				bit = bsfq(pc->pc_map[field]);
3111				pc->pc_map[field] &= ~(1ul << bit);
3112				pv = &pc->pc_pventry[field * 64 + bit];
3113				va += PAGE_SIZE;
3114				pv->pv_va = va;
3115				m++;
3116				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3117			    ("pmap_pv_demote_pde: page %p is not managed", m));
3118				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3119				m->md.pv_gen++;
3120				if (va == va_last)
3121					goto out;
3122			}
3123		}
3124		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3125		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3126	}
3127out:
3128	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
3129		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3130		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3131	}
3132	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
3133	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
3134}
3135
3136/*
3137 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
3138 * replace the many pv entries for the 4KB page mappings by a single pv entry
3139 * for the 2MB page mapping.
3140 */
3141static void
3142pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3143    struct rwlock **lockp)
3144{
3145	struct md_page *pvh;
3146	pv_entry_t pv;
3147	vm_offset_t va_last;
3148	vm_page_t m;
3149
3150	rw_assert(&pvh_global_lock, RA_LOCKED);
3151	KASSERT((pa & PDRMASK) == 0,
3152	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
3153	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3154
3155	/*
3156	 * Transfer the first page's pv entry for this mapping to the 2mpage's
3157	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
3158	 * a transfer avoids the possibility that get_pv_entry() calls
3159	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
3160	 * mappings that is being promoted.
3161	 */
3162	m = PHYS_TO_VM_PAGE(pa);
3163	va = trunc_2mpage(va);
3164	pv = pmap_pvh_remove(&m->md, pmap, va);
3165	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
3166	pvh = pa_to_pvh(pa);
3167	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3168	pvh->pv_gen++;
3169	/* Free the remaining NPTEPG - 1 pv entries. */
3170	va_last = va + NBPDR - PAGE_SIZE;
3171	do {
3172		m++;
3173		va += PAGE_SIZE;
3174		pmap_pvh_free(&m->md, pmap, va);
3175	} while (va < va_last);
3176}
3177
3178/*
3179 * First find and then destroy the pv entry for the specified pmap and virtual
3180 * address.  This operation can be performed on pv lists for either 4KB or 2MB
3181 * page mappings.
3182 */
3183static void
3184pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3185{
3186	pv_entry_t pv;
3187
3188	pv = pmap_pvh_remove(pvh, pmap, va);
3189	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3190	free_pv_entry(pmap, pv);
3191}
3192
3193/*
3194 * Conditionally create the PV entry for a 4KB page mapping if the required
3195 * memory can be allocated without resorting to reclamation.
3196 */
3197static boolean_t
3198pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3199    struct rwlock **lockp)
3200{
3201	pv_entry_t pv;
3202
3203	rw_assert(&pvh_global_lock, RA_LOCKED);
3204	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3205	/* Pass NULL instead of the lock pointer to disable reclamation. */
3206	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3207		pv->pv_va = va;
3208		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3209		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3210		m->md.pv_gen++;
3211		return (TRUE);
3212	} else
3213		return (FALSE);
3214}
3215
3216/*
3217 * Conditionally create the PV entry for a 2MB page mapping if the required
3218 * memory can be allocated without resorting to reclamation.
3219 */
3220static boolean_t
3221pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3222    struct rwlock **lockp)
3223{
3224	struct md_page *pvh;
3225	pv_entry_t pv;
3226
3227	rw_assert(&pvh_global_lock, RA_LOCKED);
3228	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3229	/* Pass NULL instead of the lock pointer to disable reclamation. */
3230	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3231		pv->pv_va = va;
3232		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3233		pvh = pa_to_pvh(pa);
3234		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3235		pvh->pv_gen++;
3236		return (TRUE);
3237	} else
3238		return (FALSE);
3239}
3240
3241/*
3242 * Fills a page table page with mappings to consecutive physical pages.
3243 */
3244static void
3245pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
3246{
3247	pt_entry_t *pte;
3248
3249	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
3250		*pte = newpte;
3251		newpte += PAGE_SIZE;
3252	}
3253}
3254
3255/*
3256 * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
3257 * mapping is invalidated.
3258 */
3259static boolean_t
3260pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3261{
3262	struct rwlock *lock;
3263	boolean_t rv;
3264
3265	lock = NULL;
3266	rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
3267	if (lock != NULL)
3268		rw_wunlock(lock);
3269	return (rv);
3270}
3271
3272static boolean_t
3273pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3274    struct rwlock **lockp)
3275{
3276	pd_entry_t newpde, oldpde;
3277	pt_entry_t *firstpte, newpte;
3278	pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
3279	vm_paddr_t mptepa;
3280	vm_page_t mpte;
3281	struct spglist free;
3282	int PG_PTE_CACHE;
3283
3284	PG_G = pmap_global_bit(pmap);
3285	PG_A = pmap_accessed_bit(pmap);
3286	PG_M = pmap_modified_bit(pmap);
3287	PG_RW = pmap_rw_bit(pmap);
3288	PG_V = pmap_valid_bit(pmap);
3289	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
3290
3291	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3292	oldpde = *pde;
3293	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
3294	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
3295	if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
3296	    NULL)
3297		pmap_remove_pt_page(pmap, mpte);
3298	else {
3299		KASSERT((oldpde & PG_W) == 0,
3300		    ("pmap_demote_pde: page table page for a wired mapping"
3301		    " is missing"));
3302
3303		/*
3304		 * Invalidate the 2MB page mapping and return "failure" if the
3305		 * mapping was never accessed or the allocation of the new
3306		 * page table page fails.  If the 2MB page mapping belongs to
3307		 * the direct map region of the kernel's address space, then
3308		 * the page allocation request specifies the highest possible
3309		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
3310		 * normal.  Page table pages are preallocated for every other
3311		 * part of the kernel address space, so the direct map region
3312		 * is the only part of the kernel address space that must be
3313		 * handled here.
3314		 */
3315		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
3316		    pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
3317		    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
3318		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
3319			SLIST_INIT(&free);
3320			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
3321			    lockp);
3322			pmap_invalidate_page(pmap, trunc_2mpage(va));
3323			pmap_free_zero_pages(&free);
3324			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
3325			    " in pmap %p", va, pmap);
3326			return (FALSE);
3327		}
3328		if (va < VM_MAXUSER_ADDRESS)
3329			pmap_resident_count_inc(pmap, 1);
3330	}
3331	mptepa = VM_PAGE_TO_PHYS(mpte);
3332	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
3333	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
3334	KASSERT((oldpde & PG_A) != 0,
3335	    ("pmap_demote_pde: oldpde is missing PG_A"));
3336	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
3337	    ("pmap_demote_pde: oldpde is missing PG_M"));
3338	newpte = oldpde & ~PG_PS;
3339	newpte = pmap_swap_pat(pmap, newpte);
3340
3341	/*
3342	 * If the page table page is new, initialize it.
3343	 */
3344	if (mpte->wire_count == 1) {
3345		mpte->wire_count = NPTEPG;
3346		pmap_fill_ptp(firstpte, newpte);
3347	}
3348	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
3349	    ("pmap_demote_pde: firstpte and newpte map different physical"
3350	    " addresses"));
3351
3352	/*
3353	 * If the mapping has changed attributes, update the page table
3354	 * entries.
3355	 */
3356	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
3357		pmap_fill_ptp(firstpte, newpte);
3358
3359	/*
3360	 * The spare PV entries must be reserved prior to demoting the
3361	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
3362	 * of the PDE and the PV lists will be inconsistent, which can result
3363	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
3364	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
3365	 * PV entry for the 2MB page mapping that is being demoted.
3366	 */
3367	if ((oldpde & PG_MANAGED) != 0)
3368		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
3369
3370	/*
3371	 * Demote the mapping.  This pmap is locked.  The old PDE has
3372	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
3373	 * set.  Thus, there is no danger of a race with another
3374	 * processor changing the setting of PG_A and/or PG_M between
3375	 * the read above and the store below.
3376	 */
3377	if (workaround_erratum383)
3378		pmap_update_pde(pmap, va, pde, newpde);
3379	else
3380		pde_store(pde, newpde);
3381
3382	/*
3383	 * Invalidate a stale recursive mapping of the page table page.
3384	 */
3385	if (va >= VM_MAXUSER_ADDRESS)
3386		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3387
3388	/*
3389	 * Demote the PV entry.
3390	 */
3391	if ((oldpde & PG_MANAGED) != 0)
3392		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
3393
3394	atomic_add_long(&pmap_pde_demotions, 1);
3395	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
3396	    " in pmap %p", va, pmap);
3397	return (TRUE);
3398}
3399
3400/*
3401 * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
3402 */
3403static void
3404pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3405{
3406	pd_entry_t newpde;
3407	vm_paddr_t mptepa;
3408	vm_page_t mpte;
3409
3410	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3411	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3412	mpte = pmap_lookup_pt_page(pmap, va);
3413	if (mpte == NULL)
3414		panic("pmap_remove_kernel_pde: Missing pt page.");
3415
3416	pmap_remove_pt_page(pmap, mpte);
3417	mptepa = VM_PAGE_TO_PHYS(mpte);
3418	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
3419
3420	/*
3421	 * Initialize the page table page.
3422	 */
3423	pagezero((void *)PHYS_TO_DMAP(mptepa));
3424
3425	/*
3426	 * Demote the mapping.
3427	 */
3428	if (workaround_erratum383)
3429		pmap_update_pde(pmap, va, pde, newpde);
3430	else
3431		pde_store(pde, newpde);
3432
3433	/*
3434	 * Invalidate a stale recursive mapping of the page table page.
3435	 */
3436	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3437}
3438
3439/*
3440 * pmap_remove_pde: do the things to unmap a superpage in a process
3441 */
3442static int
3443pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
3444    struct spglist *free, struct rwlock **lockp)
3445{
3446	struct md_page *pvh;
3447	pd_entry_t oldpde;
3448	vm_offset_t eva, va;
3449	vm_page_t m, mpte;
3450	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
3451
3452	PG_G = pmap_global_bit(pmap);
3453	PG_A = pmap_accessed_bit(pmap);
3454	PG_M = pmap_modified_bit(pmap);
3455	PG_RW = pmap_rw_bit(pmap);
3456
3457	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3458	KASSERT((sva & PDRMASK) == 0,
3459	    ("pmap_remove_pde: sva is not 2mpage aligned"));
3460	oldpde = pte_load_clear(pdq);
3461	if (oldpde & PG_W)
3462		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
3463
3464	/*
3465	 * Machines that don't support invlpg, also don't support
3466	 * PG_G.
3467	 */
3468	if (oldpde & PG_G)
3469		pmap_invalidate_page(kernel_pmap, sva);
3470	pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
3471	if (oldpde & PG_MANAGED) {
3472		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
3473		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
3474		pmap_pvh_free(pvh, pmap, sva);
3475		eva = sva + NBPDR;
3476		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3477		    va < eva; va += PAGE_SIZE, m++) {
3478			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3479				vm_page_dirty(m);
3480			if (oldpde & PG_A)
3481				vm_page_aflag_set(m, PGA_REFERENCED);
3482			if (TAILQ_EMPTY(&m->md.pv_list) &&
3483			    TAILQ_EMPTY(&pvh->pv_list))
3484				vm_page_aflag_clear(m, PGA_WRITEABLE);
3485		}
3486	}
3487	if (pmap == kernel_pmap) {
3488		pmap_remove_kernel_pde(pmap, pdq, sva);
3489	} else {
3490		mpte = pmap_lookup_pt_page(pmap, sva);
3491		if (mpte != NULL) {
3492			pmap_remove_pt_page(pmap, mpte);
3493			pmap_resident_count_dec(pmap, 1);
3494			KASSERT(mpte->wire_count == NPTEPG,
3495			    ("pmap_remove_pde: pte page wire count error"));
3496			mpte->wire_count = 0;
3497			pmap_add_delayed_free_list(mpte, free, FALSE);
3498			atomic_subtract_int(&cnt.v_wire_count, 1);
3499		}
3500	}
3501	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
3502}
3503
3504/*
3505 * pmap_remove_pte: do the things to unmap a page in a process
3506 */
3507static int
3508pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
3509    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
3510{
3511	struct md_page *pvh;
3512	pt_entry_t oldpte, PG_A, PG_M, PG_RW;
3513	vm_page_t m;
3514
3515	PG_A = pmap_accessed_bit(pmap);
3516	PG_M = pmap_modified_bit(pmap);
3517	PG_RW = pmap_rw_bit(pmap);
3518
3519	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3520	oldpte = pte_load_clear(ptq);
3521	if (oldpte & PG_W)
3522		pmap->pm_stats.wired_count -= 1;
3523	pmap_resident_count_dec(pmap, 1);
3524	if (oldpte & PG_MANAGED) {
3525		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
3526		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3527			vm_page_dirty(m);
3528		if (oldpte & PG_A)
3529			vm_page_aflag_set(m, PGA_REFERENCED);
3530		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3531		pmap_pvh_free(&m->md, pmap, va);
3532		if (TAILQ_EMPTY(&m->md.pv_list) &&
3533		    (m->flags & PG_FICTITIOUS) == 0) {
3534			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3535			if (TAILQ_EMPTY(&pvh->pv_list))
3536				vm_page_aflag_clear(m, PGA_WRITEABLE);
3537		}
3538	}
3539	return (pmap_unuse_pt(pmap, va, ptepde, free));
3540}
3541
3542/*
3543 * Remove a single page from a process address space
3544 */
3545static void
3546pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
3547    struct spglist *free)
3548{
3549	struct rwlock *lock;
3550	pt_entry_t *pte, PG_V;
3551
3552	PG_V = pmap_valid_bit(pmap);
3553	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3554	if ((*pde & PG_V) == 0)
3555		return;
3556	pte = pmap_pde_to_pte(pde, va);
3557	if ((*pte & PG_V) == 0)
3558		return;
3559	lock = NULL;
3560	pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
3561	if (lock != NULL)
3562		rw_wunlock(lock);
3563	pmap_invalidate_page(pmap, va);
3564}
3565
3566/*
3567 *	Remove the given range of addresses from the specified map.
3568 *
3569 *	It is assumed that the start and end are properly
3570 *	rounded to the page size.
3571 */
3572void
3573pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3574{
3575	struct rwlock *lock;
3576	vm_offset_t va, va_next;
3577	pml4_entry_t *pml4e;
3578	pdp_entry_t *pdpe;
3579	pd_entry_t ptpaddr, *pde;
3580	pt_entry_t *pte, PG_G, PG_V;
3581	struct spglist free;
3582	int anyvalid;
3583
3584	PG_G = pmap_global_bit(pmap);
3585	PG_V = pmap_valid_bit(pmap);
3586
3587	/*
3588	 * Perform an unsynchronized read.  This is, however, safe.
3589	 */
3590	if (pmap->pm_stats.resident_count == 0)
3591		return;
3592
3593	anyvalid = 0;
3594	SLIST_INIT(&free);
3595
3596	rw_rlock(&pvh_global_lock);
3597	PMAP_LOCK(pmap);
3598
3599	/*
3600	 * special handling of removing one page.  a very
3601	 * common operation and easy to short circuit some
3602	 * code.
3603	 */
3604	if (sva + PAGE_SIZE == eva) {
3605		pde = pmap_pde(pmap, sva);
3606		if (pde && (*pde & PG_PS) == 0) {
3607			pmap_remove_page(pmap, sva, pde, &free);
3608			goto out;
3609		}
3610	}
3611
3612	lock = NULL;
3613	for (; sva < eva; sva = va_next) {
3614
3615		if (pmap->pm_stats.resident_count == 0)
3616			break;
3617
3618		pml4e = pmap_pml4e(pmap, sva);
3619		if ((*pml4e & PG_V) == 0) {
3620			va_next = (sva + NBPML4) & ~PML4MASK;
3621			if (va_next < sva)
3622				va_next = eva;
3623			continue;
3624		}
3625
3626		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3627		if ((*pdpe & PG_V) == 0) {
3628			va_next = (sva + NBPDP) & ~PDPMASK;
3629			if (va_next < sva)
3630				va_next = eva;
3631			continue;
3632		}
3633
3634		/*
3635		 * Calculate index for next page table.
3636		 */
3637		va_next = (sva + NBPDR) & ~PDRMASK;
3638		if (va_next < sva)
3639			va_next = eva;
3640
3641		pde = pmap_pdpe_to_pde(pdpe, sva);
3642		ptpaddr = *pde;
3643
3644		/*
3645		 * Weed out invalid mappings.
3646		 */
3647		if (ptpaddr == 0)
3648			continue;
3649
3650		/*
3651		 * Check for large page.
3652		 */
3653		if ((ptpaddr & PG_PS) != 0) {
3654			/*
3655			 * Are we removing the entire large page?  If not,
3656			 * demote the mapping and fall through.
3657			 */
3658			if (sva + NBPDR == va_next && eva >= va_next) {
3659				/*
3660				 * The TLB entry for a PG_G mapping is
3661				 * invalidated by pmap_remove_pde().
3662				 */
3663				if ((ptpaddr & PG_G) == 0)
3664					anyvalid = 1;
3665				pmap_remove_pde(pmap, pde, sva, &free, &lock);
3666				continue;
3667			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
3668			    &lock)) {
3669				/* The large page mapping was destroyed. */
3670				continue;
3671			} else
3672				ptpaddr = *pde;
3673		}
3674
3675		/*
3676		 * Limit our scan to either the end of the va represented
3677		 * by the current page table page, or to the end of the
3678		 * range being removed.
3679		 */
3680		if (va_next > eva)
3681			va_next = eva;
3682
3683		va = va_next;
3684		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3685		    sva += PAGE_SIZE) {
3686			if (*pte == 0) {
3687				if (va != va_next) {
3688					pmap_invalidate_range(pmap, va, sva);
3689					va = va_next;
3690				}
3691				continue;
3692			}
3693			if ((*pte & PG_G) == 0)
3694				anyvalid = 1;
3695			else if (va == va_next)
3696				va = sva;
3697			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
3698			    &lock)) {
3699				sva += PAGE_SIZE;
3700				break;
3701			}
3702		}
3703		if (va != va_next)
3704			pmap_invalidate_range(pmap, va, sva);
3705	}
3706	if (lock != NULL)
3707		rw_wunlock(lock);
3708out:
3709	if (anyvalid)
3710		pmap_invalidate_all(pmap);
3711	rw_runlock(&pvh_global_lock);
3712	PMAP_UNLOCK(pmap);
3713	pmap_free_zero_pages(&free);
3714}
3715
3716/*
3717 *	Routine:	pmap_remove_all
3718 *	Function:
3719 *		Removes this physical page from
3720 *		all physical maps in which it resides.
3721 *		Reflects back modify bits to the pager.
3722 *
3723 *	Notes:
3724 *		Original versions of this routine were very
3725 *		inefficient because they iteratively called
3726 *		pmap_remove (slow...)
3727 */
3728
3729void
3730pmap_remove_all(vm_page_t m)
3731{
3732	struct md_page *pvh;
3733	pv_entry_t pv;
3734	pmap_t pmap;
3735	pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
3736	pd_entry_t *pde;
3737	vm_offset_t va;
3738	struct spglist free;
3739
3740	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3741	    ("pmap_remove_all: page %p is not managed", m));
3742	SLIST_INIT(&free);
3743	rw_wlock(&pvh_global_lock);
3744	if ((m->flags & PG_FICTITIOUS) != 0)
3745		goto small_mappings;
3746	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3747	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3748		pmap = PV_PMAP(pv);
3749		PMAP_LOCK(pmap);
3750		va = pv->pv_va;
3751		pde = pmap_pde(pmap, va);
3752		(void)pmap_demote_pde(pmap, pde, va);
3753		PMAP_UNLOCK(pmap);
3754	}
3755small_mappings:
3756	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3757		pmap = PV_PMAP(pv);
3758		PMAP_LOCK(pmap);
3759		PG_A = pmap_accessed_bit(pmap);
3760		PG_M = pmap_modified_bit(pmap);
3761		PG_RW = pmap_rw_bit(pmap);
3762		pmap_resident_count_dec(pmap, 1);
3763		pde = pmap_pde(pmap, pv->pv_va);
3764		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3765		    " a 2mpage in page %p's pv list", m));
3766		pte = pmap_pde_to_pte(pde, pv->pv_va);
3767		tpte = pte_load_clear(pte);
3768		if (tpte & PG_W)
3769			pmap->pm_stats.wired_count--;
3770		if (tpte & PG_A)
3771			vm_page_aflag_set(m, PGA_REFERENCED);
3772
3773		/*
3774		 * Update the vm_page_t clean and reference bits.
3775		 */
3776		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3777			vm_page_dirty(m);
3778		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
3779		pmap_invalidate_page(pmap, pv->pv_va);
3780		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3781		m->md.pv_gen++;
3782		free_pv_entry(pmap, pv);
3783		PMAP_UNLOCK(pmap);
3784	}
3785	vm_page_aflag_clear(m, PGA_WRITEABLE);
3786	rw_wunlock(&pvh_global_lock);
3787	pmap_free_zero_pages(&free);
3788}
3789
3790/*
3791 * pmap_protect_pde: do the things to protect a 2mpage in a process
3792 */
3793static boolean_t
3794pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3795{
3796	pd_entry_t newpde, oldpde;
3797	vm_offset_t eva, va;
3798	vm_page_t m;
3799	boolean_t anychanged;
3800	pt_entry_t PG_G, PG_M, PG_RW;
3801
3802	PG_G = pmap_global_bit(pmap);
3803	PG_M = pmap_modified_bit(pmap);
3804	PG_RW = pmap_rw_bit(pmap);
3805
3806	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3807	KASSERT((sva & PDRMASK) == 0,
3808	    ("pmap_protect_pde: sva is not 2mpage aligned"));
3809	anychanged = FALSE;
3810retry:
3811	oldpde = newpde = *pde;
3812	if (oldpde & PG_MANAGED) {
3813		eva = sva + NBPDR;
3814		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3815		    va < eva; va += PAGE_SIZE, m++)
3816			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3817				vm_page_dirty(m);
3818	}
3819	if ((prot & VM_PROT_WRITE) == 0)
3820		newpde &= ~(PG_RW | PG_M);
3821	if ((prot & VM_PROT_EXECUTE) == 0)
3822		newpde |= pg_nx;
3823	if (newpde != oldpde) {
3824		if (!atomic_cmpset_long(pde, oldpde, newpde))
3825			goto retry;
3826		if (oldpde & PG_G)
3827			pmap_invalidate_page(pmap, sva);
3828		else
3829			anychanged = TRUE;
3830	}
3831	return (anychanged);
3832}
3833
3834/*
3835 *	Set the physical protection on the
3836 *	specified range of this map as requested.
3837 */
3838void
3839pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3840{
3841	vm_offset_t va_next;
3842	pml4_entry_t *pml4e;
3843	pdp_entry_t *pdpe;
3844	pd_entry_t ptpaddr, *pde;
3845	pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
3846	boolean_t anychanged, pv_lists_locked;
3847
3848	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
3849		pmap_remove(pmap, sva, eva);
3850		return;
3851	}
3852
3853	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3854	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3855		return;
3856
3857	PG_G = pmap_global_bit(pmap);
3858	PG_M = pmap_modified_bit(pmap);
3859	PG_V = pmap_valid_bit(pmap);
3860	PG_RW = pmap_rw_bit(pmap);
3861	pv_lists_locked = FALSE;
3862resume:
3863	anychanged = FALSE;
3864
3865	PMAP_LOCK(pmap);
3866	for (; sva < eva; sva = va_next) {
3867
3868		pml4e = pmap_pml4e(pmap, sva);
3869		if ((*pml4e & PG_V) == 0) {
3870			va_next = (sva + NBPML4) & ~PML4MASK;
3871			if (va_next < sva)
3872				va_next = eva;
3873			continue;
3874		}
3875
3876		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3877		if ((*pdpe & PG_V) == 0) {
3878			va_next = (sva + NBPDP) & ~PDPMASK;
3879			if (va_next < sva)
3880				va_next = eva;
3881			continue;
3882		}
3883
3884		va_next = (sva + NBPDR) & ~PDRMASK;
3885		if (va_next < sva)
3886			va_next = eva;
3887
3888		pde = pmap_pdpe_to_pde(pdpe, sva);
3889		ptpaddr = *pde;
3890
3891		/*
3892		 * Weed out invalid mappings.
3893		 */
3894		if (ptpaddr == 0)
3895			continue;
3896
3897		/*
3898		 * Check for large page.
3899		 */
3900		if ((ptpaddr & PG_PS) != 0) {
3901			/*
3902			 * Are we protecting the entire large page?  If not,
3903			 * demote the mapping and fall through.
3904			 */
3905			if (sva + NBPDR == va_next && eva >= va_next) {
3906				/*
3907				 * The TLB entry for a PG_G mapping is
3908				 * invalidated by pmap_protect_pde().
3909				 */
3910				if (pmap_protect_pde(pmap, pde, sva, prot))
3911					anychanged = TRUE;
3912				continue;
3913			} else {
3914				if (!pv_lists_locked) {
3915					pv_lists_locked = TRUE;
3916					if (!rw_try_rlock(&pvh_global_lock)) {
3917						if (anychanged)
3918							pmap_invalidate_all(
3919							    pmap);
3920						PMAP_UNLOCK(pmap);
3921						rw_rlock(&pvh_global_lock);
3922						goto resume;
3923					}
3924				}
3925				if (!pmap_demote_pde(pmap, pde, sva)) {
3926					/*
3927					 * The large page mapping was
3928					 * destroyed.
3929					 */
3930					continue;
3931				}
3932			}
3933		}
3934
3935		if (va_next > eva)
3936			va_next = eva;
3937
3938		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3939		    sva += PAGE_SIZE) {
3940			pt_entry_t obits, pbits;
3941			vm_page_t m;
3942
3943retry:
3944			obits = pbits = *pte;
3945			if ((pbits & PG_V) == 0)
3946				continue;
3947
3948			if ((prot & VM_PROT_WRITE) == 0) {
3949				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3950				    (PG_MANAGED | PG_M | PG_RW)) {
3951					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3952					vm_page_dirty(m);
3953				}
3954				pbits &= ~(PG_RW | PG_M);
3955			}
3956			if ((prot & VM_PROT_EXECUTE) == 0)
3957				pbits |= pg_nx;
3958
3959			if (pbits != obits) {
3960				if (!atomic_cmpset_long(pte, obits, pbits))
3961					goto retry;
3962				if (obits & PG_G)
3963					pmap_invalidate_page(pmap, sva);
3964				else
3965					anychanged = TRUE;
3966			}
3967		}
3968	}
3969	if (anychanged)
3970		pmap_invalidate_all(pmap);
3971	if (pv_lists_locked)
3972		rw_runlock(&pvh_global_lock);
3973	PMAP_UNLOCK(pmap);
3974}
3975
3976/*
3977 * Tries to promote the 512, contiguous 4KB page mappings that are within a
3978 * single page table page (PTP) to a single 2MB page mapping.  For promotion
3979 * to occur, two conditions must be met: (1) the 4KB page mappings must map
3980 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
3981 * identical characteristics.
3982 */
3983static void
3984pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3985    struct rwlock **lockp)
3986{
3987	pd_entry_t newpde;
3988	pt_entry_t *firstpte, oldpte, pa, *pte;
3989	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
3990	vm_offset_t oldpteva;
3991	vm_page_t mpte;
3992	int PG_PTE_CACHE;
3993
3994	PG_A = pmap_accessed_bit(pmap);
3995	PG_G = pmap_global_bit(pmap);
3996	PG_M = pmap_modified_bit(pmap);
3997	PG_V = pmap_valid_bit(pmap);
3998	PG_RW = pmap_rw_bit(pmap);
3999	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
4000
4001	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4002
4003	/*
4004	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
4005	 * either invalid, unused, or does not map the first 4KB physical page
4006	 * within a 2MB page.
4007	 */
4008	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
4009setpde:
4010	newpde = *firstpte;
4011	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
4012		atomic_add_long(&pmap_pde_p_failures, 1);
4013		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4014		    " in pmap %p", va, pmap);
4015		return;
4016	}
4017	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
4018		/*
4019		 * When PG_M is already clear, PG_RW can be cleared without
4020		 * a TLB invalidation.
4021		 */
4022		if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
4023			goto setpde;
4024		newpde &= ~PG_RW;
4025	}
4026
4027	/*
4028	 * Examine each of the other PTEs in the specified PTP.  Abort if this
4029	 * PTE maps an unexpected 4KB physical page or does not have identical
4030	 * characteristics to the first PTE.
4031	 */
4032	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
4033	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
4034setpte:
4035		oldpte = *pte;
4036		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
4037			atomic_add_long(&pmap_pde_p_failures, 1);
4038			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4039			    " in pmap %p", va, pmap);
4040			return;
4041		}
4042		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
4043			/*
4044			 * When PG_M is already clear, PG_RW can be cleared
4045			 * without a TLB invalidation.
4046			 */
4047			if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
4048				goto setpte;
4049			oldpte &= ~PG_RW;
4050			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
4051			    (va & ~PDRMASK);
4052			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
4053			    " in pmap %p", oldpteva, pmap);
4054		}
4055		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
4056			atomic_add_long(&pmap_pde_p_failures, 1);
4057			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4058			    " in pmap %p", va, pmap);
4059			return;
4060		}
4061		pa -= PAGE_SIZE;
4062	}
4063
4064	/*
4065	 * Save the page table page in its current state until the PDE
4066	 * mapping the superpage is demoted by pmap_demote_pde() or
4067	 * destroyed by pmap_remove_pde().
4068	 */
4069	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4070	KASSERT(mpte >= vm_page_array &&
4071	    mpte < &vm_page_array[vm_page_array_size],
4072	    ("pmap_promote_pde: page table page is out of range"));
4073	KASSERT(mpte->pindex == pmap_pde_pindex(va),
4074	    ("pmap_promote_pde: page table page's pindex is wrong"));
4075	if (pmap_insert_pt_page(pmap, mpte)) {
4076		atomic_add_long(&pmap_pde_p_failures, 1);
4077		CTR2(KTR_PMAP,
4078		    "pmap_promote_pde: failure for va %#lx in pmap %p", va,
4079		    pmap);
4080		return;
4081	}
4082
4083	/*
4084	 * Promote the pv entries.
4085	 */
4086	if ((newpde & PG_MANAGED) != 0)
4087		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
4088
4089	/*
4090	 * Propagate the PAT index to its proper position.
4091	 */
4092	newpde = pmap_swap_pat(pmap, newpde);
4093
4094	/*
4095	 * Map the superpage.
4096	 */
4097	if (workaround_erratum383)
4098		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
4099	else
4100		pde_store(pde, PG_PS | newpde);
4101
4102	atomic_add_long(&pmap_pde_promotions, 1);
4103	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
4104	    " in pmap %p", va, pmap);
4105}
4106
4107/*
4108 *	Insert the given physical page (p) at
4109 *	the specified virtual address (v) in the
4110 *	target physical map with the protection requested.
4111 *
4112 *	If specified, the page will be wired down, meaning
4113 *	that the related pte can not be reclaimed.
4114 *
4115 *	NB:  This is the only routine which MAY NOT lazy-evaluate
4116 *	or lose information.  That is, this routine must actually
4117 *	insert this page into the given map NOW.
4118 */
4119int
4120pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4121    u_int flags, int8_t psind __unused)
4122{
4123	struct rwlock *lock;
4124	pd_entry_t *pde;
4125	pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
4126	pt_entry_t newpte, origpte;
4127	pv_entry_t pv;
4128	vm_paddr_t opa, pa;
4129	vm_page_t mpte, om;
4130	boolean_t nosleep;
4131
4132	PG_A = pmap_accessed_bit(pmap);
4133	PG_G = pmap_global_bit(pmap);
4134	PG_M = pmap_modified_bit(pmap);
4135	PG_V = pmap_valid_bit(pmap);
4136	PG_RW = pmap_rw_bit(pmap);
4137
4138	va = trunc_page(va);
4139	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
4140	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
4141	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
4142	    va));
4143	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
4144	    va >= kmi.clean_eva,
4145	    ("pmap_enter: managed mapping within the clean submap"));
4146	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
4147		VM_OBJECT_ASSERT_LOCKED(m->object);
4148	pa = VM_PAGE_TO_PHYS(m);
4149	newpte = (pt_entry_t)(pa | PG_A | PG_V);
4150	if ((flags & VM_PROT_WRITE) != 0)
4151		newpte |= PG_M;
4152	if ((prot & VM_PROT_WRITE) != 0)
4153		newpte |= PG_RW;
4154	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
4155	    ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
4156	if ((prot & VM_PROT_EXECUTE) == 0)
4157		newpte |= pg_nx;
4158	if ((flags & PMAP_ENTER_WIRED) != 0)
4159		newpte |= PG_W;
4160	if (va < VM_MAXUSER_ADDRESS)
4161		newpte |= PG_U;
4162	if (pmap == kernel_pmap)
4163		newpte |= PG_G;
4164	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
4165
4166	/*
4167	 * Set modified bit gratuitously for writeable mappings if
4168	 * the page is unmanaged. We do not want to take a fault
4169	 * to do the dirty bit accounting for these mappings.
4170	 */
4171	if ((m->oflags & VPO_UNMANAGED) != 0) {
4172		if ((newpte & PG_RW) != 0)
4173			newpte |= PG_M;
4174	}
4175
4176	mpte = NULL;
4177
4178	lock = NULL;
4179	rw_rlock(&pvh_global_lock);
4180	PMAP_LOCK(pmap);
4181
4182	/*
4183	 * In the case that a page table page is not
4184	 * resident, we are creating it here.
4185	 */
4186retry:
4187	pde = pmap_pde(pmap, va);
4188	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
4189	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
4190		pte = pmap_pde_to_pte(pde, va);
4191		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
4192			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4193			mpte->wire_count++;
4194		}
4195	} else if (va < VM_MAXUSER_ADDRESS) {
4196		/*
4197		 * Here if the pte page isn't mapped, or if it has been
4198		 * deallocated.
4199		 */
4200		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
4201		mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
4202		    nosleep ? NULL : &lock);
4203		if (mpte == NULL && nosleep) {
4204			if (lock != NULL)
4205				rw_wunlock(lock);
4206			rw_runlock(&pvh_global_lock);
4207			PMAP_UNLOCK(pmap);
4208			return (KERN_RESOURCE_SHORTAGE);
4209		}
4210		goto retry;
4211	} else
4212		panic("pmap_enter: invalid page directory va=%#lx", va);
4213
4214	origpte = *pte;
4215
4216	/*
4217	 * Is the specified virtual address already mapped?
4218	 */
4219	if ((origpte & PG_V) != 0) {
4220		/*
4221		 * Wiring change, just update stats. We don't worry about
4222		 * wiring PT pages as they remain resident as long as there
4223		 * are valid mappings in them. Hence, if a user page is wired,
4224		 * the PT page will be also.
4225		 */
4226		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
4227			pmap->pm_stats.wired_count++;
4228		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
4229			pmap->pm_stats.wired_count--;
4230
4231		/*
4232		 * Remove the extra PT page reference.
4233		 */
4234		if (mpte != NULL) {
4235			mpte->wire_count--;
4236			KASSERT(mpte->wire_count > 0,
4237			    ("pmap_enter: missing reference to page table page,"
4238			     " va: 0x%lx", va));
4239		}
4240
4241		/*
4242		 * Has the physical page changed?
4243		 */
4244		opa = origpte & PG_FRAME;
4245		if (opa == pa) {
4246			/*
4247			 * No, might be a protection or wiring change.
4248			 */
4249			if ((origpte & PG_MANAGED) != 0) {
4250				newpte |= PG_MANAGED;
4251				if ((newpte & PG_RW) != 0)
4252					vm_page_aflag_set(m, PGA_WRITEABLE);
4253			}
4254			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
4255				goto unchanged;
4256			goto validate;
4257		}
4258	} else {
4259		/*
4260		 * Increment the counters.
4261		 */
4262		if ((newpte & PG_W) != 0)
4263			pmap->pm_stats.wired_count++;
4264		pmap_resident_count_inc(pmap, 1);
4265	}
4266
4267	/*
4268	 * Enter on the PV list if part of our managed memory.
4269	 */
4270	if ((m->oflags & VPO_UNMANAGED) == 0) {
4271		newpte |= PG_MANAGED;
4272		pv = get_pv_entry(pmap, &lock);
4273		pv->pv_va = va;
4274		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
4275		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4276		m->md.pv_gen++;
4277		if ((newpte & PG_RW) != 0)
4278			vm_page_aflag_set(m, PGA_WRITEABLE);
4279	}
4280
4281	/*
4282	 * Update the PTE.
4283	 */
4284	if ((origpte & PG_V) != 0) {
4285validate:
4286		origpte = pte_load_store(pte, newpte);
4287		opa = origpte & PG_FRAME;
4288		if (opa != pa) {
4289			if ((origpte & PG_MANAGED) != 0) {
4290				om = PHYS_TO_VM_PAGE(opa);
4291				if ((origpte & (PG_M | PG_RW)) == (PG_M |
4292				    PG_RW))
4293					vm_page_dirty(om);
4294				if ((origpte & PG_A) != 0)
4295					vm_page_aflag_set(om, PGA_REFERENCED);
4296				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
4297				pmap_pvh_free(&om->md, pmap, va);
4298				if ((om->aflags & PGA_WRITEABLE) != 0 &&
4299				    TAILQ_EMPTY(&om->md.pv_list) &&
4300				    ((om->flags & PG_FICTITIOUS) != 0 ||
4301				    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
4302					vm_page_aflag_clear(om, PGA_WRITEABLE);
4303			}
4304		} else if ((newpte & PG_M) == 0 && (origpte & (PG_M |
4305		    PG_RW)) == (PG_M | PG_RW)) {
4306			if ((origpte & PG_MANAGED) != 0)
4307				vm_page_dirty(m);
4308
4309			/*
4310			 * Although the PTE may still have PG_RW set, TLB
4311			 * invalidation may nonetheless be required because
4312			 * the PTE no longer has PG_M set.
4313			 */
4314		} else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
4315			/*
4316			 * This PTE change does not require TLB invalidation.
4317			 */
4318			goto unchanged;
4319		}
4320		if ((origpte & PG_A) != 0)
4321			pmap_invalidate_page(pmap, va);
4322	} else
4323		pte_store(pte, newpte);
4324
4325unchanged:
4326
4327	/*
4328	 * If both the page table page and the reservation are fully
4329	 * populated, then attempt promotion.
4330	 */
4331	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
4332	    pmap_ps_enabled(pmap) &&
4333	    (m->flags & PG_FICTITIOUS) == 0 &&
4334	    vm_reserv_level_iffullpop(m) == 0)
4335		pmap_promote_pde(pmap, pde, va, &lock);
4336
4337	if (lock != NULL)
4338		rw_wunlock(lock);
4339	rw_runlock(&pvh_global_lock);
4340	PMAP_UNLOCK(pmap);
4341	return (KERN_SUCCESS);
4342}
4343
4344/*
4345 * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
4346 * otherwise.  Fails if (1) a page table page cannot be allocated without
4347 * blocking, (2) a mapping already exists at the specified virtual address, or
4348 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
4349 */
4350static boolean_t
4351pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4352    struct rwlock **lockp)
4353{
4354	pd_entry_t *pde, newpde;
4355	pt_entry_t PG_V;
4356	vm_page_t mpde;
4357	struct spglist free;
4358
4359	PG_V = pmap_valid_bit(pmap);
4360	rw_assert(&pvh_global_lock, RA_LOCKED);
4361	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4362
4363	if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
4364		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4365		    " in pmap %p", va, pmap);
4366		return (FALSE);
4367	}
4368	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
4369	pde = &pde[pmap_pde_index(va)];
4370	if ((*pde & PG_V) != 0) {
4371		KASSERT(mpde->wire_count > 1,
4372		    ("pmap_enter_pde: mpde's wire count is too low"));
4373		mpde->wire_count--;
4374		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4375		    " in pmap %p", va, pmap);
4376		return (FALSE);
4377	}
4378	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
4379	    PG_PS | PG_V;
4380	if ((m->oflags & VPO_UNMANAGED) == 0) {
4381		newpde |= PG_MANAGED;
4382
4383		/*
4384		 * Abort this mapping if its PV entry could not be created.
4385		 */
4386		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
4387		    lockp)) {
4388			SLIST_INIT(&free);
4389			if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
4390				pmap_invalidate_page(pmap, va);
4391				pmap_free_zero_pages(&free);
4392			}
4393			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4394			    " in pmap %p", va, pmap);
4395			return (FALSE);
4396		}
4397	}
4398	if ((prot & VM_PROT_EXECUTE) == 0)
4399		newpde |= pg_nx;
4400	if (va < VM_MAXUSER_ADDRESS)
4401		newpde |= PG_U;
4402
4403	/*
4404	 * Increment counters.
4405	 */
4406	pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4407
4408	/*
4409	 * Map the superpage.
4410	 */
4411	pde_store(pde, newpde);
4412
4413	atomic_add_long(&pmap_pde_mappings, 1);
4414	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
4415	    " in pmap %p", va, pmap);
4416	return (TRUE);
4417}
4418
4419/*
4420 * Maps a sequence of resident pages belonging to the same object.
4421 * The sequence begins with the given page m_start.  This page is
4422 * mapped at the given virtual address start.  Each subsequent page is
4423 * mapped at a virtual address that is offset from start by the same
4424 * amount as the page is offset from m_start within the object.  The
4425 * last page in the sequence is the page with the largest offset from
4426 * m_start that can be mapped at a virtual address less than the given
4427 * virtual address end.  Not every virtual page between start and end
4428 * is mapped; only those for which a resident page exists with the
4429 * corresponding offset from m_start are mapped.
4430 */
4431void
4432pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4433    vm_page_t m_start, vm_prot_t prot)
4434{
4435	struct rwlock *lock;
4436	vm_offset_t va;
4437	vm_page_t m, mpte;
4438	vm_pindex_t diff, psize;
4439
4440	VM_OBJECT_ASSERT_LOCKED(m_start->object);
4441
4442	psize = atop(end - start);
4443	mpte = NULL;
4444	m = m_start;
4445	lock = NULL;
4446	rw_rlock(&pvh_global_lock);
4447	PMAP_LOCK(pmap);
4448	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4449		va = start + ptoa(diff);
4450		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
4451		    m->psind == 1 && pmap_ps_enabled(pmap) &&
4452		    pmap_enter_pde(pmap, va, m, prot, &lock))
4453			m = &m[NBPDR / PAGE_SIZE - 1];
4454		else
4455			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
4456			    mpte, &lock);
4457		m = TAILQ_NEXT(m, listq);
4458	}
4459	if (lock != NULL)
4460		rw_wunlock(lock);
4461	rw_runlock(&pvh_global_lock);
4462	PMAP_UNLOCK(pmap);
4463}
4464
4465/*
4466 * this code makes some *MAJOR* assumptions:
4467 * 1. Current pmap & pmap exists.
4468 * 2. Not wired.
4469 * 3. Read access.
4470 * 4. No page table pages.
4471 * but is *MUCH* faster than pmap_enter...
4472 */
4473
4474void
4475pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4476{
4477	struct rwlock *lock;
4478
4479	lock = NULL;
4480	rw_rlock(&pvh_global_lock);
4481	PMAP_LOCK(pmap);
4482	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
4483	if (lock != NULL)
4484		rw_wunlock(lock);
4485	rw_runlock(&pvh_global_lock);
4486	PMAP_UNLOCK(pmap);
4487}
4488
4489static vm_page_t
4490pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
4491    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
4492{
4493	struct spglist free;
4494	pt_entry_t *pte, PG_V;
4495	vm_paddr_t pa;
4496
4497	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
4498	    (m->oflags & VPO_UNMANAGED) != 0,
4499	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
4500	PG_V = pmap_valid_bit(pmap);
4501	rw_assert(&pvh_global_lock, RA_LOCKED);
4502	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4503
4504	/*
4505	 * In the case that a page table page is not
4506	 * resident, we are creating it here.
4507	 */
4508	if (va < VM_MAXUSER_ADDRESS) {
4509		vm_pindex_t ptepindex;
4510		pd_entry_t *ptepa;
4511
4512		/*
4513		 * Calculate pagetable page index
4514		 */
4515		ptepindex = pmap_pde_pindex(va);
4516		if (mpte && (mpte->pindex == ptepindex)) {
4517			mpte->wire_count++;
4518		} else {
4519			/*
4520			 * Get the page directory entry
4521			 */
4522			ptepa = pmap_pde(pmap, va);
4523
4524			/*
4525			 * If the page table page is mapped, we just increment
4526			 * the hold count, and activate it.  Otherwise, we
4527			 * attempt to allocate a page table page.  If this
4528			 * attempt fails, we don't retry.  Instead, we give up.
4529			 */
4530			if (ptepa && (*ptepa & PG_V) != 0) {
4531				if (*ptepa & PG_PS)
4532					return (NULL);
4533				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
4534				mpte->wire_count++;
4535			} else {
4536				/*
4537				 * Pass NULL instead of the PV list lock
4538				 * pointer, because we don't intend to sleep.
4539				 */
4540				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
4541				if (mpte == NULL)
4542					return (mpte);
4543			}
4544		}
4545		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
4546		pte = &pte[pmap_pte_index(va)];
4547	} else {
4548		mpte = NULL;
4549		pte = vtopte(va);
4550	}
4551	if (*pte) {
4552		if (mpte != NULL) {
4553			mpte->wire_count--;
4554			mpte = NULL;
4555		}
4556		return (mpte);
4557	}
4558
4559	/*
4560	 * Enter on the PV list if part of our managed memory.
4561	 */
4562	if ((m->oflags & VPO_UNMANAGED) == 0 &&
4563	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
4564		if (mpte != NULL) {
4565			SLIST_INIT(&free);
4566			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
4567				pmap_invalidate_page(pmap, va);
4568				pmap_free_zero_pages(&free);
4569			}
4570			mpte = NULL;
4571		}
4572		return (mpte);
4573	}
4574
4575	/*
4576	 * Increment counters
4577	 */
4578	pmap_resident_count_inc(pmap, 1);
4579
4580	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
4581	if ((prot & VM_PROT_EXECUTE) == 0)
4582		pa |= pg_nx;
4583
4584	/*
4585	 * Now validate mapping with RO protection
4586	 */
4587	if ((m->oflags & VPO_UNMANAGED) != 0)
4588		pte_store(pte, pa | PG_V | PG_U);
4589	else
4590		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
4591	return (mpte);
4592}
4593
4594/*
4595 * Make a temporary mapping for a physical address.  This is only intended
4596 * to be used for panic dumps.
4597 */
4598void *
4599pmap_kenter_temporary(vm_paddr_t pa, int i)
4600{
4601	vm_offset_t va;
4602
4603	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
4604	pmap_kenter(va, pa);
4605	invlpg(va);
4606	return ((void *)crashdumpmap);
4607}
4608
4609/*
4610 * This code maps large physical mmap regions into the
4611 * processor address space.  Note that some shortcuts
4612 * are taken, but the code works.
4613 */
4614void
4615pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
4616    vm_pindex_t pindex, vm_size_t size)
4617{
4618	pd_entry_t *pde;
4619	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
4620	vm_paddr_t pa, ptepa;
4621	vm_page_t p, pdpg;
4622	int pat_mode;
4623
4624	PG_A = pmap_accessed_bit(pmap);
4625	PG_M = pmap_modified_bit(pmap);
4626	PG_V = pmap_valid_bit(pmap);
4627	PG_RW = pmap_rw_bit(pmap);
4628
4629	VM_OBJECT_ASSERT_WLOCKED(object);
4630	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4631	    ("pmap_object_init_pt: non-device object"));
4632	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
4633		if (!pmap_ps_enabled(pmap))
4634			return;
4635		if (!vm_object_populate(object, pindex, pindex + atop(size)))
4636			return;
4637		p = vm_page_lookup(object, pindex);
4638		KASSERT(p->valid == VM_PAGE_BITS_ALL,
4639		    ("pmap_object_init_pt: invalid page %p", p));
4640		pat_mode = p->md.pat_mode;
4641
4642		/*
4643		 * Abort the mapping if the first page is not physically
4644		 * aligned to a 2MB page boundary.
4645		 */
4646		ptepa = VM_PAGE_TO_PHYS(p);
4647		if (ptepa & (NBPDR - 1))
4648			return;
4649
4650		/*
4651		 * Skip the first page.  Abort the mapping if the rest of
4652		 * the pages are not physically contiguous or have differing
4653		 * memory attributes.
4654		 */
4655		p = TAILQ_NEXT(p, listq);
4656		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
4657		    pa += PAGE_SIZE) {
4658			KASSERT(p->valid == VM_PAGE_BITS_ALL,
4659			    ("pmap_object_init_pt: invalid page %p", p));
4660			if (pa != VM_PAGE_TO_PHYS(p) ||
4661			    pat_mode != p->md.pat_mode)
4662				return;
4663			p = TAILQ_NEXT(p, listq);
4664		}
4665
4666		/*
4667		 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
4668		 * "size" is a multiple of 2M, adding the PAT setting to "pa"
4669		 * will not affect the termination of this loop.
4670		 */
4671		PMAP_LOCK(pmap);
4672		for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
4673		    pa < ptepa + size; pa += NBPDR) {
4674			pdpg = pmap_allocpde(pmap, addr, NULL);
4675			if (pdpg == NULL) {
4676				/*
4677				 * The creation of mappings below is only an
4678				 * optimization.  If a page directory page
4679				 * cannot be allocated without blocking,
4680				 * continue on to the next mapping rather than
4681				 * blocking.
4682				 */
4683				addr += NBPDR;
4684				continue;
4685			}
4686			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
4687			pde = &pde[pmap_pde_index(addr)];
4688			if ((*pde & PG_V) == 0) {
4689				pde_store(pde, pa | PG_PS | PG_M | PG_A |
4690				    PG_U | PG_RW | PG_V);
4691				pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4692				atomic_add_long(&pmap_pde_mappings, 1);
4693			} else {
4694				/* Continue on if the PDE is already valid. */
4695				pdpg->wire_count--;
4696				KASSERT(pdpg->wire_count > 0,
4697				    ("pmap_object_init_pt: missing reference "
4698				    "to page directory page, va: 0x%lx", addr));
4699			}
4700			addr += NBPDR;
4701		}
4702		PMAP_UNLOCK(pmap);
4703	}
4704}
4705
4706/*
4707 *	Routine:	pmap_change_wiring
4708 *	Function:	Change the wiring attribute for a map/virtual-address
4709 *			pair.
4710 *	In/out conditions:
4711 *			The mapping must already exist in the pmap.
4712 */
4713void
4714pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
4715{
4716	pd_entry_t *pde;
4717	pt_entry_t *pte;
4718	boolean_t pv_lists_locked;
4719
4720	pv_lists_locked = FALSE;
4721
4722	/*
4723	 * Wiring is not a hardware characteristic so there is no need to
4724	 * invalidate TLB.
4725	 */
4726retry:
4727	PMAP_LOCK(pmap);
4728	pde = pmap_pde(pmap, va);
4729	if ((*pde & PG_PS) != 0) {
4730		if (!wired != ((*pde & PG_W) == 0)) {
4731			if (!pv_lists_locked) {
4732				pv_lists_locked = TRUE;
4733				if (!rw_try_rlock(&pvh_global_lock)) {
4734					PMAP_UNLOCK(pmap);
4735					rw_rlock(&pvh_global_lock);
4736					goto retry;
4737				}
4738			}
4739			if (!pmap_demote_pde(pmap, pde, va))
4740				panic("pmap_change_wiring: demotion failed");
4741		} else
4742			goto out;
4743	}
4744	pte = pmap_pde_to_pte(pde, va);
4745	if (wired && (*pte & PG_W) == 0) {
4746		pmap->pm_stats.wired_count++;
4747		atomic_set_long(pte, PG_W);
4748	} else if (!wired && (*pte & PG_W) != 0) {
4749		pmap->pm_stats.wired_count--;
4750		atomic_clear_long(pte, PG_W);
4751	}
4752out:
4753	if (pv_lists_locked)
4754		rw_runlock(&pvh_global_lock);
4755	PMAP_UNLOCK(pmap);
4756}
4757
4758/*
4759 *	Copy the range specified by src_addr/len
4760 *	from the source map to the range dst_addr/len
4761 *	in the destination map.
4762 *
4763 *	This routine is only advisory and need not do anything.
4764 */
4765
4766void
4767pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4768    vm_offset_t src_addr)
4769{
4770	struct rwlock *lock;
4771	struct spglist free;
4772	vm_offset_t addr;
4773	vm_offset_t end_addr = src_addr + len;
4774	vm_offset_t va_next;
4775	pt_entry_t PG_A, PG_M, PG_V;
4776
4777	if (dst_addr != src_addr)
4778		return;
4779
4780	if (dst_pmap->pm_type != src_pmap->pm_type)
4781		return;
4782
4783	/*
4784	 * EPT page table entries that require emulation of A/D bits are
4785	 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
4786	 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
4787	 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
4788	 * implementations flag an EPT misconfiguration for exec-only
4789	 * mappings we skip this function entirely for emulated pmaps.
4790	 */
4791	if (pmap_emulate_ad_bits(dst_pmap))
4792		return;
4793
4794	lock = NULL;
4795	rw_rlock(&pvh_global_lock);
4796	if (dst_pmap < src_pmap) {
4797		PMAP_LOCK(dst_pmap);
4798		PMAP_LOCK(src_pmap);
4799	} else {
4800		PMAP_LOCK(src_pmap);
4801		PMAP_LOCK(dst_pmap);
4802	}
4803
4804	PG_A = pmap_accessed_bit(dst_pmap);
4805	PG_M = pmap_modified_bit(dst_pmap);
4806	PG_V = pmap_valid_bit(dst_pmap);
4807
4808	for (addr = src_addr; addr < end_addr; addr = va_next) {
4809		pt_entry_t *src_pte, *dst_pte;
4810		vm_page_t dstmpde, dstmpte, srcmpte;
4811		pml4_entry_t *pml4e;
4812		pdp_entry_t *pdpe;
4813		pd_entry_t srcptepaddr, *pde;
4814
4815		KASSERT(addr < UPT_MIN_ADDRESS,
4816		    ("pmap_copy: invalid to pmap_copy page tables"));
4817
4818		pml4e = pmap_pml4e(src_pmap, addr);
4819		if ((*pml4e & PG_V) == 0) {
4820			va_next = (addr + NBPML4) & ~PML4MASK;
4821			if (va_next < addr)
4822				va_next = end_addr;
4823			continue;
4824		}
4825
4826		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
4827		if ((*pdpe & PG_V) == 0) {
4828			va_next = (addr + NBPDP) & ~PDPMASK;
4829			if (va_next < addr)
4830				va_next = end_addr;
4831			continue;
4832		}
4833
4834		va_next = (addr + NBPDR) & ~PDRMASK;
4835		if (va_next < addr)
4836			va_next = end_addr;
4837
4838		pde = pmap_pdpe_to_pde(pdpe, addr);
4839		srcptepaddr = *pde;
4840		if (srcptepaddr == 0)
4841			continue;
4842
4843		if (srcptepaddr & PG_PS) {
4844			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
4845				continue;
4846			dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
4847			if (dstmpde == NULL)
4848				break;
4849			pde = (pd_entry_t *)
4850			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
4851			pde = &pde[pmap_pde_index(addr)];
4852			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
4853			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4854			    PG_PS_FRAME, &lock))) {
4855				*pde = srcptepaddr & ~PG_W;
4856				pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
4857			} else
4858				dstmpde->wire_count--;
4859			continue;
4860		}
4861
4862		srcptepaddr &= PG_FRAME;
4863		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
4864		KASSERT(srcmpte->wire_count > 0,
4865		    ("pmap_copy: source page table page is unused"));
4866
4867		if (va_next > end_addr)
4868			va_next = end_addr;
4869
4870		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
4871		src_pte = &src_pte[pmap_pte_index(addr)];
4872		dstmpte = NULL;
4873		while (addr < va_next) {
4874			pt_entry_t ptetemp;
4875			ptetemp = *src_pte;
4876			/*
4877			 * we only virtual copy managed pages
4878			 */
4879			if ((ptetemp & PG_MANAGED) != 0) {
4880				if (dstmpte != NULL &&
4881				    dstmpte->pindex == pmap_pde_pindex(addr))
4882					dstmpte->wire_count++;
4883				else if ((dstmpte = pmap_allocpte(dst_pmap,
4884				    addr, NULL)) == NULL)
4885					goto out;
4886				dst_pte = (pt_entry_t *)
4887				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
4888				dst_pte = &dst_pte[pmap_pte_index(addr)];
4889				if (*dst_pte == 0 &&
4890				    pmap_try_insert_pv_entry(dst_pmap, addr,
4891				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
4892				    &lock)) {
4893					/*
4894					 * Clear the wired, modified, and
4895					 * accessed (referenced) bits
4896					 * during the copy.
4897					 */
4898					*dst_pte = ptetemp & ~(PG_W | PG_M |
4899					    PG_A);
4900					pmap_resident_count_inc(dst_pmap, 1);
4901				} else {
4902					SLIST_INIT(&free);
4903					if (pmap_unwire_ptp(dst_pmap, addr,
4904					    dstmpte, &free)) {
4905						pmap_invalidate_page(dst_pmap,
4906						    addr);
4907						pmap_free_zero_pages(&free);
4908					}
4909					goto out;
4910				}
4911				if (dstmpte->wire_count >= srcmpte->wire_count)
4912					break;
4913			}
4914			addr += PAGE_SIZE;
4915			src_pte++;
4916		}
4917	}
4918out:
4919	if (lock != NULL)
4920		rw_wunlock(lock);
4921	rw_runlock(&pvh_global_lock);
4922	PMAP_UNLOCK(src_pmap);
4923	PMAP_UNLOCK(dst_pmap);
4924}
4925
4926/*
4927 *	pmap_zero_page zeros the specified hardware page by mapping
4928 *	the page into KVM and using bzero to clear its contents.
4929 */
4930void
4931pmap_zero_page(vm_page_t m)
4932{
4933	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4934
4935	pagezero((void *)va);
4936}
4937
4938/*
4939 *	pmap_zero_page_area zeros the specified hardware page by mapping
4940 *	the page into KVM and using bzero to clear its contents.
4941 *
4942 *	off and size may not cover an area beyond a single hardware page.
4943 */
4944void
4945pmap_zero_page_area(vm_page_t m, int off, int size)
4946{
4947	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4948
4949	if (off == 0 && size == PAGE_SIZE)
4950		pagezero((void *)va);
4951	else
4952		bzero((char *)va + off, size);
4953}
4954
4955/*
4956 *	pmap_zero_page_idle zeros the specified hardware page by mapping
4957 *	the page into KVM and using bzero to clear its contents.  This
4958 *	is intended to be called from the vm_pagezero process only and
4959 *	outside of Giant.
4960 */
4961void
4962pmap_zero_page_idle(vm_page_t m)
4963{
4964	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4965
4966	pagezero((void *)va);
4967}
4968
4969/*
4970 *	pmap_copy_page copies the specified (machine independent)
4971 *	page by mapping the page into virtual memory and using
4972 *	bcopy to copy the page, one machine dependent page at a
4973 *	time.
4974 */
4975void
4976pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
4977{
4978	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
4979	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
4980
4981	pagecopy((void *)src, (void *)dst);
4982}
4983
4984int unmapped_buf_allowed = 1;
4985
4986void
4987pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4988    vm_offset_t b_offset, int xfersize)
4989{
4990	void *a_cp, *b_cp;
4991	vm_page_t m_a, m_b;
4992	vm_paddr_t p_a, p_b;
4993	pt_entry_t *pte;
4994	vm_offset_t a_pg_offset, b_pg_offset;
4995	int cnt;
4996	boolean_t pinned;
4997
4998	/*
4999	 * NB:  The sequence of updating a page table followed by accesses
5000	 * to the corresponding pages used in the !DMAP case is subject to
5001	 * the situation described in the "AMD64 Architecture Programmer's
5002	 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
5003	 * Coherency Considerations".  Therefore, issuing the INVLPG right
5004	 * after modifying the PTE bits is crucial.
5005	 */
5006	pinned = FALSE;
5007	while (xfersize > 0) {
5008		a_pg_offset = a_offset & PAGE_MASK;
5009		m_a = ma[a_offset >> PAGE_SHIFT];
5010		p_a = m_a->phys_addr;
5011		b_pg_offset = b_offset & PAGE_MASK;
5012		m_b = mb[b_offset >> PAGE_SHIFT];
5013		p_b = m_b->phys_addr;
5014		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
5015		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
5016		if (__predict_false(p_a < DMAP_MIN_ADDRESS ||
5017		    p_a > DMAP_MIN_ADDRESS + dmaplimit)) {
5018			mtx_lock(&cpage_lock);
5019			sched_pin();
5020			pinned = TRUE;
5021			pte = vtopte(cpage_a);
5022			*pte = p_a | X86_PG_A | X86_PG_V |
5023			    pmap_cache_bits(kernel_pmap, m_a->md.pat_mode, 0);
5024			invlpg(cpage_a);
5025			a_cp = (char *)cpage_a + a_pg_offset;
5026		} else {
5027			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
5028		}
5029		if (__predict_false(p_b < DMAP_MIN_ADDRESS ||
5030		    p_b > DMAP_MIN_ADDRESS + dmaplimit)) {
5031			if (!pinned) {
5032				mtx_lock(&cpage_lock);
5033				sched_pin();
5034				pinned = TRUE;
5035			}
5036			pte = vtopte(cpage_b);
5037			*pte = p_b | X86_PG_A | X86_PG_M | X86_PG_RW |
5038			    X86_PG_V | pmap_cache_bits(kernel_pmap,
5039			    m_b->md.pat_mode, 0);
5040			invlpg(cpage_b);
5041			b_cp = (char *)cpage_b + b_pg_offset;
5042		} else {
5043			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
5044		}
5045		bcopy(a_cp, b_cp, cnt);
5046		if (__predict_false(pinned)) {
5047			sched_unpin();
5048			mtx_unlock(&cpage_lock);
5049			pinned = FALSE;
5050		}
5051		a_offset += cnt;
5052		b_offset += cnt;
5053		xfersize -= cnt;
5054	}
5055}
5056
5057/*
5058 * Returns true if the pmap's pv is one of the first
5059 * 16 pvs linked to from this page.  This count may
5060 * be changed upwards or downwards in the future; it
5061 * is only necessary that true be returned for a small
5062 * subset of pmaps for proper page aging.
5063 */
5064boolean_t
5065pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
5066{
5067	struct md_page *pvh;
5068	struct rwlock *lock;
5069	pv_entry_t pv;
5070	int loops = 0;
5071	boolean_t rv;
5072
5073	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5074	    ("pmap_page_exists_quick: page %p is not managed", m));
5075	rv = FALSE;
5076	rw_rlock(&pvh_global_lock);
5077	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5078	rw_rlock(lock);
5079	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5080		if (PV_PMAP(pv) == pmap) {
5081			rv = TRUE;
5082			break;
5083		}
5084		loops++;
5085		if (loops >= 16)
5086			break;
5087	}
5088	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5089		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5090		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5091			if (PV_PMAP(pv) == pmap) {
5092				rv = TRUE;
5093				break;
5094			}
5095			loops++;
5096			if (loops >= 16)
5097				break;
5098		}
5099	}
5100	rw_runlock(lock);
5101	rw_runlock(&pvh_global_lock);
5102	return (rv);
5103}
5104
5105/*
5106 *	pmap_page_wired_mappings:
5107 *
5108 *	Return the number of managed mappings to the given physical page
5109 *	that are wired.
5110 */
5111int
5112pmap_page_wired_mappings(vm_page_t m)
5113{
5114	struct rwlock *lock;
5115	struct md_page *pvh;
5116	pmap_t pmap;
5117	pt_entry_t *pte;
5118	pv_entry_t pv;
5119	int count, md_gen, pvh_gen;
5120
5121	if ((m->oflags & VPO_UNMANAGED) != 0)
5122		return (0);
5123	rw_rlock(&pvh_global_lock);
5124	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5125	rw_rlock(lock);
5126restart:
5127	count = 0;
5128	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5129		pmap = PV_PMAP(pv);
5130		if (!PMAP_TRYLOCK(pmap)) {
5131			md_gen = m->md.pv_gen;
5132			rw_runlock(lock);
5133			PMAP_LOCK(pmap);
5134			rw_rlock(lock);
5135			if (md_gen != m->md.pv_gen) {
5136				PMAP_UNLOCK(pmap);
5137				goto restart;
5138			}
5139		}
5140		pte = pmap_pte(pmap, pv->pv_va);
5141		if ((*pte & PG_W) != 0)
5142			count++;
5143		PMAP_UNLOCK(pmap);
5144	}
5145	if ((m->flags & PG_FICTITIOUS) == 0) {
5146		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5147		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5148			pmap = PV_PMAP(pv);
5149			if (!PMAP_TRYLOCK(pmap)) {
5150				md_gen = m->md.pv_gen;
5151				pvh_gen = pvh->pv_gen;
5152				rw_runlock(lock);
5153				PMAP_LOCK(pmap);
5154				rw_rlock(lock);
5155				if (md_gen != m->md.pv_gen ||
5156				    pvh_gen != pvh->pv_gen) {
5157					PMAP_UNLOCK(pmap);
5158					goto restart;
5159				}
5160			}
5161			pte = pmap_pde(pmap, pv->pv_va);
5162			if ((*pte & PG_W) != 0)
5163				count++;
5164			PMAP_UNLOCK(pmap);
5165		}
5166	}
5167	rw_runlock(lock);
5168	rw_runlock(&pvh_global_lock);
5169	return (count);
5170}
5171
5172/*
5173 * Returns TRUE if the given page is mapped individually or as part of
5174 * a 2mpage.  Otherwise, returns FALSE.
5175 */
5176boolean_t
5177pmap_page_is_mapped(vm_page_t m)
5178{
5179	struct rwlock *lock;
5180	boolean_t rv;
5181
5182	if ((m->oflags & VPO_UNMANAGED) != 0)
5183		return (FALSE);
5184	rw_rlock(&pvh_global_lock);
5185	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5186	rw_rlock(lock);
5187	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5188	    ((m->flags & PG_FICTITIOUS) == 0 &&
5189	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
5190	rw_runlock(lock);
5191	rw_runlock(&pvh_global_lock);
5192	return (rv);
5193}
5194
5195/*
5196 * Destroy all managed, non-wired mappings in the given user-space
5197 * pmap.  This pmap cannot be active on any processor besides the
5198 * caller.
5199 *
5200 * This function cannot be applied to the kernel pmap.  Moreover, it
5201 * is not intended for general use.  It is only to be used during
5202 * process termination.  Consequently, it can be implemented in ways
5203 * that make it faster than pmap_remove().  First, it can more quickly
5204 * destroy mappings by iterating over the pmap's collection of PV
5205 * entries, rather than searching the page table.  Second, it doesn't
5206 * have to test and clear the page table entries atomically, because
5207 * no processor is currently accessing the user address space.  In
5208 * particular, a page table entry's dirty bit won't change state once
5209 * this function starts.
5210 */
5211void
5212pmap_remove_pages(pmap_t pmap)
5213{
5214	pd_entry_t ptepde;
5215	pt_entry_t *pte, tpte;
5216	pt_entry_t PG_M, PG_RW, PG_V;
5217	struct spglist free;
5218	vm_page_t m, mpte, mt;
5219	pv_entry_t pv;
5220	struct md_page *pvh;
5221	struct pv_chunk *pc, *npc;
5222	struct rwlock *lock;
5223	int64_t bit;
5224	uint64_t inuse, bitmask;
5225	int allfree, field, freed, idx;
5226	boolean_t superpage;
5227	vm_paddr_t pa;
5228
5229	/*
5230	 * Assert that the given pmap is only active on the current
5231	 * CPU.  Unfortunately, we cannot block another CPU from
5232	 * activating the pmap while this function is executing.
5233	 */
5234	KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
5235#ifdef INVARIANTS
5236	{
5237		cpuset_t other_cpus;
5238
5239		other_cpus = all_cpus;
5240		critical_enter();
5241		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
5242		CPU_AND(&other_cpus, &pmap->pm_active);
5243		critical_exit();
5244		KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
5245	}
5246#endif
5247
5248	lock = NULL;
5249	PG_M = pmap_modified_bit(pmap);
5250	PG_V = pmap_valid_bit(pmap);
5251	PG_RW = pmap_rw_bit(pmap);
5252
5253	SLIST_INIT(&free);
5254	rw_rlock(&pvh_global_lock);
5255	PMAP_LOCK(pmap);
5256	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5257		allfree = 1;
5258		freed = 0;
5259		for (field = 0; field < _NPCM; field++) {
5260			inuse = ~pc->pc_map[field] & pc_freemask[field];
5261			while (inuse != 0) {
5262				bit = bsfq(inuse);
5263				bitmask = 1UL << bit;
5264				idx = field * 64 + bit;
5265				pv = &pc->pc_pventry[idx];
5266				inuse &= ~bitmask;
5267
5268				pte = pmap_pdpe(pmap, pv->pv_va);
5269				ptepde = *pte;
5270				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
5271				tpte = *pte;
5272				if ((tpte & (PG_PS | PG_V)) == PG_V) {
5273					superpage = FALSE;
5274					ptepde = tpte;
5275					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
5276					    PG_FRAME);
5277					pte = &pte[pmap_pte_index(pv->pv_va)];
5278					tpte = *pte;
5279				} else {
5280					/*
5281					 * Keep track whether 'tpte' is a
5282					 * superpage explicitly instead of
5283					 * relying on PG_PS being set.
5284					 *
5285					 * This is because PG_PS is numerically
5286					 * identical to PG_PTE_PAT and thus a
5287					 * regular page could be mistaken for
5288					 * a superpage.
5289					 */
5290					superpage = TRUE;
5291				}
5292
5293				if ((tpte & PG_V) == 0) {
5294					panic("bad pte va %lx pte %lx",
5295					    pv->pv_va, tpte);
5296				}
5297
5298/*
5299 * We cannot remove wired pages from a process' mapping at this time
5300 */
5301				if (tpte & PG_W) {
5302					allfree = 0;
5303					continue;
5304				}
5305
5306				if (superpage)
5307					pa = tpte & PG_PS_FRAME;
5308				else
5309					pa = tpte & PG_FRAME;
5310
5311				m = PHYS_TO_VM_PAGE(pa);
5312				KASSERT(m->phys_addr == pa,
5313				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5314				    m, (uintmax_t)m->phys_addr,
5315				    (uintmax_t)tpte));
5316
5317				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5318				    m < &vm_page_array[vm_page_array_size],
5319				    ("pmap_remove_pages: bad tpte %#jx",
5320				    (uintmax_t)tpte));
5321
5322				pte_clear(pte);
5323
5324				/*
5325				 * Update the vm_page_t clean/reference bits.
5326				 */
5327				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5328					if (superpage) {
5329						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5330							vm_page_dirty(mt);
5331					} else
5332						vm_page_dirty(m);
5333				}
5334
5335				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5336
5337				/* Mark free */
5338				pc->pc_map[field] |= bitmask;
5339				if (superpage) {
5340					pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
5341					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
5342					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5343					pvh->pv_gen++;
5344					if (TAILQ_EMPTY(&pvh->pv_list)) {
5345						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5346							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
5347							    TAILQ_EMPTY(&mt->md.pv_list))
5348								vm_page_aflag_clear(mt, PGA_WRITEABLE);
5349					}
5350					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
5351					if (mpte != NULL) {
5352						pmap_remove_pt_page(pmap, mpte);
5353						pmap_resident_count_dec(pmap, 1);
5354						KASSERT(mpte->wire_count == NPTEPG,
5355						    ("pmap_remove_pages: pte page wire count error"));
5356						mpte->wire_count = 0;
5357						pmap_add_delayed_free_list(mpte, &free, FALSE);
5358						atomic_subtract_int(&cnt.v_wire_count, 1);
5359					}
5360				} else {
5361					pmap_resident_count_dec(pmap, 1);
5362					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5363					m->md.pv_gen++;
5364					if ((m->aflags & PGA_WRITEABLE) != 0 &&
5365					    TAILQ_EMPTY(&m->md.pv_list) &&
5366					    (m->flags & PG_FICTITIOUS) == 0) {
5367						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5368						if (TAILQ_EMPTY(&pvh->pv_list))
5369							vm_page_aflag_clear(m, PGA_WRITEABLE);
5370					}
5371				}
5372				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
5373				freed++;
5374			}
5375		}
5376		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5377		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5378		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5379		if (allfree) {
5380			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5381			free_pv_chunk(pc);
5382		}
5383	}
5384	if (lock != NULL)
5385		rw_wunlock(lock);
5386	pmap_invalidate_all(pmap);
5387	rw_runlock(&pvh_global_lock);
5388	PMAP_UNLOCK(pmap);
5389	pmap_free_zero_pages(&free);
5390}
5391
5392static boolean_t
5393pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
5394{
5395	struct rwlock *lock;
5396	pv_entry_t pv;
5397	struct md_page *pvh;
5398	pt_entry_t *pte, mask;
5399	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
5400	pmap_t pmap;
5401	int md_gen, pvh_gen;
5402	boolean_t rv;
5403
5404	rv = FALSE;
5405	rw_rlock(&pvh_global_lock);
5406	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5407	rw_rlock(lock);
5408restart:
5409	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5410		pmap = PV_PMAP(pv);
5411		if (!PMAP_TRYLOCK(pmap)) {
5412			md_gen = m->md.pv_gen;
5413			rw_runlock(lock);
5414			PMAP_LOCK(pmap);
5415			rw_rlock(lock);
5416			if (md_gen != m->md.pv_gen) {
5417				PMAP_UNLOCK(pmap);
5418				goto restart;
5419			}
5420		}
5421		pte = pmap_pte(pmap, pv->pv_va);
5422		mask = 0;
5423		if (modified) {
5424			PG_M = pmap_modified_bit(pmap);
5425			PG_RW = pmap_rw_bit(pmap);
5426			mask |= PG_RW | PG_M;
5427		}
5428		if (accessed) {
5429			PG_A = pmap_accessed_bit(pmap);
5430			PG_V = pmap_valid_bit(pmap);
5431			mask |= PG_V | PG_A;
5432		}
5433		rv = (*pte & mask) == mask;
5434		PMAP_UNLOCK(pmap);
5435		if (rv)
5436			goto out;
5437	}
5438	if ((m->flags & PG_FICTITIOUS) == 0) {
5439		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5440		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5441			pmap = PV_PMAP(pv);
5442			if (!PMAP_TRYLOCK(pmap)) {
5443				md_gen = m->md.pv_gen;
5444				pvh_gen = pvh->pv_gen;
5445				rw_runlock(lock);
5446				PMAP_LOCK(pmap);
5447				rw_rlock(lock);
5448				if (md_gen != m->md.pv_gen ||
5449				    pvh_gen != pvh->pv_gen) {
5450					PMAP_UNLOCK(pmap);
5451					goto restart;
5452				}
5453			}
5454			pte = pmap_pde(pmap, pv->pv_va);
5455			mask = 0;
5456			if (modified) {
5457				PG_M = pmap_modified_bit(pmap);
5458				PG_RW = pmap_rw_bit(pmap);
5459				mask |= PG_RW | PG_M;
5460			}
5461			if (accessed) {
5462				PG_A = pmap_accessed_bit(pmap);
5463				PG_V = pmap_valid_bit(pmap);
5464				mask |= PG_V | PG_A;
5465			}
5466			rv = (*pte & mask) == mask;
5467			PMAP_UNLOCK(pmap);
5468			if (rv)
5469				goto out;
5470		}
5471	}
5472out:
5473	rw_runlock(lock);
5474	rw_runlock(&pvh_global_lock);
5475	return (rv);
5476}
5477
5478/*
5479 *	pmap_is_modified:
5480 *
5481 *	Return whether or not the specified physical page was modified
5482 *	in any physical maps.
5483 */
5484boolean_t
5485pmap_is_modified(vm_page_t m)
5486{
5487
5488	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5489	    ("pmap_is_modified: page %p is not managed", m));
5490
5491	/*
5492	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5493	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
5494	 * is clear, no PTEs can have PG_M set.
5495	 */
5496	VM_OBJECT_ASSERT_WLOCKED(m->object);
5497	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5498		return (FALSE);
5499	return (pmap_page_test_mappings(m, FALSE, TRUE));
5500}
5501
5502/*
5503 *	pmap_is_prefaultable:
5504 *
5505 *	Return whether or not the specified virtual address is eligible
5506 *	for prefault.
5507 */
5508boolean_t
5509pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
5510{
5511	pd_entry_t *pde;
5512	pt_entry_t *pte, PG_V;
5513	boolean_t rv;
5514
5515	PG_V = pmap_valid_bit(pmap);
5516	rv = FALSE;
5517	PMAP_LOCK(pmap);
5518	pde = pmap_pde(pmap, addr);
5519	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
5520		pte = pmap_pde_to_pte(pde, addr);
5521		rv = (*pte & PG_V) == 0;
5522	}
5523	PMAP_UNLOCK(pmap);
5524	return (rv);
5525}
5526
5527/*
5528 *	pmap_is_referenced:
5529 *
5530 *	Return whether or not the specified physical page was referenced
5531 *	in any physical maps.
5532 */
5533boolean_t
5534pmap_is_referenced(vm_page_t m)
5535{
5536
5537	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5538	    ("pmap_is_referenced: page %p is not managed", m));
5539	return (pmap_page_test_mappings(m, TRUE, FALSE));
5540}
5541
5542/*
5543 * Clear the write and modified bits in each of the given page's mappings.
5544 */
5545void
5546pmap_remove_write(vm_page_t m)
5547{
5548	struct md_page *pvh;
5549	pmap_t pmap;
5550	struct rwlock *lock;
5551	pv_entry_t next_pv, pv;
5552	pd_entry_t *pde;
5553	pt_entry_t oldpte, *pte, PG_M, PG_RW;
5554	vm_offset_t va;
5555	int pvh_gen, md_gen;
5556
5557	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5558	    ("pmap_remove_write: page %p is not managed", m));
5559
5560	/*
5561	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5562	 * set by another thread while the object is locked.  Thus,
5563	 * if PGA_WRITEABLE is clear, no page table entries need updating.
5564	 */
5565	VM_OBJECT_ASSERT_WLOCKED(m->object);
5566	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5567		return;
5568	rw_rlock(&pvh_global_lock);
5569	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5570	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5571retry_pv_loop:
5572	rw_wlock(lock);
5573	if ((m->flags & PG_FICTITIOUS) != 0)
5574		goto small_mappings;
5575	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5576		pmap = PV_PMAP(pv);
5577		if (!PMAP_TRYLOCK(pmap)) {
5578			pvh_gen = pvh->pv_gen;
5579			rw_wunlock(lock);
5580			PMAP_LOCK(pmap);
5581			rw_wlock(lock);
5582			if (pvh_gen != pvh->pv_gen) {
5583				PMAP_UNLOCK(pmap);
5584				rw_wunlock(lock);
5585				goto retry_pv_loop;
5586			}
5587		}
5588		PG_RW = pmap_rw_bit(pmap);
5589		va = pv->pv_va;
5590		pde = pmap_pde(pmap, va);
5591		if ((*pde & PG_RW) != 0)
5592			(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
5593		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5594		    ("inconsistent pv lock %p %p for page %p",
5595		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5596		PMAP_UNLOCK(pmap);
5597	}
5598small_mappings:
5599	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5600		pmap = PV_PMAP(pv);
5601		if (!PMAP_TRYLOCK(pmap)) {
5602			pvh_gen = pvh->pv_gen;
5603			md_gen = m->md.pv_gen;
5604			rw_wunlock(lock);
5605			PMAP_LOCK(pmap);
5606			rw_wlock(lock);
5607			if (pvh_gen != pvh->pv_gen ||
5608			    md_gen != m->md.pv_gen) {
5609				PMAP_UNLOCK(pmap);
5610				rw_wunlock(lock);
5611				goto retry_pv_loop;
5612			}
5613		}
5614		PG_M = pmap_modified_bit(pmap);
5615		PG_RW = pmap_rw_bit(pmap);
5616		pde = pmap_pde(pmap, pv->pv_va);
5617		KASSERT((*pde & PG_PS) == 0,
5618		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
5619		    m));
5620		pte = pmap_pde_to_pte(pde, pv->pv_va);
5621retry:
5622		oldpte = *pte;
5623		if (oldpte & PG_RW) {
5624			if (!atomic_cmpset_long(pte, oldpte, oldpte &
5625			    ~(PG_RW | PG_M)))
5626				goto retry;
5627			if ((oldpte & PG_M) != 0)
5628				vm_page_dirty(m);
5629			pmap_invalidate_page(pmap, pv->pv_va);
5630		}
5631		PMAP_UNLOCK(pmap);
5632	}
5633	rw_wunlock(lock);
5634	vm_page_aflag_clear(m, PGA_WRITEABLE);
5635	rw_runlock(&pvh_global_lock);
5636}
5637
5638static __inline boolean_t
5639safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
5640{
5641
5642	if (!pmap_emulate_ad_bits(pmap))
5643		return (TRUE);
5644
5645	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
5646
5647	/*
5648	 * RWX = 010 or 110 will cause an unconditional EPT misconfiguration
5649	 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared
5650	 * if the EPT_PG_WRITE bit is set.
5651	 */
5652	if ((pte & EPT_PG_WRITE) != 0)
5653		return (FALSE);
5654
5655	/*
5656	 * RWX = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
5657	 */
5658	if ((pte & EPT_PG_EXECUTE) == 0 ||
5659	    ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
5660		return (TRUE);
5661	else
5662		return (FALSE);
5663}
5664
5665#define	PMAP_TS_REFERENCED_MAX	5
5666
5667/*
5668 *	pmap_ts_referenced:
5669 *
5670 *	Return a count of reference bits for a page, clearing those bits.
5671 *	It is not necessary for every reference bit to be cleared, but it
5672 *	is necessary that 0 only be returned when there are truly no
5673 *	reference bits set.
5674 *
5675 *	XXX: The exact number of bits to check and clear is a matter that
5676 *	should be tested and standardized at some point in the future for
5677 *	optimal aging of shared pages.
5678 */
5679int
5680pmap_ts_referenced(vm_page_t m)
5681{
5682	struct md_page *pvh;
5683	pv_entry_t pv, pvf;
5684	pmap_t pmap;
5685	struct rwlock *lock;
5686	pd_entry_t oldpde, *pde;
5687	pt_entry_t *pte, PG_A;
5688	vm_offset_t va;
5689	vm_paddr_t pa;
5690	int cleared, md_gen, not_cleared, pvh_gen;
5691	struct spglist free;
5692	boolean_t demoted;
5693
5694	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5695	    ("pmap_ts_referenced: page %p is not managed", m));
5696	SLIST_INIT(&free);
5697	cleared = 0;
5698	pa = VM_PAGE_TO_PHYS(m);
5699	lock = PHYS_TO_PV_LIST_LOCK(pa);
5700	pvh = pa_to_pvh(pa);
5701	rw_rlock(&pvh_global_lock);
5702	rw_wlock(lock);
5703retry:
5704	not_cleared = 0;
5705	if ((m->flags & PG_FICTITIOUS) != 0 ||
5706	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
5707		goto small_mappings;
5708	pv = pvf;
5709	do {
5710		if (pvf == NULL)
5711			pvf = pv;
5712		pmap = PV_PMAP(pv);
5713		if (!PMAP_TRYLOCK(pmap)) {
5714			pvh_gen = pvh->pv_gen;
5715			rw_wunlock(lock);
5716			PMAP_LOCK(pmap);
5717			rw_wlock(lock);
5718			if (pvh_gen != pvh->pv_gen) {
5719				PMAP_UNLOCK(pmap);
5720				goto retry;
5721			}
5722		}
5723		PG_A = pmap_accessed_bit(pmap);
5724		va = pv->pv_va;
5725		pde = pmap_pde(pmap, pv->pv_va);
5726		oldpde = *pde;
5727		if ((*pde & PG_A) != 0) {
5728			/*
5729			 * Since this reference bit is shared by 512 4KB
5730			 * pages, it should not be cleared every time it is
5731			 * tested.  Apply a simple "hash" function on the
5732			 * physical page number, the virtual superpage number,
5733			 * and the pmap address to select one 4KB page out of
5734			 * the 512 on which testing the reference bit will
5735			 * result in clearing that reference bit.  This
5736			 * function is designed to avoid the selection of the
5737			 * same 4KB page for every 2MB page mapping.
5738			 *
5739			 * On demotion, a mapping that hasn't been referenced
5740			 * is simply destroyed.  To avoid the possibility of a
5741			 * subsequent page fault on a demoted wired mapping,
5742			 * always leave its reference bit set.  Moreover,
5743			 * since the superpage is wired, the current state of
5744			 * its reference bit won't affect page replacement.
5745			 */
5746			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
5747			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
5748			    (*pde & PG_W) == 0) {
5749				if (safe_to_clear_referenced(pmap, oldpde)) {
5750					atomic_clear_long(pde, PG_A);
5751					pmap_invalidate_page(pmap, pv->pv_va);
5752					demoted = FALSE;
5753				} else if (pmap_demote_pde_locked(pmap, pde,
5754				    pv->pv_va, &lock)) {
5755					/*
5756					 * Remove the mapping to a single page
5757					 * so that a subsequent access may
5758					 * repromote.  Since the underlying
5759					 * page table page is fully populated,
5760					 * this removal never frees a page
5761					 * table page.
5762					 */
5763					demoted = TRUE;
5764					va += VM_PAGE_TO_PHYS(m) - (oldpde &
5765					    PG_PS_FRAME);
5766					pte = pmap_pde_to_pte(pde, va);
5767					pmap_remove_pte(pmap, pte, va, *pde,
5768					    NULL, &lock);
5769					pmap_invalidate_page(pmap, va);
5770				} else
5771					demoted = TRUE;
5772
5773				if (demoted) {
5774					/*
5775					 * The superpage mapping was removed
5776					 * entirely and therefore 'pv' is no
5777					 * longer valid.
5778					 */
5779					if (pvf == pv)
5780						pvf = NULL;
5781					pv = NULL;
5782				}
5783				cleared++;
5784				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5785				    ("inconsistent pv lock %p %p for page %p",
5786				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5787			} else
5788				not_cleared++;
5789		}
5790		PMAP_UNLOCK(pmap);
5791		/* Rotate the PV list if it has more than one entry. */
5792		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
5793			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5794			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
5795			pvh->pv_gen++;
5796		}
5797		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
5798			goto out;
5799	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
5800small_mappings:
5801	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
5802		goto out;
5803	pv = pvf;
5804	do {
5805		if (pvf == NULL)
5806			pvf = pv;
5807		pmap = PV_PMAP(pv);
5808		if (!PMAP_TRYLOCK(pmap)) {
5809			pvh_gen = pvh->pv_gen;
5810			md_gen = m->md.pv_gen;
5811			rw_wunlock(lock);
5812			PMAP_LOCK(pmap);
5813			rw_wlock(lock);
5814			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5815				PMAP_UNLOCK(pmap);
5816				goto retry;
5817			}
5818		}
5819		PG_A = pmap_accessed_bit(pmap);
5820		pde = pmap_pde(pmap, pv->pv_va);
5821		KASSERT((*pde & PG_PS) == 0,
5822		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
5823		    m));
5824		pte = pmap_pde_to_pte(pde, pv->pv_va);
5825		if ((*pte & PG_A) != 0) {
5826			if (safe_to_clear_referenced(pmap, *pte)) {
5827				atomic_clear_long(pte, PG_A);
5828				pmap_invalidate_page(pmap, pv->pv_va);
5829				cleared++;
5830			} else if ((*pte & PG_W) == 0) {
5831				/*
5832				 * Wired pages cannot be paged out so
5833				 * doing accessed bit emulation for
5834				 * them is wasted effort. We do the
5835				 * hard work for unwired pages only.
5836				 */
5837				pmap_remove_pte(pmap, pte, pv->pv_va,
5838				    *pde, &free, &lock);
5839				pmap_invalidate_page(pmap, pv->pv_va);
5840				cleared++;
5841				if (pvf == pv)
5842					pvf = NULL;
5843				pv = NULL;
5844				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5845				    ("inconsistent pv lock %p %p for page %p",
5846				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5847			} else
5848				not_cleared++;
5849		}
5850		PMAP_UNLOCK(pmap);
5851		/* Rotate the PV list if it has more than one entry. */
5852		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
5853			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5854			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5855			m->md.pv_gen++;
5856		}
5857	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
5858	    not_cleared < PMAP_TS_REFERENCED_MAX);
5859out:
5860	rw_wunlock(lock);
5861	rw_runlock(&pvh_global_lock);
5862	pmap_free_zero_pages(&free);
5863	return (cleared + not_cleared);
5864}
5865
5866/*
5867 *	Apply the given advice to the specified range of addresses within the
5868 *	given pmap.  Depending on the advice, clear the referenced and/or
5869 *	modified flags in each mapping and set the mapped page's dirty field.
5870 */
5871void
5872pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
5873{
5874	struct rwlock *lock;
5875	pml4_entry_t *pml4e;
5876	pdp_entry_t *pdpe;
5877	pd_entry_t oldpde, *pde;
5878	pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
5879	vm_offset_t va_next;
5880	vm_page_t m;
5881	boolean_t anychanged, pv_lists_locked;
5882
5883	if (advice != MADV_DONTNEED && advice != MADV_FREE)
5884		return;
5885
5886	/*
5887	 * A/D bit emulation requires an alternate code path when clearing
5888	 * the modified and accessed bits below. Since this function is
5889	 * advisory in nature we skip it entirely for pmaps that require
5890	 * A/D bit emulation.
5891	 */
5892	if (pmap_emulate_ad_bits(pmap))
5893		return;
5894
5895	PG_A = pmap_accessed_bit(pmap);
5896	PG_G = pmap_global_bit(pmap);
5897	PG_M = pmap_modified_bit(pmap);
5898	PG_V = pmap_valid_bit(pmap);
5899	PG_RW = pmap_rw_bit(pmap);
5900
5901	pv_lists_locked = FALSE;
5902resume:
5903	anychanged = FALSE;
5904	PMAP_LOCK(pmap);
5905	for (; sva < eva; sva = va_next) {
5906		pml4e = pmap_pml4e(pmap, sva);
5907		if ((*pml4e & PG_V) == 0) {
5908			va_next = (sva + NBPML4) & ~PML4MASK;
5909			if (va_next < sva)
5910				va_next = eva;
5911			continue;
5912		}
5913		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
5914		if ((*pdpe & PG_V) == 0) {
5915			va_next = (sva + NBPDP) & ~PDPMASK;
5916			if (va_next < sva)
5917				va_next = eva;
5918			continue;
5919		}
5920		va_next = (sva + NBPDR) & ~PDRMASK;
5921		if (va_next < sva)
5922			va_next = eva;
5923		pde = pmap_pdpe_to_pde(pdpe, sva);
5924		oldpde = *pde;
5925		if ((oldpde & PG_V) == 0)
5926			continue;
5927		else if ((oldpde & PG_PS) != 0) {
5928			if ((oldpde & PG_MANAGED) == 0)
5929				continue;
5930			if (!pv_lists_locked) {
5931				pv_lists_locked = TRUE;
5932				if (!rw_try_rlock(&pvh_global_lock)) {
5933					if (anychanged)
5934						pmap_invalidate_all(pmap);
5935					PMAP_UNLOCK(pmap);
5936					rw_rlock(&pvh_global_lock);
5937					goto resume;
5938				}
5939			}
5940			lock = NULL;
5941			if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
5942				if (lock != NULL)
5943					rw_wunlock(lock);
5944
5945				/*
5946				 * The large page mapping was destroyed.
5947				 */
5948				continue;
5949			}
5950
5951			/*
5952			 * Unless the page mappings are wired, remove the
5953			 * mapping to a single page so that a subsequent
5954			 * access may repromote.  Since the underlying page
5955			 * table page is fully populated, this removal never
5956			 * frees a page table page.
5957			 */
5958			if ((oldpde & PG_W) == 0) {
5959				pte = pmap_pde_to_pte(pde, sva);
5960				KASSERT((*pte & PG_V) != 0,
5961				    ("pmap_advise: invalid PTE"));
5962				pmap_remove_pte(pmap, pte, sva, *pde, NULL,
5963				    &lock);
5964				anychanged = TRUE;
5965			}
5966			if (lock != NULL)
5967				rw_wunlock(lock);
5968		}
5969		if (va_next > eva)
5970			va_next = eva;
5971		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
5972		    sva += PAGE_SIZE) {
5973			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
5974			    PG_V))
5975				continue;
5976			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5977				if (advice == MADV_DONTNEED) {
5978					/*
5979					 * Future calls to pmap_is_modified()
5980					 * can be avoided by making the page
5981					 * dirty now.
5982					 */
5983					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
5984					vm_page_dirty(m);
5985				}
5986				atomic_clear_long(pte, PG_M | PG_A);
5987			} else if ((*pte & PG_A) != 0)
5988				atomic_clear_long(pte, PG_A);
5989			else
5990				continue;
5991			if ((*pte & PG_G) != 0)
5992				pmap_invalidate_page(pmap, sva);
5993			else
5994				anychanged = TRUE;
5995		}
5996	}
5997	if (anychanged)
5998		pmap_invalidate_all(pmap);
5999	if (pv_lists_locked)
6000		rw_runlock(&pvh_global_lock);
6001	PMAP_UNLOCK(pmap);
6002}
6003
6004/*
6005 *	Clear the modify bits on the specified physical page.
6006 */
6007void
6008pmap_clear_modify(vm_page_t m)
6009{
6010	struct md_page *pvh;
6011	pmap_t pmap;
6012	pv_entry_t next_pv, pv;
6013	pd_entry_t oldpde, *pde;
6014	pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
6015	struct rwlock *lock;
6016	vm_offset_t va;
6017	int md_gen, pvh_gen;
6018
6019	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6020	    ("pmap_clear_modify: page %p is not managed", m));
6021	VM_OBJECT_ASSERT_WLOCKED(m->object);
6022	KASSERT(!vm_page_xbusied(m),
6023	    ("pmap_clear_modify: page %p is exclusive busied", m));
6024
6025	/*
6026	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
6027	 * If the object containing the page is locked and the page is not
6028	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
6029	 */
6030	if ((m->aflags & PGA_WRITEABLE) == 0)
6031		return;
6032	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6033	rw_rlock(&pvh_global_lock);
6034	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6035	rw_wlock(lock);
6036restart:
6037	if ((m->flags & PG_FICTITIOUS) != 0)
6038		goto small_mappings;
6039	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
6040		pmap = PV_PMAP(pv);
6041		if (!PMAP_TRYLOCK(pmap)) {
6042			pvh_gen = pvh->pv_gen;
6043			rw_wunlock(lock);
6044			PMAP_LOCK(pmap);
6045			rw_wlock(lock);
6046			if (pvh_gen != pvh->pv_gen) {
6047				PMAP_UNLOCK(pmap);
6048				goto restart;
6049			}
6050		}
6051		PG_M = pmap_modified_bit(pmap);
6052		PG_V = pmap_valid_bit(pmap);
6053		PG_RW = pmap_rw_bit(pmap);
6054		va = pv->pv_va;
6055		pde = pmap_pde(pmap, va);
6056		oldpde = *pde;
6057		if ((oldpde & PG_RW) != 0) {
6058			if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
6059				if ((oldpde & PG_W) == 0) {
6060					/*
6061					 * Write protect the mapping to a
6062					 * single page so that a subsequent
6063					 * write access may repromote.
6064					 */
6065					va += VM_PAGE_TO_PHYS(m) - (oldpde &
6066					    PG_PS_FRAME);
6067					pte = pmap_pde_to_pte(pde, va);
6068					oldpte = *pte;
6069					if ((oldpte & PG_V) != 0) {
6070						while (!atomic_cmpset_long(pte,
6071						    oldpte,
6072						    oldpte & ~(PG_M | PG_RW)))
6073							oldpte = *pte;
6074						vm_page_dirty(m);
6075						pmap_invalidate_page(pmap, va);
6076					}
6077				}
6078			}
6079		}
6080		PMAP_UNLOCK(pmap);
6081	}
6082small_mappings:
6083	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6084		pmap = PV_PMAP(pv);
6085		if (!PMAP_TRYLOCK(pmap)) {
6086			md_gen = m->md.pv_gen;
6087			pvh_gen = pvh->pv_gen;
6088			rw_wunlock(lock);
6089			PMAP_LOCK(pmap);
6090			rw_wlock(lock);
6091			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6092				PMAP_UNLOCK(pmap);
6093				goto restart;
6094			}
6095		}
6096		PG_M = pmap_modified_bit(pmap);
6097		PG_RW = pmap_rw_bit(pmap);
6098		pde = pmap_pde(pmap, pv->pv_va);
6099		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
6100		    " a 2mpage in page %p's pv list", m));
6101		pte = pmap_pde_to_pte(pde, pv->pv_va);
6102		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6103			atomic_clear_long(pte, PG_M);
6104			pmap_invalidate_page(pmap, pv->pv_va);
6105		}
6106		PMAP_UNLOCK(pmap);
6107	}
6108	rw_wunlock(lock);
6109	rw_runlock(&pvh_global_lock);
6110}
6111
6112/*
6113 * Miscellaneous support routines follow
6114 */
6115
6116/* Adjust the cache mode for a 4KB page mapped via a PTE. */
6117static __inline void
6118pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
6119{
6120	u_int opte, npte;
6121
6122	/*
6123	 * The cache mode bits are all in the low 32-bits of the
6124	 * PTE, so we can just spin on updating the low 32-bits.
6125	 */
6126	do {
6127		opte = *(u_int *)pte;
6128		npte = opte & ~mask;
6129		npte |= cache_bits;
6130	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
6131}
6132
6133/* Adjust the cache mode for a 2MB page mapped via a PDE. */
6134static __inline void
6135pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
6136{
6137	u_int opde, npde;
6138
6139	/*
6140	 * The cache mode bits are all in the low 32-bits of the
6141	 * PDE, so we can just spin on updating the low 32-bits.
6142	 */
6143	do {
6144		opde = *(u_int *)pde;
6145		npde = opde & ~mask;
6146		npde |= cache_bits;
6147	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
6148}
6149
6150/*
6151 * Map a set of physical memory pages into the kernel virtual
6152 * address space. Return a pointer to where it is mapped. This
6153 * routine is intended to be used for mapping device memory,
6154 * NOT real memory.
6155 */
6156void *
6157pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
6158{
6159	vm_offset_t va, offset;
6160	vm_size_t tmpsize;
6161
6162	/*
6163	 * If the specified range of physical addresses fits within the direct
6164	 * map window, use the direct map.
6165	 */
6166	if (pa < dmaplimit && pa + size < dmaplimit) {
6167		va = PHYS_TO_DMAP(pa);
6168		if (!pmap_change_attr(va, size, mode))
6169			return ((void *)va);
6170	}
6171	offset = pa & PAGE_MASK;
6172	size = round_page(offset + size);
6173	va = kva_alloc(size);
6174	if (!va)
6175		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
6176	pa = trunc_page(pa);
6177	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
6178		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
6179	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
6180	pmap_invalidate_cache_range(va, va + tmpsize);
6181	return ((void *)(va + offset));
6182}
6183
6184void *
6185pmap_mapdev(vm_paddr_t pa, vm_size_t size)
6186{
6187
6188	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
6189}
6190
6191void *
6192pmap_mapbios(vm_paddr_t pa, vm_size_t size)
6193{
6194
6195	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
6196}
6197
6198void
6199pmap_unmapdev(vm_offset_t va, vm_size_t size)
6200{
6201	vm_offset_t base, offset;
6202
6203	/* If we gave a direct map region in pmap_mapdev, do nothing */
6204	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
6205		return;
6206	base = trunc_page(va);
6207	offset = va & PAGE_MASK;
6208	size = round_page(offset + size);
6209	kva_free(base, size);
6210}
6211
6212/*
6213 * Tries to demote a 1GB page mapping.
6214 */
6215static boolean_t
6216pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
6217{
6218	pdp_entry_t newpdpe, oldpdpe;
6219	pd_entry_t *firstpde, newpde, *pde;
6220	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
6221	vm_paddr_t mpdepa;
6222	vm_page_t mpde;
6223
6224	PG_A = pmap_accessed_bit(pmap);
6225	PG_M = pmap_modified_bit(pmap);
6226	PG_V = pmap_valid_bit(pmap);
6227	PG_RW = pmap_rw_bit(pmap);
6228
6229	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6230	oldpdpe = *pdpe;
6231	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
6232	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
6233	if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
6234	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
6235		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
6236		    " in pmap %p", va, pmap);
6237		return (FALSE);
6238	}
6239	mpdepa = VM_PAGE_TO_PHYS(mpde);
6240	firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
6241	newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
6242	KASSERT((oldpdpe & PG_A) != 0,
6243	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
6244	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
6245	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
6246	newpde = oldpdpe;
6247
6248	/*
6249	 * Initialize the page directory page.
6250	 */
6251	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
6252		*pde = newpde;
6253		newpde += NBPDR;
6254	}
6255
6256	/*
6257	 * Demote the mapping.
6258	 */
6259	*pdpe = newpdpe;
6260
6261	/*
6262	 * Invalidate a stale recursive mapping of the page directory page.
6263	 */
6264	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
6265
6266	pmap_pdpe_demotions++;
6267	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
6268	    " in pmap %p", va, pmap);
6269	return (TRUE);
6270}
6271
6272/*
6273 * Sets the memory attribute for the specified page.
6274 */
6275void
6276pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
6277{
6278
6279	m->md.pat_mode = ma;
6280
6281	/*
6282	 * If "m" is a normal page, update its direct mapping.  This update
6283	 * can be relied upon to perform any cache operations that are
6284	 * required for data coherence.
6285	 */
6286	if ((m->flags & PG_FICTITIOUS) == 0 &&
6287	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
6288	    m->md.pat_mode))
6289		panic("memory attribute change on the direct map failed");
6290}
6291
6292/*
6293 * Changes the specified virtual address range's memory type to that given by
6294 * the parameter "mode".  The specified virtual address range must be
6295 * completely contained within either the direct map or the kernel map.  If
6296 * the virtual address range is contained within the kernel map, then the
6297 * memory type for each of the corresponding ranges of the direct map is also
6298 * changed.  (The corresponding ranges of the direct map are those ranges that
6299 * map the same physical pages as the specified virtual address range.)  These
6300 * changes to the direct map are necessary because Intel describes the
6301 * behavior of their processors as "undefined" if two or more mappings to the
6302 * same physical page have different memory types.
6303 *
6304 * Returns zero if the change completed successfully, and either EINVAL or
6305 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
6306 * of the virtual address range was not mapped, and ENOMEM is returned if
6307 * there was insufficient memory available to complete the change.  In the
6308 * latter case, the memory type may have been changed on some part of the
6309 * virtual address range or the direct map.
6310 */
6311int
6312pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
6313{
6314	int error;
6315
6316	PMAP_LOCK(kernel_pmap);
6317	error = pmap_change_attr_locked(va, size, mode);
6318	PMAP_UNLOCK(kernel_pmap);
6319	return (error);
6320}
6321
6322static int
6323pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
6324{
6325	vm_offset_t base, offset, tmpva;
6326	vm_paddr_t pa_start, pa_end;
6327	pdp_entry_t *pdpe;
6328	pd_entry_t *pde;
6329	pt_entry_t *pte;
6330	int cache_bits_pte, cache_bits_pde, error;
6331	boolean_t changed;
6332
6333	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6334	base = trunc_page(va);
6335	offset = va & PAGE_MASK;
6336	size = round_page(offset + size);
6337
6338	/*
6339	 * Only supported on kernel virtual addresses, including the direct
6340	 * map but excluding the recursive map.
6341	 */
6342	if (base < DMAP_MIN_ADDRESS)
6343		return (EINVAL);
6344
6345	cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
6346	cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
6347	changed = FALSE;
6348
6349	/*
6350	 * Pages that aren't mapped aren't supported.  Also break down 2MB pages
6351	 * into 4KB pages if required.
6352	 */
6353	for (tmpva = base; tmpva < base + size; ) {
6354		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6355		if (*pdpe == 0)
6356			return (EINVAL);
6357		if (*pdpe & PG_PS) {
6358			/*
6359			 * If the current 1GB page already has the required
6360			 * memory type, then we need not demote this page. Just
6361			 * increment tmpva to the next 1GB page frame.
6362			 */
6363			if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
6364				tmpva = trunc_1gpage(tmpva) + NBPDP;
6365				continue;
6366			}
6367
6368			/*
6369			 * If the current offset aligns with a 1GB page frame
6370			 * and there is at least 1GB left within the range, then
6371			 * we need not break down this page into 2MB pages.
6372			 */
6373			if ((tmpva & PDPMASK) == 0 &&
6374			    tmpva + PDPMASK < base + size) {
6375				tmpva += NBPDP;
6376				continue;
6377			}
6378			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
6379				return (ENOMEM);
6380		}
6381		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6382		if (*pde == 0)
6383			return (EINVAL);
6384		if (*pde & PG_PS) {
6385			/*
6386			 * If the current 2MB page already has the required
6387			 * memory type, then we need not demote this page. Just
6388			 * increment tmpva to the next 2MB page frame.
6389			 */
6390			if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
6391				tmpva = trunc_2mpage(tmpva) + NBPDR;
6392				continue;
6393			}
6394
6395			/*
6396			 * If the current offset aligns with a 2MB page frame
6397			 * and there is at least 2MB left within the range, then
6398			 * we need not break down this page into 4KB pages.
6399			 */
6400			if ((tmpva & PDRMASK) == 0 &&
6401			    tmpva + PDRMASK < base + size) {
6402				tmpva += NBPDR;
6403				continue;
6404			}
6405			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
6406				return (ENOMEM);
6407		}
6408		pte = pmap_pde_to_pte(pde, tmpva);
6409		if (*pte == 0)
6410			return (EINVAL);
6411		tmpva += PAGE_SIZE;
6412	}
6413	error = 0;
6414
6415	/*
6416	 * Ok, all the pages exist, so run through them updating their
6417	 * cache mode if required.
6418	 */
6419	pa_start = pa_end = 0;
6420	for (tmpva = base; tmpva < base + size; ) {
6421		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6422		if (*pdpe & PG_PS) {
6423			if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
6424				pmap_pde_attr(pdpe, cache_bits_pde,
6425				    X86_PG_PDE_CACHE);
6426				changed = TRUE;
6427			}
6428			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6429				if (pa_start == pa_end) {
6430					/* Start physical address run. */
6431					pa_start = *pdpe & PG_PS_FRAME;
6432					pa_end = pa_start + NBPDP;
6433				} else if (pa_end == (*pdpe & PG_PS_FRAME))
6434					pa_end += NBPDP;
6435				else {
6436					/* Run ended, update direct map. */
6437					error = pmap_change_attr_locked(
6438					    PHYS_TO_DMAP(pa_start),
6439					    pa_end - pa_start, mode);
6440					if (error != 0)
6441						break;
6442					/* Start physical address run. */
6443					pa_start = *pdpe & PG_PS_FRAME;
6444					pa_end = pa_start + NBPDP;
6445				}
6446			}
6447			tmpva = trunc_1gpage(tmpva) + NBPDP;
6448			continue;
6449		}
6450		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6451		if (*pde & PG_PS) {
6452			if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
6453				pmap_pde_attr(pde, cache_bits_pde,
6454				    X86_PG_PDE_CACHE);
6455				changed = TRUE;
6456			}
6457			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6458				if (pa_start == pa_end) {
6459					/* Start physical address run. */
6460					pa_start = *pde & PG_PS_FRAME;
6461					pa_end = pa_start + NBPDR;
6462				} else if (pa_end == (*pde & PG_PS_FRAME))
6463					pa_end += NBPDR;
6464				else {
6465					/* Run ended, update direct map. */
6466					error = pmap_change_attr_locked(
6467					    PHYS_TO_DMAP(pa_start),
6468					    pa_end - pa_start, mode);
6469					if (error != 0)
6470						break;
6471					/* Start physical address run. */
6472					pa_start = *pde & PG_PS_FRAME;
6473					pa_end = pa_start + NBPDR;
6474				}
6475			}
6476			tmpva = trunc_2mpage(tmpva) + NBPDR;
6477		} else {
6478			pte = pmap_pde_to_pte(pde, tmpva);
6479			if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
6480				pmap_pte_attr(pte, cache_bits_pte,
6481				    X86_PG_PTE_CACHE);
6482				changed = TRUE;
6483			}
6484			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6485				if (pa_start == pa_end) {
6486					/* Start physical address run. */
6487					pa_start = *pte & PG_FRAME;
6488					pa_end = pa_start + PAGE_SIZE;
6489				} else if (pa_end == (*pte & PG_FRAME))
6490					pa_end += PAGE_SIZE;
6491				else {
6492					/* Run ended, update direct map. */
6493					error = pmap_change_attr_locked(
6494					    PHYS_TO_DMAP(pa_start),
6495					    pa_end - pa_start, mode);
6496					if (error != 0)
6497						break;
6498					/* Start physical address run. */
6499					pa_start = *pte & PG_FRAME;
6500					pa_end = pa_start + PAGE_SIZE;
6501				}
6502			}
6503			tmpva += PAGE_SIZE;
6504		}
6505	}
6506	if (error == 0 && pa_start != pa_end)
6507		error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
6508		    pa_end - pa_start, mode);
6509
6510	/*
6511	 * Flush CPU caches if required to make sure any data isn't cached that
6512	 * shouldn't be, etc.
6513	 */
6514	if (changed) {
6515		pmap_invalidate_range(kernel_pmap, base, tmpva);
6516		pmap_invalidate_cache_range(base, tmpva);
6517	}
6518	return (error);
6519}
6520
6521/*
6522 * Demotes any mapping within the direct map region that covers more than the
6523 * specified range of physical addresses.  This range's size must be a power
6524 * of two and its starting address must be a multiple of its size.  Since the
6525 * demotion does not change any attributes of the mapping, a TLB invalidation
6526 * is not mandatory.  The caller may, however, request a TLB invalidation.
6527 */
6528void
6529pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
6530{
6531	pdp_entry_t *pdpe;
6532	pd_entry_t *pde;
6533	vm_offset_t va;
6534	boolean_t changed;
6535
6536	if (len == 0)
6537		return;
6538	KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
6539	KASSERT((base & (len - 1)) == 0,
6540	    ("pmap_demote_DMAP: base is not a multiple of len"));
6541	if (len < NBPDP && base < dmaplimit) {
6542		va = PHYS_TO_DMAP(base);
6543		changed = FALSE;
6544		PMAP_LOCK(kernel_pmap);
6545		pdpe = pmap_pdpe(kernel_pmap, va);
6546		if ((*pdpe & X86_PG_V) == 0)
6547			panic("pmap_demote_DMAP: invalid PDPE");
6548		if ((*pdpe & PG_PS) != 0) {
6549			if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
6550				panic("pmap_demote_DMAP: PDPE failed");
6551			changed = TRUE;
6552		}
6553		if (len < NBPDR) {
6554			pde = pmap_pdpe_to_pde(pdpe, va);
6555			if ((*pde & X86_PG_V) == 0)
6556				panic("pmap_demote_DMAP: invalid PDE");
6557			if ((*pde & PG_PS) != 0) {
6558				if (!pmap_demote_pde(kernel_pmap, pde, va))
6559					panic("pmap_demote_DMAP: PDE failed");
6560				changed = TRUE;
6561			}
6562		}
6563		if (changed && invalidate)
6564			pmap_invalidate_page(kernel_pmap, va);
6565		PMAP_UNLOCK(kernel_pmap);
6566	}
6567}
6568
6569/*
6570 * perform the pmap work for mincore
6571 */
6572int
6573pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
6574{
6575	pd_entry_t *pdep;
6576	pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
6577	vm_paddr_t pa;
6578	int val;
6579
6580	PG_A = pmap_accessed_bit(pmap);
6581	PG_M = pmap_modified_bit(pmap);
6582	PG_V = pmap_valid_bit(pmap);
6583	PG_RW = pmap_rw_bit(pmap);
6584
6585	PMAP_LOCK(pmap);
6586retry:
6587	pdep = pmap_pde(pmap, addr);
6588	if (pdep != NULL && (*pdep & PG_V)) {
6589		if (*pdep & PG_PS) {
6590			pte = *pdep;
6591			/* Compute the physical address of the 4KB page. */
6592			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
6593			    PG_FRAME;
6594			val = MINCORE_SUPER;
6595		} else {
6596			pte = *pmap_pde_to_pte(pdep, addr);
6597			pa = pte & PG_FRAME;
6598			val = 0;
6599		}
6600	} else {
6601		pte = 0;
6602		pa = 0;
6603		val = 0;
6604	}
6605	if ((pte & PG_V) != 0) {
6606		val |= MINCORE_INCORE;
6607		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6608			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
6609		if ((pte & PG_A) != 0)
6610			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
6611	}
6612	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
6613	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
6614	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
6615		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
6616		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
6617			goto retry;
6618	} else
6619		PA_UNLOCK_COND(*locked_pa);
6620	PMAP_UNLOCK(pmap);
6621	return (val);
6622}
6623
6624void
6625pmap_activate(struct thread *td)
6626{
6627	pmap_t	pmap, oldpmap;
6628	u_int	cpuid;
6629
6630	critical_enter();
6631	pmap = vmspace_pmap(td->td_proc->p_vmspace);
6632	oldpmap = PCPU_GET(curpmap);
6633	cpuid = PCPU_GET(cpuid);
6634#ifdef SMP
6635	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
6636	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
6637	CPU_SET_ATOMIC(cpuid, &pmap->pm_save);
6638#else
6639	CPU_CLR(cpuid, &oldpmap->pm_active);
6640	CPU_SET(cpuid, &pmap->pm_active);
6641	CPU_SET(cpuid, &pmap->pm_save);
6642#endif
6643	td->td_pcb->pcb_cr3 = pmap->pm_cr3;
6644	load_cr3(pmap->pm_cr3);
6645	PCPU_SET(curpmap, pmap);
6646	critical_exit();
6647}
6648
6649void
6650pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
6651{
6652}
6653
6654/*
6655 *	Increase the starting virtual address of the given mapping if a
6656 *	different alignment might result in more superpage mappings.
6657 */
6658void
6659pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
6660    vm_offset_t *addr, vm_size_t size)
6661{
6662	vm_offset_t superpage_offset;
6663
6664	if (size < NBPDR)
6665		return;
6666	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
6667		offset += ptoa(object->pg_color);
6668	superpage_offset = offset & PDRMASK;
6669	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
6670	    (*addr & PDRMASK) == superpage_offset)
6671		return;
6672	if ((*addr & PDRMASK) < superpage_offset)
6673		*addr = (*addr & ~PDRMASK) + superpage_offset;
6674	else
6675		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
6676}
6677
6678#ifdef INVARIANTS
6679static unsigned long num_dirty_emulations;
6680SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
6681	     &num_dirty_emulations, 0, NULL);
6682
6683static unsigned long num_accessed_emulations;
6684SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
6685	     &num_accessed_emulations, 0, NULL);
6686
6687static unsigned long num_superpage_accessed_emulations;
6688SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
6689	     &num_superpage_accessed_emulations, 0, NULL);
6690
6691static unsigned long ad_emulation_superpage_promotions;
6692SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
6693	     &ad_emulation_superpage_promotions, 0, NULL);
6694#endif	/* INVARIANTS */
6695
6696int
6697pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
6698{
6699	int rv;
6700	struct rwlock *lock;
6701	vm_page_t m, mpte;
6702	pd_entry_t *pde;
6703	pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
6704	boolean_t pv_lists_locked;
6705
6706	KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
6707	    ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
6708
6709	if (!pmap_emulate_ad_bits(pmap))
6710		return (-1);
6711
6712	PG_A = pmap_accessed_bit(pmap);
6713	PG_M = pmap_modified_bit(pmap);
6714	PG_V = pmap_valid_bit(pmap);
6715	PG_RW = pmap_rw_bit(pmap);
6716
6717	rv = -1;
6718	lock = NULL;
6719	pv_lists_locked = FALSE;
6720retry:
6721	PMAP_LOCK(pmap);
6722
6723	pde = pmap_pde(pmap, va);
6724	if (pde == NULL || (*pde & PG_V) == 0)
6725		goto done;
6726
6727	if ((*pde & PG_PS) != 0) {
6728		if (ftype == VM_PROT_READ) {
6729#ifdef INVARIANTS
6730			atomic_add_long(&num_superpage_accessed_emulations, 1);
6731#endif
6732			*pde |= PG_A;
6733			rv = 0;
6734		}
6735		goto done;
6736	}
6737
6738	pte = pmap_pde_to_pte(pde, va);
6739	if ((*pte & PG_V) == 0)
6740		goto done;
6741
6742	if (ftype == VM_PROT_WRITE) {
6743		if ((*pte & PG_RW) == 0)
6744			goto done;
6745		*pte |= PG_M;
6746	}
6747	*pte |= PG_A;
6748
6749	/* try to promote the mapping */
6750	if (va < VM_MAXUSER_ADDRESS)
6751		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
6752	else
6753		mpte = NULL;
6754
6755	m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
6756
6757	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
6758	    pmap_ps_enabled(pmap) &&
6759	    (m->flags & PG_FICTITIOUS) == 0 &&
6760	    vm_reserv_level_iffullpop(m) == 0) {
6761		if (!pv_lists_locked) {
6762			pv_lists_locked = TRUE;
6763			if (!rw_try_rlock(&pvh_global_lock)) {
6764				PMAP_UNLOCK(pmap);
6765				rw_rlock(&pvh_global_lock);
6766				goto retry;
6767			}
6768		}
6769		pmap_promote_pde(pmap, pde, va, &lock);
6770#ifdef INVARIANTS
6771		atomic_add_long(&ad_emulation_superpage_promotions, 1);
6772#endif
6773	}
6774#ifdef INVARIANTS
6775	if (ftype == VM_PROT_WRITE)
6776		atomic_add_long(&num_dirty_emulations, 1);
6777	else
6778		atomic_add_long(&num_accessed_emulations, 1);
6779#endif
6780	rv = 0;		/* success */
6781done:
6782	if (lock != NULL)
6783		rw_wunlock(lock);
6784	if (pv_lists_locked)
6785		rw_runlock(&pvh_global_lock);
6786	PMAP_UNLOCK(pmap);
6787	return (rv);
6788}
6789
6790void
6791pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
6792{
6793	pml4_entry_t *pml4;
6794	pdp_entry_t *pdp;
6795	pd_entry_t *pde;
6796	pt_entry_t *pte, PG_V;
6797	int idx;
6798
6799	idx = 0;
6800	PG_V = pmap_valid_bit(pmap);
6801	PMAP_LOCK(pmap);
6802
6803	pml4 = pmap_pml4e(pmap, va);
6804	ptr[idx++] = *pml4;
6805	if ((*pml4 & PG_V) == 0)
6806		goto done;
6807
6808	pdp = pmap_pml4e_to_pdpe(pml4, va);
6809	ptr[idx++] = *pdp;
6810	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
6811		goto done;
6812
6813	pde = pmap_pdpe_to_pde(pdp, va);
6814	ptr[idx++] = *pde;
6815	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
6816		goto done;
6817
6818	pte = pmap_pde_to_pte(pde, va);
6819	ptr[idx++] = *pte;
6820
6821done:
6822	PMAP_UNLOCK(pmap);
6823	*num = idx;
6824}
6825
6826#include "opt_ddb.h"
6827#ifdef DDB
6828#include <ddb/ddb.h>
6829
6830DB_SHOW_COMMAND(pte, pmap_print_pte)
6831{
6832	pmap_t pmap;
6833	pml4_entry_t *pml4;
6834	pdp_entry_t *pdp;
6835	pd_entry_t *pde;
6836	pt_entry_t *pte, PG_V;
6837	vm_offset_t va;
6838
6839	if (have_addr) {
6840		va = (vm_offset_t)addr;
6841		pmap = PCPU_GET(curpmap); /* XXX */
6842	} else {
6843		db_printf("show pte addr\n");
6844		return;
6845	}
6846	PG_V = pmap_valid_bit(pmap);
6847	pml4 = pmap_pml4e(pmap, va);
6848	db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
6849	if ((*pml4 & PG_V) == 0) {
6850		db_printf("\n");
6851		return;
6852	}
6853	pdp = pmap_pml4e_to_pdpe(pml4, va);
6854	db_printf(" pdpe %#016lx", *pdp);
6855	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
6856		db_printf("\n");
6857		return;
6858	}
6859	pde = pmap_pdpe_to_pde(pdp, va);
6860	db_printf(" pde %#016lx", *pde);
6861	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
6862		db_printf("\n");
6863		return;
6864	}
6865	pte = pmap_pde_to_pte(pde, va);
6866	db_printf(" pte %#016lx\n", *pte);
6867}
6868
6869DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
6870{
6871	vm_paddr_t a;
6872
6873	if (have_addr) {
6874		a = (vm_paddr_t)addr;
6875		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
6876	} else {
6877		db_printf("show phys2dmap addr\n");
6878	}
6879}
6880#endif
6881