pmap.c revision 284021
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * the Systems Programming Group of the University of Utah Computer
15 * Science Department and William Jolitz of UUNET Technologies Inc.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 *    must display the following acknowledgement:
27 *	This product includes software developed by the University of
28 *	California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 *    may be used to endorse or promote products derived from this software
31 *    without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * SUCH DAMAGE.
44 *
45 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
46 */
47/*-
48 * Copyright (c) 2003 Networks Associates Technology, Inc.
49 * All rights reserved.
50 *
51 * This software was developed for the FreeBSD Project by Jake Burkholder,
52 * Safeport Network Services, and Network Associates Laboratories, the
53 * Security Research Division of Network Associates, Inc. under
54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55 * CHATS research program.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 *    notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 *    notice, this list of conditions and the following disclaimer in the
64 *    documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 */
78
79#define	AMD64_NPT_AWARE
80
81#include <sys/cdefs.h>
82__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/pmap.c 284021 2015-06-05 08:36:25Z kib $");
83
84/*
85 *	Manages physical address maps.
86 *
87 *	Since the information managed by this module is
88 *	also stored by the logical address mapping module,
89 *	this module may throw away valid virtual-to-physical
90 *	mappings at almost any time.  However, invalidations
91 *	of virtual-to-physical mappings must be done as
92 *	requested.
93 *
94 *	In order to cope with hardware architectures which
95 *	make virtual-to-physical map invalidates expensive,
96 *	this module may delay invalidate or reduced protection
97 *	operations until such time as they are actually
98 *	necessary.  This module is given full information as
99 *	to which processors are currently using which maps,
100 *	and to when physical maps must be made correct.
101 */
102
103#include "opt_pmap.h"
104#include "opt_vm.h"
105
106#include <sys/param.h>
107#include <sys/bus.h>
108#include <sys/systm.h>
109#include <sys/kernel.h>
110#include <sys/ktr.h>
111#include <sys/lock.h>
112#include <sys/malloc.h>
113#include <sys/mman.h>
114#include <sys/mutex.h>
115#include <sys/proc.h>
116#include <sys/rwlock.h>
117#include <sys/sx.h>
118#include <sys/vmmeter.h>
119#include <sys/sched.h>
120#include <sys/sysctl.h>
121#include <sys/_unrhdr.h>
122#include <sys/smp.h>
123
124#include <vm/vm.h>
125#include <vm/vm_param.h>
126#include <vm/vm_kern.h>
127#include <vm/vm_page.h>
128#include <vm/vm_map.h>
129#include <vm/vm_object.h>
130#include <vm/vm_extern.h>
131#include <vm/vm_pageout.h>
132#include <vm/vm_pager.h>
133#include <vm/vm_phys.h>
134#include <vm/vm_radix.h>
135#include <vm/vm_reserv.h>
136#include <vm/uma.h>
137
138#include <machine/intr_machdep.h>
139#include <machine/apicvar.h>
140#include <machine/cpu.h>
141#include <machine/cputypes.h>
142#include <machine/md_var.h>
143#include <machine/pcb.h>
144#include <machine/specialreg.h>
145#ifdef SMP
146#include <machine/smp.h>
147#endif
148
149static __inline boolean_t
150pmap_type_guest(pmap_t pmap)
151{
152
153	return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
154}
155
156static __inline boolean_t
157pmap_emulate_ad_bits(pmap_t pmap)
158{
159
160	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
161}
162
163static __inline pt_entry_t
164pmap_valid_bit(pmap_t pmap)
165{
166	pt_entry_t mask;
167
168	switch (pmap->pm_type) {
169	case PT_X86:
170	case PT_RVI:
171		mask = X86_PG_V;
172		break;
173	case PT_EPT:
174		if (pmap_emulate_ad_bits(pmap))
175			mask = EPT_PG_EMUL_V;
176		else
177			mask = EPT_PG_READ;
178		break;
179	default:
180		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
181	}
182
183	return (mask);
184}
185
186static __inline pt_entry_t
187pmap_rw_bit(pmap_t pmap)
188{
189	pt_entry_t mask;
190
191	switch (pmap->pm_type) {
192	case PT_X86:
193	case PT_RVI:
194		mask = X86_PG_RW;
195		break;
196	case PT_EPT:
197		if (pmap_emulate_ad_bits(pmap))
198			mask = EPT_PG_EMUL_RW;
199		else
200			mask = EPT_PG_WRITE;
201		break;
202	default:
203		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
204	}
205
206	return (mask);
207}
208
209static __inline pt_entry_t
210pmap_global_bit(pmap_t pmap)
211{
212	pt_entry_t mask;
213
214	switch (pmap->pm_type) {
215	case PT_X86:
216		mask = X86_PG_G;
217		break;
218	case PT_RVI:
219	case PT_EPT:
220		mask = 0;
221		break;
222	default:
223		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
224	}
225
226	return (mask);
227}
228
229static __inline pt_entry_t
230pmap_accessed_bit(pmap_t pmap)
231{
232	pt_entry_t mask;
233
234	switch (pmap->pm_type) {
235	case PT_X86:
236	case PT_RVI:
237		mask = X86_PG_A;
238		break;
239	case PT_EPT:
240		if (pmap_emulate_ad_bits(pmap))
241			mask = EPT_PG_READ;
242		else
243			mask = EPT_PG_A;
244		break;
245	default:
246		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
247	}
248
249	return (mask);
250}
251
252static __inline pt_entry_t
253pmap_modified_bit(pmap_t pmap)
254{
255	pt_entry_t mask;
256
257	switch (pmap->pm_type) {
258	case PT_X86:
259	case PT_RVI:
260		mask = X86_PG_M;
261		break;
262	case PT_EPT:
263		if (pmap_emulate_ad_bits(pmap))
264			mask = EPT_PG_WRITE;
265		else
266			mask = EPT_PG_M;
267		break;
268	default:
269		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
270	}
271
272	return (mask);
273}
274
275#if !defined(DIAGNOSTIC)
276#ifdef __GNUC_GNU_INLINE__
277#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
278#else
279#define PMAP_INLINE	extern inline
280#endif
281#else
282#define PMAP_INLINE
283#endif
284
285#ifdef PV_STATS
286#define PV_STAT(x)	do { x ; } while (0)
287#else
288#define PV_STAT(x)	do { } while (0)
289#endif
290
291#define	pa_index(pa)	((pa) >> PDRSHIFT)
292#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
293
294#define	NPV_LIST_LOCKS	MAXCPU
295
296#define	PHYS_TO_PV_LIST_LOCK(pa)	\
297			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
298
299#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
300	struct rwlock **_lockp = (lockp);		\
301	struct rwlock *_new_lock;			\
302							\
303	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
304	if (_new_lock != *_lockp) {			\
305		if (*_lockp != NULL)			\
306			rw_wunlock(*_lockp);		\
307		*_lockp = _new_lock;			\
308		rw_wlock(*_lockp);			\
309	}						\
310} while (0)
311
312#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
313			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
314
315#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
316	struct rwlock **_lockp = (lockp);		\
317							\
318	if (*_lockp != NULL) {				\
319		rw_wunlock(*_lockp);			\
320		*_lockp = NULL;				\
321	}						\
322} while (0)
323
324#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
325			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
326
327struct pmap kernel_pmap_store;
328
329vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
330vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
331
332int nkpt;
333SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
334    "Number of kernel page table pages allocated on bootup");
335
336static int ndmpdp;
337vm_paddr_t dmaplimit;
338vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
339pt_entry_t pg_nx;
340
341static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
342
343static int pat_works = 1;
344SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
345    "Is page attribute table fully functional?");
346
347static int pg_ps_enabled = 1;
348SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
349    "Are large page mappings enabled?");
350
351#define	PAT_INDEX_SIZE	8
352static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
353
354static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
355static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
356u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
357u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
358
359static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
360static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
361static int		ndmpdpphys;	/* number of DMPDPphys pages */
362
363static struct rwlock_padalign pvh_global_lock;
364
365/*
366 * Data for the pv entry allocation mechanism
367 */
368static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
369static struct mtx pv_chunks_mutex;
370static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
371static struct md_page *pv_table;
372
373/*
374 * All those kernel PT submaps that BSD is so fond of
375 */
376pt_entry_t *CMAP1 = 0;
377caddr_t CADDR1 = 0;
378
379static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
380
381static struct unrhdr pcid_unr;
382static struct mtx pcid_mtx;
383int pmap_pcid_enabled = 0;
384SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
385    0, "Is TLB Context ID enabled ?");
386int invpcid_works = 0;
387SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
388    "Is the invpcid instruction available ?");
389
390static int
391pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
392{
393	int i;
394	uint64_t res;
395
396	res = 0;
397	CPU_FOREACH(i) {
398		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
399	}
400	return (sysctl_handle_64(oidp, &res, 0, req));
401}
402SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
403    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
404    "Count of saved TLB context on switch");
405
406/* pmap_copy_pages() over non-DMAP */
407static struct mtx cpage_lock;
408static vm_offset_t cpage_a;
409static vm_offset_t cpage_b;
410
411/*
412 * Crashdump maps.
413 */
414static caddr_t crashdumpmap;
415
416static void	free_pv_chunk(struct pv_chunk *pc);
417static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
418static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
419static int	popcnt_pc_map_elem(uint64_t elem);
420static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
421static void	reserve_pv_entries(pmap_t pmap, int needed,
422		    struct rwlock **lockp);
423static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
424		    struct rwlock **lockp);
425static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
426		    struct rwlock **lockp);
427static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
428		    struct rwlock **lockp);
429static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
430static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
431		    vm_offset_t va);
432
433static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
434static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
435static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
436    vm_offset_t va, struct rwlock **lockp);
437static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
438    vm_offset_t va);
439static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
440    vm_prot_t prot, struct rwlock **lockp);
441static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
442    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
443static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
444static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
445static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
446static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
447static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
448static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
449    struct rwlock **lockp);
450static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
451    vm_prot_t prot);
452static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
453static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
454    struct spglist *free, struct rwlock **lockp);
455static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
456    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
457static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
458static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
459    struct spglist *free);
460static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
461    vm_page_t m, struct rwlock **lockp);
462static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
463    pd_entry_t newpde);
464static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
465
466static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
467		struct rwlock **lockp);
468static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
469		struct rwlock **lockp);
470static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
471		struct rwlock **lockp);
472
473static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
474    struct spglist *free);
475static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
476static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
477
478/*
479 * Move the kernel virtual free pointer to the next
480 * 2MB.  This is used to help improve performance
481 * by using a large (2MB) page for much of the kernel
482 * (.text, .data, .bss)
483 */
484static vm_offset_t
485pmap_kmem_choose(vm_offset_t addr)
486{
487	vm_offset_t newaddr = addr;
488
489	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
490	return (newaddr);
491}
492
493/********************/
494/* Inline functions */
495/********************/
496
497/* Return a non-clipped PD index for a given VA */
498static __inline vm_pindex_t
499pmap_pde_pindex(vm_offset_t va)
500{
501	return (va >> PDRSHIFT);
502}
503
504
505/* Return various clipped indexes for a given VA */
506static __inline vm_pindex_t
507pmap_pte_index(vm_offset_t va)
508{
509
510	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
511}
512
513static __inline vm_pindex_t
514pmap_pde_index(vm_offset_t va)
515{
516
517	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
518}
519
520static __inline vm_pindex_t
521pmap_pdpe_index(vm_offset_t va)
522{
523
524	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
525}
526
527static __inline vm_pindex_t
528pmap_pml4e_index(vm_offset_t va)
529{
530
531	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
532}
533
534/* Return a pointer to the PML4 slot that corresponds to a VA */
535static __inline pml4_entry_t *
536pmap_pml4e(pmap_t pmap, vm_offset_t va)
537{
538
539	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
540}
541
542/* Return a pointer to the PDP slot that corresponds to a VA */
543static __inline pdp_entry_t *
544pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
545{
546	pdp_entry_t *pdpe;
547
548	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
549	return (&pdpe[pmap_pdpe_index(va)]);
550}
551
552/* Return a pointer to the PDP slot that corresponds to a VA */
553static __inline pdp_entry_t *
554pmap_pdpe(pmap_t pmap, vm_offset_t va)
555{
556	pml4_entry_t *pml4e;
557	pt_entry_t PG_V;
558
559	PG_V = pmap_valid_bit(pmap);
560	pml4e = pmap_pml4e(pmap, va);
561	if ((*pml4e & PG_V) == 0)
562		return (NULL);
563	return (pmap_pml4e_to_pdpe(pml4e, va));
564}
565
566/* Return a pointer to the PD slot that corresponds to a VA */
567static __inline pd_entry_t *
568pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
569{
570	pd_entry_t *pde;
571
572	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
573	return (&pde[pmap_pde_index(va)]);
574}
575
576/* Return a pointer to the PD slot that corresponds to a VA */
577static __inline pd_entry_t *
578pmap_pde(pmap_t pmap, vm_offset_t va)
579{
580	pdp_entry_t *pdpe;
581	pt_entry_t PG_V;
582
583	PG_V = pmap_valid_bit(pmap);
584	pdpe = pmap_pdpe(pmap, va);
585	if (pdpe == NULL || (*pdpe & PG_V) == 0)
586		return (NULL);
587	return (pmap_pdpe_to_pde(pdpe, va));
588}
589
590/* Return a pointer to the PT slot that corresponds to a VA */
591static __inline pt_entry_t *
592pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
593{
594	pt_entry_t *pte;
595
596	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
597	return (&pte[pmap_pte_index(va)]);
598}
599
600/* Return a pointer to the PT slot that corresponds to a VA */
601static __inline pt_entry_t *
602pmap_pte(pmap_t pmap, vm_offset_t va)
603{
604	pd_entry_t *pde;
605	pt_entry_t PG_V;
606
607	PG_V = pmap_valid_bit(pmap);
608	pde = pmap_pde(pmap, va);
609	if (pde == NULL || (*pde & PG_V) == 0)
610		return (NULL);
611	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
612		return ((pt_entry_t *)pde);
613	return (pmap_pde_to_pte(pde, va));
614}
615
616static __inline void
617pmap_resident_count_inc(pmap_t pmap, int count)
618{
619
620	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
621	pmap->pm_stats.resident_count += count;
622}
623
624static __inline void
625pmap_resident_count_dec(pmap_t pmap, int count)
626{
627
628	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
629	KASSERT(pmap->pm_stats.resident_count >= count,
630	    ("pmap %p resident count underflow %ld %d", pmap,
631	    pmap->pm_stats.resident_count, count));
632	pmap->pm_stats.resident_count -= count;
633}
634
635PMAP_INLINE pt_entry_t *
636vtopte(vm_offset_t va)
637{
638	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
639
640	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
641
642	return (PTmap + ((va >> PAGE_SHIFT) & mask));
643}
644
645static __inline pd_entry_t *
646vtopde(vm_offset_t va)
647{
648	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
649
650	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
651
652	return (PDmap + ((va >> PDRSHIFT) & mask));
653}
654
655static u_int64_t
656allocpages(vm_paddr_t *firstaddr, int n)
657{
658	u_int64_t ret;
659
660	ret = *firstaddr;
661	bzero((void *)ret, n * PAGE_SIZE);
662	*firstaddr += n * PAGE_SIZE;
663	return (ret);
664}
665
666CTASSERT(powerof2(NDMPML4E));
667
668/* number of kernel PDP slots */
669#define	NKPDPE(ptpgs)		howmany((ptpgs), NPDEPG)
670
671static void
672nkpt_init(vm_paddr_t addr)
673{
674	int pt_pages;
675
676#ifdef NKPT
677	pt_pages = NKPT;
678#else
679	pt_pages = howmany(addr, 1 << PDRSHIFT);
680	pt_pages += NKPDPE(pt_pages);
681
682	/*
683	 * Add some slop beyond the bare minimum required for bootstrapping
684	 * the kernel.
685	 *
686	 * This is quite important when allocating KVA for kernel modules.
687	 * The modules are required to be linked in the negative 2GB of
688	 * the address space.  If we run out of KVA in this region then
689	 * pmap_growkernel() will need to allocate page table pages to map
690	 * the entire 512GB of KVA space which is an unnecessary tax on
691	 * physical memory.
692	 */
693	pt_pages += 8;		/* 16MB additional slop for kernel modules */
694#endif
695	nkpt = pt_pages;
696}
697
698static void
699create_pagetables(vm_paddr_t *firstaddr)
700{
701	int i, j, ndm1g, nkpdpe;
702	pt_entry_t *pt_p;
703	pd_entry_t *pd_p;
704	pdp_entry_t *pdp_p;
705	pml4_entry_t *p4_p;
706
707	/* Allocate page table pages for the direct map */
708	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
709	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
710		ndmpdp = 4;
711	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
712	if (ndmpdpphys > NDMPML4E) {
713		/*
714		 * Each NDMPML4E allows 512 GB, so limit to that,
715		 * and then readjust ndmpdp and ndmpdpphys.
716		 */
717		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
718		Maxmem = atop(NDMPML4E * NBPML4);
719		ndmpdpphys = NDMPML4E;
720		ndmpdp = NDMPML4E * NPDEPG;
721	}
722	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
723	ndm1g = 0;
724	if ((amd_feature & AMDID_PAGE1GB) != 0)
725		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
726	if (ndm1g < ndmpdp)
727		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
728	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
729
730	/* Allocate pages */
731	KPML4phys = allocpages(firstaddr, 1);
732	KPDPphys = allocpages(firstaddr, NKPML4E);
733
734	/*
735	 * Allocate the initial number of kernel page table pages required to
736	 * bootstrap.  We defer this until after all memory-size dependent
737	 * allocations are done (e.g. direct map), so that we don't have to
738	 * build in too much slop in our estimate.
739	 *
740	 * Note that when NKPML4E > 1, we have an empty page underneath
741	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
742	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
743	 */
744	nkpt_init(*firstaddr);
745	nkpdpe = NKPDPE(nkpt);
746
747	KPTphys = allocpages(firstaddr, nkpt);
748	KPDphys = allocpages(firstaddr, nkpdpe);
749
750	/* Fill in the underlying page table pages */
751	/* Nominally read-only (but really R/W) from zero to physfree */
752	/* XXX not fully used, underneath 2M pages */
753	pt_p = (pt_entry_t *)KPTphys;
754	for (i = 0; ptoa(i) < *firstaddr; i++)
755		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
756
757	/* Now map the page tables at their location within PTmap */
758	pd_p = (pd_entry_t *)KPDphys;
759	for (i = 0; i < nkpt; i++)
760		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
761
762	/* Map from zero to end of allocations under 2M pages */
763	/* This replaces some of the KPTphys entries above */
764	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
765		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
766		    X86_PG_G;
767
768	/* And connect up the PD to the PDP (leaving room for L4 pages) */
769	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
770	for (i = 0; i < nkpdpe; i++)
771		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
772		    PG_U;
773
774	/*
775	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
776	 * the end of physical memory is not aligned to a 1GB page boundary,
777	 * then the residual physical memory is mapped with 2MB pages.  Later,
778	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
779	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
780	 * that are partially used.
781	 */
782	pd_p = (pd_entry_t *)DMPDphys;
783	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
784		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
785		/* Preset PG_M and PG_A because demotion expects it. */
786		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
787		    X86_PG_M | X86_PG_A;
788	}
789	pdp_p = (pdp_entry_t *)DMPDPphys;
790	for (i = 0; i < ndm1g; i++) {
791		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
792		/* Preset PG_M and PG_A because demotion expects it. */
793		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
794		    X86_PG_M | X86_PG_A;
795	}
796	for (j = 0; i < ndmpdp; i++, j++) {
797		pdp_p[i] = DMPDphys + ptoa(j);
798		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
799	}
800
801	/* And recursively map PML4 to itself in order to get PTmap */
802	p4_p = (pml4_entry_t *)KPML4phys;
803	p4_p[PML4PML4I] = KPML4phys;
804	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;
805
806	/* Connect the Direct Map slot(s) up to the PML4. */
807	for (i = 0; i < ndmpdpphys; i++) {
808		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
809		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
810	}
811
812	/* Connect the KVA slots up to the PML4 */
813	for (i = 0; i < NKPML4E; i++) {
814		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
815		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
816	}
817}
818
819/*
820 *	Bootstrap the system enough to run with virtual memory.
821 *
822 *	On amd64 this is called after mapping has already been enabled
823 *	and just syncs the pmap module with what has already been done.
824 *	[We can't call it easily with mapping off since the kernel is not
825 *	mapped with PA == VA, hence we would have to relocate every address
826 *	from the linked base (virtual) address "KERNBASE" to the actual
827 *	(physical) address starting relative to 0]
828 */
829void
830pmap_bootstrap(vm_paddr_t *firstaddr)
831{
832	vm_offset_t va;
833	pt_entry_t *pte;
834
835	/*
836	 * Create an initial set of page tables to run the kernel in.
837	 */
838	create_pagetables(firstaddr);
839
840	/*
841	 * Add a physical memory segment (vm_phys_seg) corresponding to the
842	 * preallocated kernel page table pages so that vm_page structures
843	 * representing these pages will be created.  The vm_page structures
844	 * are required for promotion of the corresponding kernel virtual
845	 * addresses to superpage mappings.
846	 */
847	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
848
849	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
850	virtual_avail = pmap_kmem_choose(virtual_avail);
851
852	virtual_end = VM_MAX_KERNEL_ADDRESS;
853
854
855	/* XXX do %cr0 as well */
856	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
857	load_cr3(KPML4phys);
858	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
859		load_cr4(rcr4() | CR4_SMEP);
860
861	/*
862	 * Initialize the kernel pmap (which is statically allocated).
863	 */
864	PMAP_LOCK_INIT(kernel_pmap);
865	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
866	kernel_pmap->pm_cr3 = KPML4phys;
867	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
868	CPU_FILL(&kernel_pmap->pm_save);	/* always superset of pm_active */
869	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
870	kernel_pmap->pm_flags = pmap_flags;
871
872 	/*
873	 * Initialize the global pv list lock.
874	 */
875	rw_init(&pvh_global_lock, "pmap pv global");
876
877	/*
878	 * Reserve some special page table entries/VA space for temporary
879	 * mapping of pages.
880	 */
881#define	SYSMAP(c, p, v, n)	\
882	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
883
884	va = virtual_avail;
885	pte = vtopte(va);
886
887	/*
888	 * Crashdump maps.  The first page is reused as CMAP1 for the
889	 * memory test.
890	 */
891	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
892	CADDR1 = crashdumpmap;
893
894	virtual_avail = va;
895
896	/* Initialize the PAT MSR. */
897	pmap_init_pat();
898
899	/* Initialize TLB Context Id. */
900	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
901	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
902		load_cr4(rcr4() | CR4_PCIDE);
903		mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF);
904		init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx);
905		/* Check for INVPCID support */
906		invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
907		    != 0;
908		kernel_pmap->pm_pcid = 0;
909#ifndef SMP
910		pmap_pcid_enabled = 0;
911#endif
912	} else
913		pmap_pcid_enabled = 0;
914}
915
916/*
917 * Setup the PAT MSR.
918 */
919void
920pmap_init_pat(void)
921{
922	int pat_table[PAT_INDEX_SIZE];
923	uint64_t pat_msr;
924	u_long cr0, cr4;
925	int i;
926
927	/* Bail if this CPU doesn't implement PAT. */
928	if ((cpu_feature & CPUID_PAT) == 0)
929		panic("no PAT??");
930
931	/* Set default PAT index table. */
932	for (i = 0; i < PAT_INDEX_SIZE; i++)
933		pat_table[i] = -1;
934	pat_table[PAT_WRITE_BACK] = 0;
935	pat_table[PAT_WRITE_THROUGH] = 1;
936	pat_table[PAT_UNCACHEABLE] = 3;
937	pat_table[PAT_WRITE_COMBINING] = 3;
938	pat_table[PAT_WRITE_PROTECTED] = 3;
939	pat_table[PAT_UNCACHED] = 3;
940
941	/* Initialize default PAT entries. */
942	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
943	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
944	    PAT_VALUE(2, PAT_UNCACHED) |
945	    PAT_VALUE(3, PAT_UNCACHEABLE) |
946	    PAT_VALUE(4, PAT_WRITE_BACK) |
947	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
948	    PAT_VALUE(6, PAT_UNCACHED) |
949	    PAT_VALUE(7, PAT_UNCACHEABLE);
950
951	if (pat_works) {
952		/*
953		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
954		 * Program 5 and 6 as WP and WC.
955		 * Leave 4 and 7 as WB and UC.
956		 */
957		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
958		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
959		    PAT_VALUE(6, PAT_WRITE_COMBINING);
960		pat_table[PAT_UNCACHED] = 2;
961		pat_table[PAT_WRITE_PROTECTED] = 5;
962		pat_table[PAT_WRITE_COMBINING] = 6;
963	} else {
964		/*
965		 * Just replace PAT Index 2 with WC instead of UC-.
966		 */
967		pat_msr &= ~PAT_MASK(2);
968		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
969		pat_table[PAT_WRITE_COMBINING] = 2;
970	}
971
972	/* Disable PGE. */
973	cr4 = rcr4();
974	load_cr4(cr4 & ~CR4_PGE);
975
976	/* Disable caches (CD = 1, NW = 0). */
977	cr0 = rcr0();
978	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
979
980	/* Flushes caches and TLBs. */
981	wbinvd();
982	invltlb();
983
984	/* Update PAT and index table. */
985	wrmsr(MSR_PAT, pat_msr);
986	for (i = 0; i < PAT_INDEX_SIZE; i++)
987		pat_index[i] = pat_table[i];
988
989	/* Flush caches and TLBs again. */
990	wbinvd();
991	invltlb();
992
993	/* Restore caches and PGE. */
994	load_cr0(cr0);
995	load_cr4(cr4);
996}
997
998/*
999 *	Initialize a vm_page's machine-dependent fields.
1000 */
1001void
1002pmap_page_init(vm_page_t m)
1003{
1004
1005	TAILQ_INIT(&m->md.pv_list);
1006	m->md.pat_mode = PAT_WRITE_BACK;
1007}
1008
1009/*
1010 *	Initialize the pmap module.
1011 *	Called by vm_init, to initialize any structures that the pmap
1012 *	system needs to map virtual memory.
1013 */
1014void
1015pmap_init(void)
1016{
1017	vm_page_t mpte;
1018	vm_size_t s;
1019	int i, pv_npg;
1020
1021	/*
1022	 * Initialize the vm page array entries for the kernel pmap's
1023	 * page table pages.
1024	 */
1025	for (i = 0; i < nkpt; i++) {
1026		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
1027		KASSERT(mpte >= vm_page_array &&
1028		    mpte < &vm_page_array[vm_page_array_size],
1029		    ("pmap_init: page table page is out of range"));
1030		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
1031		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
1032	}
1033
1034	/*
1035	 * If the kernel is running on a virtual machine, then it must assume
1036	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
1037	 * be prepared for the hypervisor changing the vendor and family that
1038	 * are reported by CPUID.  Consequently, the workaround for AMD Family
1039	 * 10h Erratum 383 is enabled if the processor's feature set does not
1040	 * include at least one feature that is only supported by older Intel
1041	 * or newer AMD processors.
1042	 */
1043	if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
1044	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
1045	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
1046	    AMDID2_FMA4)) == 0)
1047		workaround_erratum383 = 1;
1048
1049	/*
1050	 * Are large page mappings enabled?
1051	 */
1052	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
1053	if (pg_ps_enabled) {
1054		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1055		    ("pmap_init: can't assign to pagesizes[1]"));
1056		pagesizes[1] = NBPDR;
1057	}
1058
1059	/*
1060	 * Initialize the pv chunk list mutex.
1061	 */
1062	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1063
1064	/*
1065	 * Initialize the pool of pv list locks.
1066	 */
1067	for (i = 0; i < NPV_LIST_LOCKS; i++)
1068		rw_init(&pv_list_locks[i], "pmap pv list");
1069
1070	/*
1071	 * Calculate the size of the pv head table for superpages.
1072	 */
1073	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
1074
1075	/*
1076	 * Allocate memory for the pv head table for superpages.
1077	 */
1078	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1079	s = round_page(s);
1080	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
1081	    M_WAITOK | M_ZERO);
1082	for (i = 0; i < pv_npg; i++)
1083		TAILQ_INIT(&pv_table[i].pv_list);
1084
1085	mtx_init(&cpage_lock, "cpage", NULL, MTX_DEF);
1086	cpage_a = kva_alloc(PAGE_SIZE);
1087	cpage_b = kva_alloc(PAGE_SIZE);
1088}
1089
1090static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
1091    "2MB page mapping counters");
1092
1093static u_long pmap_pde_demotions;
1094SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
1095    &pmap_pde_demotions, 0, "2MB page demotions");
1096
1097static u_long pmap_pde_mappings;
1098SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
1099    &pmap_pde_mappings, 0, "2MB page mappings");
1100
1101static u_long pmap_pde_p_failures;
1102SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
1103    &pmap_pde_p_failures, 0, "2MB page promotion failures");
1104
1105static u_long pmap_pde_promotions;
1106SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
1107    &pmap_pde_promotions, 0, "2MB page promotions");
1108
1109static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
1110    "1GB page mapping counters");
1111
1112static u_long pmap_pdpe_demotions;
1113SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
1114    &pmap_pdpe_demotions, 0, "1GB page demotions");
1115
1116/***************************************************
1117 * Low level helper routines.....
1118 ***************************************************/
1119
1120static pt_entry_t
1121pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
1122{
1123	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
1124
1125	switch (pmap->pm_type) {
1126	case PT_X86:
1127	case PT_RVI:
1128		/* Verify that both PAT bits are not set at the same time */
1129		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
1130		    ("Invalid PAT bits in entry %#lx", entry));
1131
1132		/* Swap the PAT bits if one of them is set */
1133		if ((entry & x86_pat_bits) != 0)
1134			entry ^= x86_pat_bits;
1135		break;
1136	case PT_EPT:
1137		/*
1138		 * Nothing to do - the memory attributes are represented
1139		 * the same way for regular pages and superpages.
1140		 */
1141		break;
1142	default:
1143		panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
1144	}
1145
1146	return (entry);
1147}
1148
1149/*
1150 * Determine the appropriate bits to set in a PTE or PDE for a specified
1151 * caching mode.
1152 */
1153static int
1154pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
1155{
1156	int cache_bits, pat_flag, pat_idx;
1157
1158	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
1159		panic("Unknown caching mode %d\n", mode);
1160
1161	switch (pmap->pm_type) {
1162	case PT_X86:
1163	case PT_RVI:
1164		/* The PAT bit is different for PTE's and PDE's. */
1165		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
1166
1167		/* Map the caching mode to a PAT index. */
1168		pat_idx = pat_index[mode];
1169
1170		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
1171		cache_bits = 0;
1172		if (pat_idx & 0x4)
1173			cache_bits |= pat_flag;
1174		if (pat_idx & 0x2)
1175			cache_bits |= PG_NC_PCD;
1176		if (pat_idx & 0x1)
1177			cache_bits |= PG_NC_PWT;
1178		break;
1179
1180	case PT_EPT:
1181		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
1182		break;
1183
1184	default:
1185		panic("unsupported pmap type %d", pmap->pm_type);
1186	}
1187
1188	return (cache_bits);
1189}
1190
1191static int
1192pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
1193{
1194	int mask;
1195
1196	switch (pmap->pm_type) {
1197	case PT_X86:
1198	case PT_RVI:
1199		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
1200		break;
1201	case PT_EPT:
1202		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
1203		break;
1204	default:
1205		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
1206	}
1207
1208	return (mask);
1209}
1210
1211static __inline boolean_t
1212pmap_ps_enabled(pmap_t pmap)
1213{
1214
1215	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
1216}
1217
1218static void
1219pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
1220{
1221
1222	switch (pmap->pm_type) {
1223	case PT_X86:
1224		break;
1225	case PT_RVI:
1226	case PT_EPT:
1227		/*
1228		 * XXX
1229		 * This is a little bogus since the generation number is
1230		 * supposed to be bumped up when a region of the address
1231		 * space is invalidated in the page tables.
1232		 *
1233		 * In this case the old PDE entry is valid but yet we want
1234		 * to make sure that any mappings using the old entry are
1235		 * invalidated in the TLB.
1236		 *
1237		 * The reason this works as expected is because we rendezvous
1238		 * "all" host cpus and force any vcpu context to exit as a
1239		 * side-effect.
1240		 */
1241		atomic_add_acq_long(&pmap->pm_eptgen, 1);
1242		break;
1243	default:
1244		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
1245	}
1246	pde_store(pde, newpde);
1247}
1248
1249/*
1250 * After changing the page size for the specified virtual address in the page
1251 * table, flush the corresponding entries from the processor's TLB.  Only the
1252 * calling processor's TLB is affected.
1253 *
1254 * The calling thread must be pinned to a processor.
1255 */
1256static void
1257pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
1258{
1259	pt_entry_t PG_G;
1260
1261	if (pmap_type_guest(pmap))
1262		return;
1263
1264	KASSERT(pmap->pm_type == PT_X86,
1265	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
1266
1267	PG_G = pmap_global_bit(pmap);
1268
1269	if ((newpde & PG_PS) == 0)
1270		/* Demotion: flush a specific 2MB page mapping. */
1271		invlpg(va);
1272	else if ((newpde & PG_G) == 0)
1273		/*
1274		 * Promotion: flush every 4KB page mapping from the TLB
1275		 * because there are too many to flush individually.
1276		 */
1277		invltlb();
1278	else {
1279		/*
1280		 * Promotion: flush every 4KB page mapping from the TLB,
1281		 * including any global (PG_G) mappings.
1282		 */
1283		invltlb_globpcid();
1284	}
1285}
1286#ifdef SMP
1287
1288static void
1289pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
1290{
1291	struct invpcid_descr d;
1292	uint64_t cr3;
1293
1294	if (invpcid_works) {
1295		d.pcid = pmap->pm_pcid;
1296		d.pad = 0;
1297		d.addr = va;
1298		invpcid(&d, INVPCID_ADDR);
1299		return;
1300	}
1301
1302	cr3 = rcr3();
1303	critical_enter();
1304	load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
1305	invlpg(va);
1306	load_cr3(cr3 | CR3_PCID_SAVE);
1307	critical_exit();
1308}
1309
1310/*
1311 * For SMP, these functions have to use the IPI mechanism for coherence.
1312 *
1313 * N.B.: Before calling any of the following TLB invalidation functions,
1314 * the calling processor must ensure that all stores updating a non-
1315 * kernel page table are globally performed.  Otherwise, another
1316 * processor could cache an old, pre-update entry without being
1317 * invalidated.  This can happen one of two ways: (1) The pmap becomes
1318 * active on another processor after its pm_active field is checked by
1319 * one of the following functions but before a store updating the page
1320 * table is globally performed. (2) The pmap becomes active on another
1321 * processor before its pm_active field is checked but due to
1322 * speculative loads one of the following functions stills reads the
1323 * pmap as inactive on the other processor.
1324 *
1325 * The kernel page table is exempt because its pm_active field is
1326 * immutable.  The kernel page table is always active on every
1327 * processor.
1328 */
1329
1330/*
1331 * Interrupt the cpus that are executing in the guest context.
1332 * This will force the vcpu to exit and the cached EPT mappings
1333 * will be invalidated by the host before the next vmresume.
1334 */
1335static __inline void
1336pmap_invalidate_ept(pmap_t pmap)
1337{
1338	int ipinum;
1339
1340	sched_pin();
1341	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1342	    ("pmap_invalidate_ept: absurd pm_active"));
1343
1344	/*
1345	 * The TLB mappings associated with a vcpu context are not
1346	 * flushed each time a different vcpu is chosen to execute.
1347	 *
1348	 * This is in contrast with a process's vtop mappings that
1349	 * are flushed from the TLB on each context switch.
1350	 *
1351	 * Therefore we need to do more than just a TLB shootdown on
1352	 * the active cpus in 'pmap->pm_active'. To do this we keep
1353	 * track of the number of invalidations performed on this pmap.
1354	 *
1355	 * Each vcpu keeps a cache of this counter and compares it
1356	 * just before a vmresume. If the counter is out-of-date an
1357	 * invept will be done to flush stale mappings from the TLB.
1358	 */
1359	atomic_add_acq_long(&pmap->pm_eptgen, 1);
1360
1361	/*
1362	 * Force the vcpu to exit and trap back into the hypervisor.
1363	 */
1364	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
1365	ipi_selected(pmap->pm_active, ipinum);
1366	sched_unpin();
1367}
1368
1369void
1370pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1371{
1372	cpuset_t other_cpus;
1373	u_int cpuid;
1374
1375	if (pmap_type_guest(pmap)) {
1376		pmap_invalidate_ept(pmap);
1377		return;
1378	}
1379
1380	KASSERT(pmap->pm_type == PT_X86,
1381	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
1382
1383	sched_pin();
1384	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1385		if (!pmap_pcid_enabled) {
1386			invlpg(va);
1387		} else {
1388			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1389				if (pmap == PCPU_GET(curpmap))
1390					invlpg(va);
1391				else
1392					pmap_invalidate_page_pcid(pmap, va);
1393			} else {
1394				invltlb_globpcid();
1395			}
1396		}
1397		smp_invlpg(pmap, va);
1398	} else {
1399		cpuid = PCPU_GET(cpuid);
1400		other_cpus = all_cpus;
1401		CPU_CLR(cpuid, &other_cpus);
1402		if (CPU_ISSET(cpuid, &pmap->pm_active))
1403			invlpg(va);
1404		else if (pmap_pcid_enabled) {
1405			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
1406				pmap_invalidate_page_pcid(pmap, va);
1407			else
1408				invltlb_globpcid();
1409		}
1410		if (pmap_pcid_enabled)
1411			CPU_AND(&other_cpus, &pmap->pm_save);
1412		else
1413			CPU_AND(&other_cpus, &pmap->pm_active);
1414		if (!CPU_EMPTY(&other_cpus))
1415			smp_masked_invlpg(other_cpus, pmap, va);
1416	}
1417	sched_unpin();
1418}
1419
1420static void
1421pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1422{
1423	struct invpcid_descr d;
1424	uint64_t cr3;
1425	vm_offset_t addr;
1426
1427	if (invpcid_works) {
1428		d.pcid = pmap->pm_pcid;
1429		d.pad = 0;
1430		for (addr = sva; addr < eva; addr += PAGE_SIZE) {
1431			d.addr = addr;
1432			invpcid(&d, INVPCID_ADDR);
1433		}
1434		return;
1435	}
1436
1437	cr3 = rcr3();
1438	critical_enter();
1439	load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
1440	for (addr = sva; addr < eva; addr += PAGE_SIZE)
1441		invlpg(addr);
1442	load_cr3(cr3 | CR3_PCID_SAVE);
1443	critical_exit();
1444}
1445
1446void
1447pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1448{
1449	cpuset_t other_cpus;
1450	vm_offset_t addr;
1451	u_int cpuid;
1452
1453	if (pmap_type_guest(pmap)) {
1454		pmap_invalidate_ept(pmap);
1455		return;
1456	}
1457
1458	KASSERT(pmap->pm_type == PT_X86,
1459	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
1460
1461	sched_pin();
1462	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1463		if (!pmap_pcid_enabled) {
1464			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1465				invlpg(addr);
1466		} else {
1467			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1468				if (pmap == PCPU_GET(curpmap)) {
1469					for (addr = sva; addr < eva;
1470					    addr += PAGE_SIZE)
1471						invlpg(addr);
1472				} else {
1473					pmap_invalidate_range_pcid(pmap,
1474					    sva, eva);
1475				}
1476			} else {
1477				invltlb_globpcid();
1478			}
1479		}
1480		smp_invlpg_range(pmap, sva, eva);
1481	} else {
1482		cpuid = PCPU_GET(cpuid);
1483		other_cpus = all_cpus;
1484		CPU_CLR(cpuid, &other_cpus);
1485		if (CPU_ISSET(cpuid, &pmap->pm_active)) {
1486			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1487				invlpg(addr);
1488		} else if (pmap_pcid_enabled) {
1489			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
1490				pmap_invalidate_range_pcid(pmap, sva, eva);
1491			else
1492				invltlb_globpcid();
1493		}
1494		if (pmap_pcid_enabled)
1495			CPU_AND(&other_cpus, &pmap->pm_save);
1496		else
1497			CPU_AND(&other_cpus, &pmap->pm_active);
1498		if (!CPU_EMPTY(&other_cpus))
1499			smp_masked_invlpg_range(other_cpus, pmap, sva, eva);
1500	}
1501	sched_unpin();
1502}
1503
1504void
1505pmap_invalidate_all(pmap_t pmap)
1506{
1507	cpuset_t other_cpus;
1508	struct invpcid_descr d;
1509	uint64_t cr3;
1510	u_int cpuid;
1511
1512	if (pmap_type_guest(pmap)) {
1513		pmap_invalidate_ept(pmap);
1514		return;
1515	}
1516
1517	KASSERT(pmap->pm_type == PT_X86,
1518	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
1519
1520	sched_pin();
1521	cpuid = PCPU_GET(cpuid);
1522	if (pmap == kernel_pmap ||
1523	    (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) ||
1524	    !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1525		if (invpcid_works) {
1526			bzero(&d, sizeof(d));
1527			invpcid(&d, INVPCID_CTXGLOB);
1528		} else {
1529			invltlb_globpcid();
1530		}
1531		if (!CPU_ISSET(cpuid, &pmap->pm_active))
1532			CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
1533		smp_invltlb(pmap);
1534	} else {
1535		other_cpus = all_cpus;
1536		CPU_CLR(cpuid, &other_cpus);
1537
1538		/*
1539		 * This logic is duplicated in the Xinvltlb shootdown
1540		 * IPI handler.
1541		 */
1542		if (pmap_pcid_enabled) {
1543			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1544				if (invpcid_works) {
1545					d.pcid = pmap->pm_pcid;
1546					d.pad = 0;
1547					d.addr = 0;
1548					invpcid(&d, INVPCID_CTX);
1549				} else {
1550					cr3 = rcr3();
1551					critical_enter();
1552
1553					/*
1554					 * Bit 63 is clear, pcid TLB
1555					 * entries are invalidated.
1556					 */
1557					load_cr3(pmap->pm_cr3);
1558					load_cr3(cr3 | CR3_PCID_SAVE);
1559					critical_exit();
1560				}
1561			} else {
1562				invltlb_globpcid();
1563			}
1564		} else if (CPU_ISSET(cpuid, &pmap->pm_active))
1565			invltlb();
1566		if (!CPU_ISSET(cpuid, &pmap->pm_active))
1567			CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
1568		if (pmap_pcid_enabled)
1569			CPU_AND(&other_cpus, &pmap->pm_save);
1570		else
1571			CPU_AND(&other_cpus, &pmap->pm_active);
1572		if (!CPU_EMPTY(&other_cpus))
1573			smp_masked_invltlb(other_cpus, pmap);
1574	}
1575	sched_unpin();
1576}
1577
1578void
1579pmap_invalidate_cache(void)
1580{
1581
1582	sched_pin();
1583	wbinvd();
1584	smp_cache_flush();
1585	sched_unpin();
1586}
1587
1588struct pde_action {
1589	cpuset_t invalidate;	/* processors that invalidate their TLB */
1590	pmap_t pmap;
1591	vm_offset_t va;
1592	pd_entry_t *pde;
1593	pd_entry_t newpde;
1594	u_int store;		/* processor that updates the PDE */
1595};
1596
1597static void
1598pmap_update_pde_action(void *arg)
1599{
1600	struct pde_action *act = arg;
1601
1602	if (act->store == PCPU_GET(cpuid))
1603		pmap_update_pde_store(act->pmap, act->pde, act->newpde);
1604}
1605
1606static void
1607pmap_update_pde_teardown(void *arg)
1608{
1609	struct pde_action *act = arg;
1610
1611	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1612		pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
1613}
1614
1615/*
1616 * Change the page size for the specified virtual address in a way that
1617 * prevents any possibility of the TLB ever having two entries that map the
1618 * same virtual address using different page sizes.  This is the recommended
1619 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1620 * machine check exception for a TLB state that is improperly diagnosed as a
1621 * hardware error.
1622 */
1623static void
1624pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1625{
1626	struct pde_action act;
1627	cpuset_t active, other_cpus;
1628	u_int cpuid;
1629
1630	sched_pin();
1631	cpuid = PCPU_GET(cpuid);
1632	other_cpus = all_cpus;
1633	CPU_CLR(cpuid, &other_cpus);
1634	if (pmap == kernel_pmap || pmap_type_guest(pmap))
1635		active = all_cpus;
1636	else {
1637		active = pmap->pm_active;
1638		CPU_AND_ATOMIC(&pmap->pm_save, &active);
1639	}
1640	if (CPU_OVERLAP(&active, &other_cpus)) {
1641		act.store = cpuid;
1642		act.invalidate = active;
1643		act.va = va;
1644		act.pmap = pmap;
1645		act.pde = pde;
1646		act.newpde = newpde;
1647		CPU_SET(cpuid, &active);
1648		smp_rendezvous_cpus(active,
1649		    smp_no_rendevous_barrier, pmap_update_pde_action,
1650		    pmap_update_pde_teardown, &act);
1651	} else {
1652		pmap_update_pde_store(pmap, pde, newpde);
1653		if (CPU_ISSET(cpuid, &active))
1654			pmap_update_pde_invalidate(pmap, va, newpde);
1655	}
1656	sched_unpin();
1657}
1658#else /* !SMP */
1659/*
1660 * Normal, non-SMP, invalidation functions.
1661 * We inline these within pmap.c for speed.
1662 */
1663PMAP_INLINE void
1664pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1665{
1666
1667	switch (pmap->pm_type) {
1668	case PT_X86:
1669		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1670			invlpg(va);
1671		break;
1672	case PT_RVI:
1673	case PT_EPT:
1674		pmap->pm_eptgen++;
1675		break;
1676	default:
1677		panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type);
1678	}
1679}
1680
1681PMAP_INLINE void
1682pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1683{
1684	vm_offset_t addr;
1685
1686	switch (pmap->pm_type) {
1687	case PT_X86:
1688		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1689			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1690				invlpg(addr);
1691		break;
1692	case PT_RVI:
1693	case PT_EPT:
1694		pmap->pm_eptgen++;
1695		break;
1696	default:
1697		panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
1698	}
1699}
1700
1701PMAP_INLINE void
1702pmap_invalidate_all(pmap_t pmap)
1703{
1704
1705	switch (pmap->pm_type) {
1706	case PT_X86:
1707		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1708			invltlb();
1709		break;
1710	case PT_RVI:
1711	case PT_EPT:
1712		pmap->pm_eptgen++;
1713		break;
1714	default:
1715		panic("pmap_invalidate_all: unknown type %d", pmap->pm_type);
1716	}
1717}
1718
1719PMAP_INLINE void
1720pmap_invalidate_cache(void)
1721{
1722
1723	wbinvd();
1724}
1725
1726static void
1727pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1728{
1729
1730	pmap_update_pde_store(pmap, pde, newpde);
1731	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1732		pmap_update_pde_invalidate(pmap, va, newpde);
1733	else
1734		CPU_ZERO(&pmap->pm_save);
1735}
1736#endif /* !SMP */
1737
1738#define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
1739
1740void
1741pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
1742{
1743
1744	if (force) {
1745		sva &= ~(vm_offset_t)cpu_clflush_line_size;
1746	} else {
1747		KASSERT((sva & PAGE_MASK) == 0,
1748		    ("pmap_invalidate_cache_range: sva not page-aligned"));
1749		KASSERT((eva & PAGE_MASK) == 0,
1750		    ("pmap_invalidate_cache_range: eva not page-aligned"));
1751	}
1752
1753	if ((cpu_feature & CPUID_SS) != 0 && !force)
1754		; /* If "Self Snoop" is supported and allowed, do nothing. */
1755	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1756	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1757
1758		/*
1759		 * XXX: Some CPUs fault, hang, or trash the local APIC
1760		 * registers if we use CLFLUSH on the local APIC
1761		 * range.  The local APIC is always uncached, so we
1762		 * don't need to flush for that range anyway.
1763		 */
1764		if (pmap_kextract(sva) == lapic_paddr)
1765			return;
1766
1767		/*
1768		 * Otherwise, do per-cache line flush.  Use the mfence
1769		 * instruction to insure that previous stores are
1770		 * included in the write-back.  The processor
1771		 * propagates flush to other processors in the cache
1772		 * coherence domain.
1773		 */
1774		mfence();
1775		for (; sva < eva; sva += cpu_clflush_line_size)
1776			clflush(sva);
1777		mfence();
1778	} else {
1779
1780		/*
1781		 * No targeted cache flush methods are supported by CPU,
1782		 * or the supplied range is bigger than 2MB.
1783		 * Globally invalidate cache.
1784		 */
1785		pmap_invalidate_cache();
1786	}
1787}
1788
1789/*
1790 * Remove the specified set of pages from the data and instruction caches.
1791 *
1792 * In contrast to pmap_invalidate_cache_range(), this function does not
1793 * rely on the CPU's self-snoop feature, because it is intended for use
1794 * when moving pages into a different cache domain.
1795 */
1796void
1797pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1798{
1799	vm_offset_t daddr, eva;
1800	int i;
1801
1802	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1803	    (cpu_feature & CPUID_CLFSH) == 0)
1804		pmap_invalidate_cache();
1805	else {
1806		mfence();
1807		for (i = 0; i < count; i++) {
1808			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
1809			eva = daddr + PAGE_SIZE;
1810			for (; daddr < eva; daddr += cpu_clflush_line_size)
1811				clflush(daddr);
1812		}
1813		mfence();
1814	}
1815}
1816
1817/*
1818 *	Routine:	pmap_extract
1819 *	Function:
1820 *		Extract the physical page address associated
1821 *		with the given map/virtual_address pair.
1822 */
1823vm_paddr_t
1824pmap_extract(pmap_t pmap, vm_offset_t va)
1825{
1826	pdp_entry_t *pdpe;
1827	pd_entry_t *pde;
1828	pt_entry_t *pte, PG_V;
1829	vm_paddr_t pa;
1830
1831	pa = 0;
1832	PG_V = pmap_valid_bit(pmap);
1833	PMAP_LOCK(pmap);
1834	pdpe = pmap_pdpe(pmap, va);
1835	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1836		if ((*pdpe & PG_PS) != 0)
1837			pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
1838		else {
1839			pde = pmap_pdpe_to_pde(pdpe, va);
1840			if ((*pde & PG_V) != 0) {
1841				if ((*pde & PG_PS) != 0) {
1842					pa = (*pde & PG_PS_FRAME) |
1843					    (va & PDRMASK);
1844				} else {
1845					pte = pmap_pde_to_pte(pde, va);
1846					pa = (*pte & PG_FRAME) |
1847					    (va & PAGE_MASK);
1848				}
1849			}
1850		}
1851	}
1852	PMAP_UNLOCK(pmap);
1853	return (pa);
1854}
1855
1856/*
1857 *	Routine:	pmap_extract_and_hold
1858 *	Function:
1859 *		Atomically extract and hold the physical page
1860 *		with the given pmap and virtual address pair
1861 *		if that mapping permits the given protection.
1862 */
1863vm_page_t
1864pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1865{
1866	pd_entry_t pde, *pdep;
1867	pt_entry_t pte, PG_RW, PG_V;
1868	vm_paddr_t pa;
1869	vm_page_t m;
1870
1871	pa = 0;
1872	m = NULL;
1873	PG_RW = pmap_rw_bit(pmap);
1874	PG_V = pmap_valid_bit(pmap);
1875	PMAP_LOCK(pmap);
1876retry:
1877	pdep = pmap_pde(pmap, va);
1878	if (pdep != NULL && (pde = *pdep)) {
1879		if (pde & PG_PS) {
1880			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1881				if (vm_page_pa_tryrelock(pmap, (pde &
1882				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1883					goto retry;
1884				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1885				    (va & PDRMASK));
1886				vm_page_hold(m);
1887			}
1888		} else {
1889			pte = *pmap_pde_to_pte(pdep, va);
1890			if ((pte & PG_V) &&
1891			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1892				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1893				    &pa))
1894					goto retry;
1895				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1896				vm_page_hold(m);
1897			}
1898		}
1899	}
1900	PA_UNLOCK_COND(pa);
1901	PMAP_UNLOCK(pmap);
1902	return (m);
1903}
1904
1905vm_paddr_t
1906pmap_kextract(vm_offset_t va)
1907{
1908	pd_entry_t pde;
1909	vm_paddr_t pa;
1910
1911	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1912		pa = DMAP_TO_PHYS(va);
1913	} else {
1914		pde = *vtopde(va);
1915		if (pde & PG_PS) {
1916			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
1917		} else {
1918			/*
1919			 * Beware of a concurrent promotion that changes the
1920			 * PDE at this point!  For example, vtopte() must not
1921			 * be used to access the PTE because it would use the
1922			 * new PDE.  It is, however, safe to use the old PDE
1923			 * because the page table page is preserved by the
1924			 * promotion.
1925			 */
1926			pa = *pmap_pde_to_pte(&pde, va);
1927			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
1928		}
1929	}
1930	return (pa);
1931}
1932
1933/***************************************************
1934 * Low level mapping routines.....
1935 ***************************************************/
1936
1937/*
1938 * Add a wired page to the kva.
1939 * Note: not SMP coherent.
1940 */
1941PMAP_INLINE void
1942pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1943{
1944	pt_entry_t *pte;
1945
1946	pte = vtopte(va);
1947	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
1948}
1949
1950static __inline void
1951pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1952{
1953	pt_entry_t *pte;
1954	int cache_bits;
1955
1956	pte = vtopte(va);
1957	cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
1958	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
1959}
1960
1961/*
1962 * Remove a page from the kernel pagetables.
1963 * Note: not SMP coherent.
1964 */
1965PMAP_INLINE void
1966pmap_kremove(vm_offset_t va)
1967{
1968	pt_entry_t *pte;
1969
1970	pte = vtopte(va);
1971	pte_clear(pte);
1972}
1973
1974/*
1975 *	Used to map a range of physical addresses into kernel
1976 *	virtual address space.
1977 *
1978 *	The value passed in '*virt' is a suggested virtual address for
1979 *	the mapping. Architectures which can support a direct-mapped
1980 *	physical to virtual region can return the appropriate address
1981 *	within that region, leaving '*virt' unchanged. Other
1982 *	architectures should map the pages starting at '*virt' and
1983 *	update '*virt' with the first usable address after the mapped
1984 *	region.
1985 */
1986vm_offset_t
1987pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1988{
1989	return PHYS_TO_DMAP(start);
1990}
1991
1992
1993/*
1994 * Add a list of wired pages to the kva
1995 * this routine is only used for temporary
1996 * kernel mappings that do not need to have
1997 * page modification or references recorded.
1998 * Note that old mappings are simply written
1999 * over.  The page *must* be wired.
2000 * Note: SMP coherent.  Uses a ranged shootdown IPI.
2001 */
2002void
2003pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2004{
2005	pt_entry_t *endpte, oldpte, pa, *pte;
2006	vm_page_t m;
2007	int cache_bits;
2008
2009	oldpte = 0;
2010	pte = vtopte(sva);
2011	endpte = pte + count;
2012	while (pte < endpte) {
2013		m = *ma++;
2014		cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
2015		pa = VM_PAGE_TO_PHYS(m) | cache_bits;
2016		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
2017			oldpte |= *pte;
2018			pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
2019		}
2020		pte++;
2021	}
2022	if (__predict_false((oldpte & X86_PG_V) != 0))
2023		pmap_invalidate_range(kernel_pmap, sva, sva + count *
2024		    PAGE_SIZE);
2025}
2026
2027/*
2028 * This routine tears out page mappings from the
2029 * kernel -- it is meant only for temporary mappings.
2030 * Note: SMP coherent.  Uses a ranged shootdown IPI.
2031 */
2032void
2033pmap_qremove(vm_offset_t sva, int count)
2034{
2035	vm_offset_t va;
2036
2037	va = sva;
2038	while (count-- > 0) {
2039		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
2040		pmap_kremove(va);
2041		va += PAGE_SIZE;
2042	}
2043	pmap_invalidate_range(kernel_pmap, sva, va);
2044}
2045
2046/***************************************************
2047 * Page table page management routines.....
2048 ***************************************************/
2049static __inline void
2050pmap_free_zero_pages(struct spglist *free)
2051{
2052	vm_page_t m;
2053
2054	while ((m = SLIST_FIRST(free)) != NULL) {
2055		SLIST_REMOVE_HEAD(free, plinks.s.ss);
2056		/* Preserve the page's PG_ZERO setting. */
2057		vm_page_free_toq(m);
2058	}
2059}
2060
2061/*
2062 * Schedule the specified unused page table page to be freed.  Specifically,
2063 * add the page to the specified list of pages that will be released to the
2064 * physical memory manager after the TLB has been updated.
2065 */
2066static __inline void
2067pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
2068    boolean_t set_PG_ZERO)
2069{
2070
2071	if (set_PG_ZERO)
2072		m->flags |= PG_ZERO;
2073	else
2074		m->flags &= ~PG_ZERO;
2075	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2076}
2077
2078/*
2079 * Inserts the specified page table page into the specified pmap's collection
2080 * of idle page table pages.  Each of a pmap's page table pages is responsible
2081 * for mapping a distinct range of virtual addresses.  The pmap's collection is
2082 * ordered by this virtual address range.
2083 */
2084static __inline int
2085pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
2086{
2087
2088	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2089	return (vm_radix_insert(&pmap->pm_root, mpte));
2090}
2091
2092/*
2093 * Looks for a page table page mapping the specified virtual address in the
2094 * specified pmap's collection of idle page table pages.  Returns NULL if there
2095 * is no page table page corresponding to the specified virtual address.
2096 */
2097static __inline vm_page_t
2098pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
2099{
2100
2101	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2102	return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
2103}
2104
2105/*
2106 * Removes the specified page table page from the specified pmap's collection
2107 * of idle page table pages.  The specified page table page must be a member of
2108 * the pmap's collection.
2109 */
2110static __inline void
2111pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
2112{
2113
2114	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2115	vm_radix_remove(&pmap->pm_root, mpte->pindex);
2116}
2117
2118/*
2119 * Decrements a page table page's wire count, which is used to record the
2120 * number of valid page table entries within the page.  If the wire count
2121 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
2122 * page table page was unmapped and FALSE otherwise.
2123 */
2124static inline boolean_t
2125pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2126{
2127
2128	--m->wire_count;
2129	if (m->wire_count == 0) {
2130		_pmap_unwire_ptp(pmap, va, m, free);
2131		return (TRUE);
2132	} else
2133		return (FALSE);
2134}
2135
2136static void
2137_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2138{
2139
2140	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2141	/*
2142	 * unmap the page table page
2143	 */
2144	if (m->pindex >= (NUPDE + NUPDPE)) {
2145		/* PDP page */
2146		pml4_entry_t *pml4;
2147		pml4 = pmap_pml4e(pmap, va);
2148		*pml4 = 0;
2149	} else if (m->pindex >= NUPDE) {
2150		/* PD page */
2151		pdp_entry_t *pdp;
2152		pdp = pmap_pdpe(pmap, va);
2153		*pdp = 0;
2154	} else {
2155		/* PTE page */
2156		pd_entry_t *pd;
2157		pd = pmap_pde(pmap, va);
2158		*pd = 0;
2159	}
2160	pmap_resident_count_dec(pmap, 1);
2161	if (m->pindex < NUPDE) {
2162		/* We just released a PT, unhold the matching PD */
2163		vm_page_t pdpg;
2164
2165		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
2166		pmap_unwire_ptp(pmap, va, pdpg, free);
2167	}
2168	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
2169		/* We just released a PD, unhold the matching PDP */
2170		vm_page_t pdppg;
2171
2172		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
2173		pmap_unwire_ptp(pmap, va, pdppg, free);
2174	}
2175
2176	/*
2177	 * This is a release store so that the ordinary store unmapping
2178	 * the page table page is globally performed before TLB shoot-
2179	 * down is begun.
2180	 */
2181	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
2182
2183	/*
2184	 * Put page on a list so that it is released after
2185	 * *ALL* TLB shootdown is done
2186	 */
2187	pmap_add_delayed_free_list(m, free, TRUE);
2188}
2189
2190/*
2191 * After removing a page table entry, this routine is used to
2192 * conditionally free the page, and manage the hold/wire counts.
2193 */
2194static int
2195pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2196    struct spglist *free)
2197{
2198	vm_page_t mpte;
2199
2200	if (va >= VM_MAXUSER_ADDRESS)
2201		return (0);
2202	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2203	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
2204	return (pmap_unwire_ptp(pmap, va, mpte, free));
2205}
2206
2207void
2208pmap_pinit0(pmap_t pmap)
2209{
2210
2211	PMAP_LOCK_INIT(pmap);
2212	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
2213	pmap->pm_cr3 = KPML4phys;
2214	pmap->pm_root.rt_root = 0;
2215	CPU_ZERO(&pmap->pm_active);
2216	CPU_ZERO(&pmap->pm_save);
2217	PCPU_SET(curpmap, pmap);
2218	TAILQ_INIT(&pmap->pm_pvchunk);
2219	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2220	pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
2221	pmap->pm_flags = pmap_flags;
2222}
2223
2224/*
2225 * Initialize a preallocated and zeroed pmap structure,
2226 * such as one in a vmspace structure.
2227 */
2228int
2229pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
2230{
2231	vm_page_t pml4pg;
2232	vm_paddr_t pml4phys;
2233	int i;
2234
2235	/*
2236	 * allocate the page directory page
2237	 */
2238	while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2239	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
2240		VM_WAIT;
2241
2242	pml4phys = VM_PAGE_TO_PHYS(pml4pg);
2243	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
2244	pmap->pm_pcid = -1;
2245	pmap->pm_cr3 = ~0;	/* initialize to an invalid value */
2246
2247	if ((pml4pg->flags & PG_ZERO) == 0)
2248		pagezero(pmap->pm_pml4);
2249
2250	/*
2251	 * Do not install the host kernel mappings in the nested page
2252	 * tables. These mappings are meaningless in the guest physical
2253	 * address space.
2254	 */
2255	if ((pmap->pm_type = pm_type) == PT_X86) {
2256		pmap->pm_cr3 = pml4phys;
2257
2258		/* Wire in kernel global address entries. */
2259		for (i = 0; i < NKPML4E; i++) {
2260			pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) |
2261			    X86_PG_RW | X86_PG_V | PG_U;
2262		}
2263		for (i = 0; i < ndmpdpphys; i++) {
2264			pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) |
2265			    X86_PG_RW | X86_PG_V | PG_U;
2266		}
2267
2268		/* install self-referential address mapping entry(s) */
2269		pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
2270		    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2271
2272		if (pmap_pcid_enabled) {
2273			pmap->pm_pcid = alloc_unr(&pcid_unr);
2274			if (pmap->pm_pcid != -1)
2275				pmap->pm_cr3 |= pmap->pm_pcid;
2276		}
2277	}
2278
2279	pmap->pm_root.rt_root = 0;
2280	CPU_ZERO(&pmap->pm_active);
2281	TAILQ_INIT(&pmap->pm_pvchunk);
2282	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2283	pmap->pm_flags = flags;
2284	pmap->pm_eptgen = 0;
2285	CPU_ZERO(&pmap->pm_save);
2286
2287	return (1);
2288}
2289
2290int
2291pmap_pinit(pmap_t pmap)
2292{
2293
2294	return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
2295}
2296
2297/*
2298 * This routine is called if the desired page table page does not exist.
2299 *
2300 * If page table page allocation fails, this routine may sleep before
2301 * returning NULL.  It sleeps only if a lock pointer was given.
2302 *
2303 * Note: If a page allocation fails at page table level two or three,
2304 * one or two pages may be held during the wait, only to be released
2305 * afterwards.  This conservative approach is easily argued to avoid
2306 * race conditions.
2307 */
2308static vm_page_t
2309_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2310{
2311	vm_page_t m, pdppg, pdpg;
2312	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
2313
2314	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2315
2316	PG_A = pmap_accessed_bit(pmap);
2317	PG_M = pmap_modified_bit(pmap);
2318	PG_V = pmap_valid_bit(pmap);
2319	PG_RW = pmap_rw_bit(pmap);
2320
2321	/*
2322	 * Allocate a page table page.
2323	 */
2324	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
2325	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2326		if (lockp != NULL) {
2327			RELEASE_PV_LIST_LOCK(lockp);
2328			PMAP_UNLOCK(pmap);
2329			rw_runlock(&pvh_global_lock);
2330			VM_WAIT;
2331			rw_rlock(&pvh_global_lock);
2332			PMAP_LOCK(pmap);
2333		}
2334
2335		/*
2336		 * Indicate the need to retry.  While waiting, the page table
2337		 * page may have been allocated.
2338		 */
2339		return (NULL);
2340	}
2341	if ((m->flags & PG_ZERO) == 0)
2342		pmap_zero_page(m);
2343
2344	/*
2345	 * Map the pagetable page into the process address space, if
2346	 * it isn't already there.
2347	 */
2348
2349	if (ptepindex >= (NUPDE + NUPDPE)) {
2350		pml4_entry_t *pml4;
2351		vm_pindex_t pml4index;
2352
2353		/* Wire up a new PDPE page */
2354		pml4index = ptepindex - (NUPDE + NUPDPE);
2355		pml4 = &pmap->pm_pml4[pml4index];
2356		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2357
2358	} else if (ptepindex >= NUPDE) {
2359		vm_pindex_t pml4index;
2360		vm_pindex_t pdpindex;
2361		pml4_entry_t *pml4;
2362		pdp_entry_t *pdp;
2363
2364		/* Wire up a new PDE page */
2365		pdpindex = ptepindex - NUPDE;
2366		pml4index = pdpindex >> NPML4EPGSHIFT;
2367
2368		pml4 = &pmap->pm_pml4[pml4index];
2369		if ((*pml4 & PG_V) == 0) {
2370			/* Have to allocate a new pdp, recurse */
2371			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
2372			    lockp) == NULL) {
2373				--m->wire_count;
2374				atomic_subtract_int(&cnt.v_wire_count, 1);
2375				vm_page_free_zero(m);
2376				return (NULL);
2377			}
2378		} else {
2379			/* Add reference to pdp page */
2380			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
2381			pdppg->wire_count++;
2382		}
2383		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2384
2385		/* Now find the pdp page */
2386		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2387		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2388
2389	} else {
2390		vm_pindex_t pml4index;
2391		vm_pindex_t pdpindex;
2392		pml4_entry_t *pml4;
2393		pdp_entry_t *pdp;
2394		pd_entry_t *pd;
2395
2396		/* Wire up a new PTE page */
2397		pdpindex = ptepindex >> NPDPEPGSHIFT;
2398		pml4index = pdpindex >> NPML4EPGSHIFT;
2399
2400		/* First, find the pdp and check that its valid. */
2401		pml4 = &pmap->pm_pml4[pml4index];
2402		if ((*pml4 & PG_V) == 0) {
2403			/* Have to allocate a new pd, recurse */
2404			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2405			    lockp) == NULL) {
2406				--m->wire_count;
2407				atomic_subtract_int(&cnt.v_wire_count, 1);
2408				vm_page_free_zero(m);
2409				return (NULL);
2410			}
2411			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2412			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2413		} else {
2414			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2415			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2416			if ((*pdp & PG_V) == 0) {
2417				/* Have to allocate a new pd, recurse */
2418				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2419				    lockp) == NULL) {
2420					--m->wire_count;
2421					atomic_subtract_int(&cnt.v_wire_count,
2422					    1);
2423					vm_page_free_zero(m);
2424					return (NULL);
2425				}
2426			} else {
2427				/* Add reference to the pd page */
2428				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
2429				pdpg->wire_count++;
2430			}
2431		}
2432		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
2433
2434		/* Now we know where the page directory page is */
2435		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
2436		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2437	}
2438
2439	pmap_resident_count_inc(pmap, 1);
2440
2441	return (m);
2442}
2443
2444static vm_page_t
2445pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2446{
2447	vm_pindex_t pdpindex, ptepindex;
2448	pdp_entry_t *pdpe, PG_V;
2449	vm_page_t pdpg;
2450
2451	PG_V = pmap_valid_bit(pmap);
2452
2453retry:
2454	pdpe = pmap_pdpe(pmap, va);
2455	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
2456		/* Add a reference to the pd page. */
2457		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
2458		pdpg->wire_count++;
2459	} else {
2460		/* Allocate a pd page. */
2461		ptepindex = pmap_pde_pindex(va);
2462		pdpindex = ptepindex >> NPDPEPGSHIFT;
2463		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
2464		if (pdpg == NULL && lockp != NULL)
2465			goto retry;
2466	}
2467	return (pdpg);
2468}
2469
2470static vm_page_t
2471pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2472{
2473	vm_pindex_t ptepindex;
2474	pd_entry_t *pd, PG_V;
2475	vm_page_t m;
2476
2477	PG_V = pmap_valid_bit(pmap);
2478
2479	/*
2480	 * Calculate pagetable page index
2481	 */
2482	ptepindex = pmap_pde_pindex(va);
2483retry:
2484	/*
2485	 * Get the page directory entry
2486	 */
2487	pd = pmap_pde(pmap, va);
2488
2489	/*
2490	 * This supports switching from a 2MB page to a
2491	 * normal 4K page.
2492	 */
2493	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
2494		if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
2495			/*
2496			 * Invalidation of the 2MB page mapping may have caused
2497			 * the deallocation of the underlying PD page.
2498			 */
2499			pd = NULL;
2500		}
2501	}
2502
2503	/*
2504	 * If the page table page is mapped, we just increment the
2505	 * hold count, and activate it.
2506	 */
2507	if (pd != NULL && (*pd & PG_V) != 0) {
2508		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
2509		m->wire_count++;
2510	} else {
2511		/*
2512		 * Here if the pte page isn't mapped, or if it has been
2513		 * deallocated.
2514		 */
2515		m = _pmap_allocpte(pmap, ptepindex, lockp);
2516		if (m == NULL && lockp != NULL)
2517			goto retry;
2518	}
2519	return (m);
2520}
2521
2522
2523/***************************************************
2524 * Pmap allocation/deallocation routines.
2525 ***************************************************/
2526
2527/*
2528 * Release any resources held by the given physical map.
2529 * Called when a pmap initialized by pmap_pinit is being released.
2530 * Should only be called if the map contains no valid mappings.
2531 */
2532void
2533pmap_release(pmap_t pmap)
2534{
2535	vm_page_t m;
2536	int i;
2537
2538	KASSERT(pmap->pm_stats.resident_count == 0,
2539	    ("pmap_release: pmap resident count %ld != 0",
2540	    pmap->pm_stats.resident_count));
2541	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2542	    ("pmap_release: pmap has reserved page table page(s)"));
2543
2544	if (pmap_pcid_enabled) {
2545		/*
2546		 * Invalidate any left TLB entries, to allow the reuse
2547		 * of the pcid.
2548		 */
2549		pmap_invalidate_all(pmap);
2550	}
2551
2552	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
2553
2554	for (i = 0; i < NKPML4E; i++)	/* KVA */
2555		pmap->pm_pml4[KPML4BASE + i] = 0;
2556	for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
2557		pmap->pm_pml4[DMPML4I + i] = 0;
2558	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
2559
2560	m->wire_count--;
2561	atomic_subtract_int(&cnt.v_wire_count, 1);
2562	vm_page_free_zero(m);
2563	if (pmap->pm_pcid != -1)
2564		free_unr(&pcid_unr, pmap->pm_pcid);
2565}
2566
2567static int
2568kvm_size(SYSCTL_HANDLER_ARGS)
2569{
2570	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2571
2572	return sysctl_handle_long(oidp, &ksize, 0, req);
2573}
2574SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2575    0, 0, kvm_size, "LU", "Size of KVM");
2576
2577static int
2578kvm_free(SYSCTL_HANDLER_ARGS)
2579{
2580	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2581
2582	return sysctl_handle_long(oidp, &kfree, 0, req);
2583}
2584SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2585    0, 0, kvm_free, "LU", "Amount of KVM free");
2586
2587/*
2588 * grow the number of kernel page table entries, if needed
2589 */
2590void
2591pmap_growkernel(vm_offset_t addr)
2592{
2593	vm_paddr_t paddr;
2594	vm_page_t nkpg;
2595	pd_entry_t *pde, newpdir;
2596	pdp_entry_t *pdpe;
2597
2598	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2599
2600	/*
2601	 * Return if "addr" is within the range of kernel page table pages
2602	 * that were preallocated during pmap bootstrap.  Moreover, leave
2603	 * "kernel_vm_end" and the kernel page table as they were.
2604	 *
2605	 * The correctness of this action is based on the following
2606	 * argument: vm_map_insert() allocates contiguous ranges of the
2607	 * kernel virtual address space.  It calls this function if a range
2608	 * ends after "kernel_vm_end".  If the kernel is mapped between
2609	 * "kernel_vm_end" and "addr", then the range cannot begin at
2610	 * "kernel_vm_end".  In fact, its beginning address cannot be less
2611	 * than the kernel.  Thus, there is no immediate need to allocate
2612	 * any new kernel page table pages between "kernel_vm_end" and
2613	 * "KERNBASE".
2614	 */
2615	if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
2616		return;
2617
2618	addr = roundup2(addr, NBPDR);
2619	if (addr - 1 >= kernel_map->max_offset)
2620		addr = kernel_map->max_offset;
2621	while (kernel_vm_end < addr) {
2622		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
2623		if ((*pdpe & X86_PG_V) == 0) {
2624			/* We need a new PDP entry */
2625			nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
2626			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
2627			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2628			if (nkpg == NULL)
2629				panic("pmap_growkernel: no memory to grow kernel");
2630			if ((nkpg->flags & PG_ZERO) == 0)
2631				pmap_zero_page(nkpg);
2632			paddr = VM_PAGE_TO_PHYS(nkpg);
2633			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
2634			    X86_PG_A | X86_PG_M);
2635			continue; /* try again */
2636		}
2637		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
2638		if ((*pde & X86_PG_V) != 0) {
2639			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2640			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2641				kernel_vm_end = kernel_map->max_offset;
2642				break;
2643			}
2644			continue;
2645		}
2646
2647		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
2648		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2649		    VM_ALLOC_ZERO);
2650		if (nkpg == NULL)
2651			panic("pmap_growkernel: no memory to grow kernel");
2652		if ((nkpg->flags & PG_ZERO) == 0)
2653			pmap_zero_page(nkpg);
2654		paddr = VM_PAGE_TO_PHYS(nkpg);
2655		newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2656		pde_store(pde, newpdir);
2657
2658		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2659		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2660			kernel_vm_end = kernel_map->max_offset;
2661			break;
2662		}
2663	}
2664}
2665
2666
2667/***************************************************
2668 * page management routines.
2669 ***************************************************/
2670
2671CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2672CTASSERT(_NPCM == 3);
2673CTASSERT(_NPCPV == 168);
2674
2675static __inline struct pv_chunk *
2676pv_to_chunk(pv_entry_t pv)
2677{
2678
2679	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2680}
2681
2682#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2683
2684#define	PC_FREE0	0xfffffffffffffffful
2685#define	PC_FREE1	0xfffffffffffffffful
2686#define	PC_FREE2	0x000000fffffffffful
2687
2688static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
2689
2690#ifdef PV_STATS
2691static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2692
2693SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2694	"Current number of pv entry chunks");
2695SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2696	"Current number of pv entry chunks allocated");
2697SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2698	"Current number of pv entry chunks frees");
2699SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2700	"Number of times tried to get a chunk page but failed.");
2701
2702static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2703static int pv_entry_spare;
2704
2705SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2706	"Current number of pv entry frees");
2707SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2708	"Current number of pv entry allocs");
2709SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2710	"Current number of pv entries");
2711SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2712	"Current number of spare pv entries");
2713#endif
2714
2715/*
2716 * We are in a serious low memory condition.  Resort to
2717 * drastic measures to free some pages so we can allocate
2718 * another pv entry chunk.
2719 *
2720 * Returns NULL if PV entries were reclaimed from the specified pmap.
2721 *
2722 * We do not, however, unmap 2mpages because subsequent accesses will
2723 * allocate per-page pv entries until repromotion occurs, thereby
2724 * exacerbating the shortage of free pv entries.
2725 */
2726static vm_page_t
2727reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2728{
2729	struct pch new_tail;
2730	struct pv_chunk *pc;
2731	struct md_page *pvh;
2732	pd_entry_t *pde;
2733	pmap_t pmap;
2734	pt_entry_t *pte, tpte;
2735	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
2736	pv_entry_t pv;
2737	vm_offset_t va;
2738	vm_page_t m, m_pc;
2739	struct spglist free;
2740	uint64_t inuse;
2741	int bit, field, freed;
2742
2743	rw_assert(&pvh_global_lock, RA_LOCKED);
2744	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2745	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
2746	pmap = NULL;
2747	m_pc = NULL;
2748	PG_G = PG_A = PG_M = PG_RW = 0;
2749	SLIST_INIT(&free);
2750	TAILQ_INIT(&new_tail);
2751	mtx_lock(&pv_chunks_mutex);
2752	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
2753		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2754		mtx_unlock(&pv_chunks_mutex);
2755		if (pmap != pc->pc_pmap) {
2756			if (pmap != NULL) {
2757				pmap_invalidate_all(pmap);
2758				if (pmap != locked_pmap)
2759					PMAP_UNLOCK(pmap);
2760			}
2761			pmap = pc->pc_pmap;
2762			/* Avoid deadlock and lock recursion. */
2763			if (pmap > locked_pmap) {
2764				RELEASE_PV_LIST_LOCK(lockp);
2765				PMAP_LOCK(pmap);
2766			} else if (pmap != locked_pmap &&
2767			    !PMAP_TRYLOCK(pmap)) {
2768				pmap = NULL;
2769				TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2770				mtx_lock(&pv_chunks_mutex);
2771				continue;
2772			}
2773			PG_G = pmap_global_bit(pmap);
2774			PG_A = pmap_accessed_bit(pmap);
2775			PG_M = pmap_modified_bit(pmap);
2776			PG_RW = pmap_rw_bit(pmap);
2777		}
2778
2779		/*
2780		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2781		 */
2782		freed = 0;
2783		for (field = 0; field < _NPCM; field++) {
2784			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2785			    inuse != 0; inuse &= ~(1UL << bit)) {
2786				bit = bsfq(inuse);
2787				pv = &pc->pc_pventry[field * 64 + bit];
2788				va = pv->pv_va;
2789				pde = pmap_pde(pmap, va);
2790				if ((*pde & PG_PS) != 0)
2791					continue;
2792				pte = pmap_pde_to_pte(pde, va);
2793				if ((*pte & PG_W) != 0)
2794					continue;
2795				tpte = pte_load_clear(pte);
2796				if ((tpte & PG_G) != 0)
2797					pmap_invalidate_page(pmap, va);
2798				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2799				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2800					vm_page_dirty(m);
2801				if ((tpte & PG_A) != 0)
2802					vm_page_aflag_set(m, PGA_REFERENCED);
2803				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2804				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2805				m->md.pv_gen++;
2806				if (TAILQ_EMPTY(&m->md.pv_list) &&
2807				    (m->flags & PG_FICTITIOUS) == 0) {
2808					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2809					if (TAILQ_EMPTY(&pvh->pv_list)) {
2810						vm_page_aflag_clear(m,
2811						    PGA_WRITEABLE);
2812					}
2813				}
2814				pc->pc_map[field] |= 1UL << bit;
2815				pmap_unuse_pt(pmap, va, *pde, &free);
2816				freed++;
2817			}
2818		}
2819		if (freed == 0) {
2820			TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2821			mtx_lock(&pv_chunks_mutex);
2822			continue;
2823		}
2824		/* Every freed mapping is for a 4 KB page. */
2825		pmap_resident_count_dec(pmap, freed);
2826		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
2827		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
2828		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
2829		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2830		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
2831		    pc->pc_map[2] == PC_FREE2) {
2832			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2833			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2834			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2835			/* Entire chunk is free; return it. */
2836			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2837			dump_drop_page(m_pc->phys_addr);
2838			mtx_lock(&pv_chunks_mutex);
2839			break;
2840		}
2841		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2842		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2843		mtx_lock(&pv_chunks_mutex);
2844		/* One freed pv entry in locked_pmap is sufficient. */
2845		if (pmap == locked_pmap)
2846			break;
2847	}
2848	TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2849	mtx_unlock(&pv_chunks_mutex);
2850	if (pmap != NULL) {
2851		pmap_invalidate_all(pmap);
2852		if (pmap != locked_pmap)
2853			PMAP_UNLOCK(pmap);
2854	}
2855	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
2856		m_pc = SLIST_FIRST(&free);
2857		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2858		/* Recycle a freed page table page. */
2859		m_pc->wire_count = 1;
2860		atomic_add_int(&cnt.v_wire_count, 1);
2861	}
2862	pmap_free_zero_pages(&free);
2863	return (m_pc);
2864}
2865
2866/*
2867 * free the pv_entry back to the free list
2868 */
2869static void
2870free_pv_entry(pmap_t pmap, pv_entry_t pv)
2871{
2872	struct pv_chunk *pc;
2873	int idx, field, bit;
2874
2875	rw_assert(&pvh_global_lock, RA_LOCKED);
2876	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2877	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2878	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2879	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2880	pc = pv_to_chunk(pv);
2881	idx = pv - &pc->pc_pventry[0];
2882	field = idx / 64;
2883	bit = idx % 64;
2884	pc->pc_map[field] |= 1ul << bit;
2885	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
2886	    pc->pc_map[2] != PC_FREE2) {
2887		/* 98% of the time, pc is already at the head of the list. */
2888		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2889			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2890			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2891		}
2892		return;
2893	}
2894	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2895	free_pv_chunk(pc);
2896}
2897
2898static void
2899free_pv_chunk(struct pv_chunk *pc)
2900{
2901	vm_page_t m;
2902
2903	mtx_lock(&pv_chunks_mutex);
2904 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2905	mtx_unlock(&pv_chunks_mutex);
2906	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2907	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2908	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2909	/* entire chunk is free, return it */
2910	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2911	dump_drop_page(m->phys_addr);
2912	vm_page_unwire(m, 0);
2913	vm_page_free(m);
2914}
2915
2916/*
2917 * Returns a new PV entry, allocating a new PV chunk from the system when
2918 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
2919 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
2920 * returned.
2921 *
2922 * The given PV list lock may be released.
2923 */
2924static pv_entry_t
2925get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2926{
2927	int bit, field;
2928	pv_entry_t pv;
2929	struct pv_chunk *pc;
2930	vm_page_t m;
2931
2932	rw_assert(&pvh_global_lock, RA_LOCKED);
2933	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2934	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2935retry:
2936	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2937	if (pc != NULL) {
2938		for (field = 0; field < _NPCM; field++) {
2939			if (pc->pc_map[field]) {
2940				bit = bsfq(pc->pc_map[field]);
2941				break;
2942			}
2943		}
2944		if (field < _NPCM) {
2945			pv = &pc->pc_pventry[field * 64 + bit];
2946			pc->pc_map[field] &= ~(1ul << bit);
2947			/* If this was the last item, move it to tail */
2948			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
2949			    pc->pc_map[2] == 0) {
2950				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2951				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2952				    pc_list);
2953			}
2954			PV_STAT(atomic_add_long(&pv_entry_count, 1));
2955			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2956			return (pv);
2957		}
2958	}
2959	/* No free items, allocate another chunk */
2960	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2961	    VM_ALLOC_WIRED);
2962	if (m == NULL) {
2963		if (lockp == NULL) {
2964			PV_STAT(pc_chunk_tryfail++);
2965			return (NULL);
2966		}
2967		m = reclaim_pv_chunk(pmap, lockp);
2968		if (m == NULL)
2969			goto retry;
2970	}
2971	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2972	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2973	dump_add_page(m->phys_addr);
2974	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2975	pc->pc_pmap = pmap;
2976	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
2977	pc->pc_map[1] = PC_FREE1;
2978	pc->pc_map[2] = PC_FREE2;
2979	mtx_lock(&pv_chunks_mutex);
2980	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2981	mtx_unlock(&pv_chunks_mutex);
2982	pv = &pc->pc_pventry[0];
2983	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2984	PV_STAT(atomic_add_long(&pv_entry_count, 1));
2985	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2986	return (pv);
2987}
2988
2989/*
2990 * Returns the number of one bits within the given PV chunk map element.
2991 */
2992static int
2993popcnt_pc_map_elem(uint64_t elem)
2994{
2995	int count;
2996
2997	/*
2998	 * This simple method of counting the one bits performs well because
2999	 * the given element typically contains more zero bits than one bits.
3000	 */
3001	count = 0;
3002	for (; elem != 0; elem &= elem - 1)
3003		count++;
3004	return (count);
3005}
3006
3007/*
3008 * Ensure that the number of spare PV entries in the specified pmap meets or
3009 * exceeds the given count, "needed".
3010 *
3011 * The given PV list lock may be released.
3012 */
3013static void
3014reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3015{
3016	struct pch new_tail;
3017	struct pv_chunk *pc;
3018	int avail, free;
3019	vm_page_t m;
3020
3021	rw_assert(&pvh_global_lock, RA_LOCKED);
3022	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3023	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3024
3025	/*
3026	 * Newly allocated PV chunks must be stored in a private list until
3027	 * the required number of PV chunks have been allocated.  Otherwise,
3028	 * reclaim_pv_chunk() could recycle one of these chunks.  In
3029	 * contrast, these chunks must be added to the pmap upon allocation.
3030	 */
3031	TAILQ_INIT(&new_tail);
3032retry:
3033	avail = 0;
3034	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3035		if ((cpu_feature2 & CPUID2_POPCNT) == 0) {
3036			free = popcnt_pc_map_elem(pc->pc_map[0]);
3037			free += popcnt_pc_map_elem(pc->pc_map[1]);
3038			free += popcnt_pc_map_elem(pc->pc_map[2]);
3039		} else {
3040			free = popcntq(pc->pc_map[0]);
3041			free += popcntq(pc->pc_map[1]);
3042			free += popcntq(pc->pc_map[2]);
3043		}
3044		if (free == 0)
3045			break;
3046		avail += free;
3047		if (avail >= needed)
3048			break;
3049	}
3050	for (; avail < needed; avail += _NPCPV) {
3051		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3052		    VM_ALLOC_WIRED);
3053		if (m == NULL) {
3054			m = reclaim_pv_chunk(pmap, lockp);
3055			if (m == NULL)
3056				goto retry;
3057		}
3058		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3059		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3060		dump_add_page(m->phys_addr);
3061		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3062		pc->pc_pmap = pmap;
3063		pc->pc_map[0] = PC_FREE0;
3064		pc->pc_map[1] = PC_FREE1;
3065		pc->pc_map[2] = PC_FREE2;
3066		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3067		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
3068		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3069	}
3070	if (!TAILQ_EMPTY(&new_tail)) {
3071		mtx_lock(&pv_chunks_mutex);
3072		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
3073		mtx_unlock(&pv_chunks_mutex);
3074	}
3075}
3076
3077/*
3078 * First find and then remove the pv entry for the specified pmap and virtual
3079 * address from the specified pv list.  Returns the pv entry if found and NULL
3080 * otherwise.  This operation can be performed on pv lists for either 4KB or
3081 * 2MB page mappings.
3082 */
3083static __inline pv_entry_t
3084pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3085{
3086	pv_entry_t pv;
3087
3088	rw_assert(&pvh_global_lock, RA_LOCKED);
3089	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3090		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3091			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3092			pvh->pv_gen++;
3093			break;
3094		}
3095	}
3096	return (pv);
3097}
3098
3099/*
3100 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3101 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3102 * entries for each of the 4KB page mappings.
3103 */
3104static void
3105pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3106    struct rwlock **lockp)
3107{
3108	struct md_page *pvh;
3109	struct pv_chunk *pc;
3110	pv_entry_t pv;
3111	vm_offset_t va_last;
3112	vm_page_t m;
3113	int bit, field;
3114
3115	rw_assert(&pvh_global_lock, RA_LOCKED);
3116	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3117	KASSERT((pa & PDRMASK) == 0,
3118	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
3119	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3120
3121	/*
3122	 * Transfer the 2mpage's pv entry for this mapping to the first
3123	 * page's pv list.  Once this transfer begins, the pv list lock
3124	 * must not be released until the last pv entry is reinstantiated.
3125	 */
3126	pvh = pa_to_pvh(pa);
3127	va = trunc_2mpage(va);
3128	pv = pmap_pvh_remove(pvh, pmap, va);
3129	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
3130	m = PHYS_TO_VM_PAGE(pa);
3131	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3132	m->md.pv_gen++;
3133	/* Instantiate the remaining NPTEPG - 1 pv entries. */
3134	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
3135	va_last = va + NBPDR - PAGE_SIZE;
3136	for (;;) {
3137		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3138		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
3139		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
3140		for (field = 0; field < _NPCM; field++) {
3141			while (pc->pc_map[field]) {
3142				bit = bsfq(pc->pc_map[field]);
3143				pc->pc_map[field] &= ~(1ul << bit);
3144				pv = &pc->pc_pventry[field * 64 + bit];
3145				va += PAGE_SIZE;
3146				pv->pv_va = va;
3147				m++;
3148				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3149			    ("pmap_pv_demote_pde: page %p is not managed", m));
3150				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3151				m->md.pv_gen++;
3152				if (va == va_last)
3153					goto out;
3154			}
3155		}
3156		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3157		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3158	}
3159out:
3160	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
3161		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3162		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3163	}
3164	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
3165	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
3166}
3167
3168/*
3169 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
3170 * replace the many pv entries for the 4KB page mappings by a single pv entry
3171 * for the 2MB page mapping.
3172 */
3173static void
3174pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3175    struct rwlock **lockp)
3176{
3177	struct md_page *pvh;
3178	pv_entry_t pv;
3179	vm_offset_t va_last;
3180	vm_page_t m;
3181
3182	rw_assert(&pvh_global_lock, RA_LOCKED);
3183	KASSERT((pa & PDRMASK) == 0,
3184	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
3185	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3186
3187	/*
3188	 * Transfer the first page's pv entry for this mapping to the 2mpage's
3189	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
3190	 * a transfer avoids the possibility that get_pv_entry() calls
3191	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
3192	 * mappings that is being promoted.
3193	 */
3194	m = PHYS_TO_VM_PAGE(pa);
3195	va = trunc_2mpage(va);
3196	pv = pmap_pvh_remove(&m->md, pmap, va);
3197	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
3198	pvh = pa_to_pvh(pa);
3199	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3200	pvh->pv_gen++;
3201	/* Free the remaining NPTEPG - 1 pv entries. */
3202	va_last = va + NBPDR - PAGE_SIZE;
3203	do {
3204		m++;
3205		va += PAGE_SIZE;
3206		pmap_pvh_free(&m->md, pmap, va);
3207	} while (va < va_last);
3208}
3209
3210/*
3211 * First find and then destroy the pv entry for the specified pmap and virtual
3212 * address.  This operation can be performed on pv lists for either 4KB or 2MB
3213 * page mappings.
3214 */
3215static void
3216pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3217{
3218	pv_entry_t pv;
3219
3220	pv = pmap_pvh_remove(pvh, pmap, va);
3221	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3222	free_pv_entry(pmap, pv);
3223}
3224
3225/*
3226 * Conditionally create the PV entry for a 4KB page mapping if the required
3227 * memory can be allocated without resorting to reclamation.
3228 */
3229static boolean_t
3230pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3231    struct rwlock **lockp)
3232{
3233	pv_entry_t pv;
3234
3235	rw_assert(&pvh_global_lock, RA_LOCKED);
3236	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3237	/* Pass NULL instead of the lock pointer to disable reclamation. */
3238	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3239		pv->pv_va = va;
3240		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3241		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3242		m->md.pv_gen++;
3243		return (TRUE);
3244	} else
3245		return (FALSE);
3246}
3247
3248/*
3249 * Conditionally create the PV entry for a 2MB page mapping if the required
3250 * memory can be allocated without resorting to reclamation.
3251 */
3252static boolean_t
3253pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3254    struct rwlock **lockp)
3255{
3256	struct md_page *pvh;
3257	pv_entry_t pv;
3258
3259	rw_assert(&pvh_global_lock, RA_LOCKED);
3260	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3261	/* Pass NULL instead of the lock pointer to disable reclamation. */
3262	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3263		pv->pv_va = va;
3264		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3265		pvh = pa_to_pvh(pa);
3266		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3267		pvh->pv_gen++;
3268		return (TRUE);
3269	} else
3270		return (FALSE);
3271}
3272
3273/*
3274 * Fills a page table page with mappings to consecutive physical pages.
3275 */
3276static void
3277pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
3278{
3279	pt_entry_t *pte;
3280
3281	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
3282		*pte = newpte;
3283		newpte += PAGE_SIZE;
3284	}
3285}
3286
3287/*
3288 * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
3289 * mapping is invalidated.
3290 */
3291static boolean_t
3292pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3293{
3294	struct rwlock *lock;
3295	boolean_t rv;
3296
3297	lock = NULL;
3298	rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
3299	if (lock != NULL)
3300		rw_wunlock(lock);
3301	return (rv);
3302}
3303
3304static boolean_t
3305pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3306    struct rwlock **lockp)
3307{
3308	pd_entry_t newpde, oldpde;
3309	pt_entry_t *firstpte, newpte;
3310	pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
3311	vm_paddr_t mptepa;
3312	vm_page_t mpte;
3313	struct spglist free;
3314	int PG_PTE_CACHE;
3315
3316	PG_G = pmap_global_bit(pmap);
3317	PG_A = pmap_accessed_bit(pmap);
3318	PG_M = pmap_modified_bit(pmap);
3319	PG_RW = pmap_rw_bit(pmap);
3320	PG_V = pmap_valid_bit(pmap);
3321	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
3322
3323	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3324	oldpde = *pde;
3325	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
3326	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
3327	if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
3328	    NULL)
3329		pmap_remove_pt_page(pmap, mpte);
3330	else {
3331		KASSERT((oldpde & PG_W) == 0,
3332		    ("pmap_demote_pde: page table page for a wired mapping"
3333		    " is missing"));
3334
3335		/*
3336		 * Invalidate the 2MB page mapping and return "failure" if the
3337		 * mapping was never accessed or the allocation of the new
3338		 * page table page fails.  If the 2MB page mapping belongs to
3339		 * the direct map region of the kernel's address space, then
3340		 * the page allocation request specifies the highest possible
3341		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
3342		 * normal.  Page table pages are preallocated for every other
3343		 * part of the kernel address space, so the direct map region
3344		 * is the only part of the kernel address space that must be
3345		 * handled here.
3346		 */
3347		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
3348		    pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
3349		    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
3350		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
3351			SLIST_INIT(&free);
3352			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
3353			    lockp);
3354			pmap_invalidate_page(pmap, trunc_2mpage(va));
3355			pmap_free_zero_pages(&free);
3356			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
3357			    " in pmap %p", va, pmap);
3358			return (FALSE);
3359		}
3360		if (va < VM_MAXUSER_ADDRESS)
3361			pmap_resident_count_inc(pmap, 1);
3362	}
3363	mptepa = VM_PAGE_TO_PHYS(mpte);
3364	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
3365	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
3366	KASSERT((oldpde & PG_A) != 0,
3367	    ("pmap_demote_pde: oldpde is missing PG_A"));
3368	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
3369	    ("pmap_demote_pde: oldpde is missing PG_M"));
3370	newpte = oldpde & ~PG_PS;
3371	newpte = pmap_swap_pat(pmap, newpte);
3372
3373	/*
3374	 * If the page table page is new, initialize it.
3375	 */
3376	if (mpte->wire_count == 1) {
3377		mpte->wire_count = NPTEPG;
3378		pmap_fill_ptp(firstpte, newpte);
3379	}
3380	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
3381	    ("pmap_demote_pde: firstpte and newpte map different physical"
3382	    " addresses"));
3383
3384	/*
3385	 * If the mapping has changed attributes, update the page table
3386	 * entries.
3387	 */
3388	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
3389		pmap_fill_ptp(firstpte, newpte);
3390
3391	/*
3392	 * The spare PV entries must be reserved prior to demoting the
3393	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
3394	 * of the PDE and the PV lists will be inconsistent, which can result
3395	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
3396	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
3397	 * PV entry for the 2MB page mapping that is being demoted.
3398	 */
3399	if ((oldpde & PG_MANAGED) != 0)
3400		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
3401
3402	/*
3403	 * Demote the mapping.  This pmap is locked.  The old PDE has
3404	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
3405	 * set.  Thus, there is no danger of a race with another
3406	 * processor changing the setting of PG_A and/or PG_M between
3407	 * the read above and the store below.
3408	 */
3409	if (workaround_erratum383)
3410		pmap_update_pde(pmap, va, pde, newpde);
3411	else
3412		pde_store(pde, newpde);
3413
3414	/*
3415	 * Invalidate a stale recursive mapping of the page table page.
3416	 */
3417	if (va >= VM_MAXUSER_ADDRESS)
3418		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3419
3420	/*
3421	 * Demote the PV entry.
3422	 */
3423	if ((oldpde & PG_MANAGED) != 0)
3424		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
3425
3426	atomic_add_long(&pmap_pde_demotions, 1);
3427	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
3428	    " in pmap %p", va, pmap);
3429	return (TRUE);
3430}
3431
3432/*
3433 * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
3434 */
3435static void
3436pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3437{
3438	pd_entry_t newpde;
3439	vm_paddr_t mptepa;
3440	vm_page_t mpte;
3441
3442	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3443	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3444	mpte = pmap_lookup_pt_page(pmap, va);
3445	if (mpte == NULL)
3446		panic("pmap_remove_kernel_pde: Missing pt page.");
3447
3448	pmap_remove_pt_page(pmap, mpte);
3449	mptepa = VM_PAGE_TO_PHYS(mpte);
3450	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
3451
3452	/*
3453	 * Initialize the page table page.
3454	 */
3455	pagezero((void *)PHYS_TO_DMAP(mptepa));
3456
3457	/*
3458	 * Demote the mapping.
3459	 */
3460	if (workaround_erratum383)
3461		pmap_update_pde(pmap, va, pde, newpde);
3462	else
3463		pde_store(pde, newpde);
3464
3465	/*
3466	 * Invalidate a stale recursive mapping of the page table page.
3467	 */
3468	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3469}
3470
3471/*
3472 * pmap_remove_pde: do the things to unmap a superpage in a process
3473 */
3474static int
3475pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
3476    struct spglist *free, struct rwlock **lockp)
3477{
3478	struct md_page *pvh;
3479	pd_entry_t oldpde;
3480	vm_offset_t eva, va;
3481	vm_page_t m, mpte;
3482	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
3483
3484	PG_G = pmap_global_bit(pmap);
3485	PG_A = pmap_accessed_bit(pmap);
3486	PG_M = pmap_modified_bit(pmap);
3487	PG_RW = pmap_rw_bit(pmap);
3488
3489	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3490	KASSERT((sva & PDRMASK) == 0,
3491	    ("pmap_remove_pde: sva is not 2mpage aligned"));
3492	oldpde = pte_load_clear(pdq);
3493	if (oldpde & PG_W)
3494		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
3495
3496	/*
3497	 * Machines that don't support invlpg, also don't support
3498	 * PG_G.
3499	 */
3500	if (oldpde & PG_G)
3501		pmap_invalidate_page(kernel_pmap, sva);
3502	pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
3503	if (oldpde & PG_MANAGED) {
3504		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
3505		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
3506		pmap_pvh_free(pvh, pmap, sva);
3507		eva = sva + NBPDR;
3508		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3509		    va < eva; va += PAGE_SIZE, m++) {
3510			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3511				vm_page_dirty(m);
3512			if (oldpde & PG_A)
3513				vm_page_aflag_set(m, PGA_REFERENCED);
3514			if (TAILQ_EMPTY(&m->md.pv_list) &&
3515			    TAILQ_EMPTY(&pvh->pv_list))
3516				vm_page_aflag_clear(m, PGA_WRITEABLE);
3517		}
3518	}
3519	if (pmap == kernel_pmap) {
3520		pmap_remove_kernel_pde(pmap, pdq, sva);
3521	} else {
3522		mpte = pmap_lookup_pt_page(pmap, sva);
3523		if (mpte != NULL) {
3524			pmap_remove_pt_page(pmap, mpte);
3525			pmap_resident_count_dec(pmap, 1);
3526			KASSERT(mpte->wire_count == NPTEPG,
3527			    ("pmap_remove_pde: pte page wire count error"));
3528			mpte->wire_count = 0;
3529			pmap_add_delayed_free_list(mpte, free, FALSE);
3530			atomic_subtract_int(&cnt.v_wire_count, 1);
3531		}
3532	}
3533	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
3534}
3535
3536/*
3537 * pmap_remove_pte: do the things to unmap a page in a process
3538 */
3539static int
3540pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
3541    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
3542{
3543	struct md_page *pvh;
3544	pt_entry_t oldpte, PG_A, PG_M, PG_RW;
3545	vm_page_t m;
3546
3547	PG_A = pmap_accessed_bit(pmap);
3548	PG_M = pmap_modified_bit(pmap);
3549	PG_RW = pmap_rw_bit(pmap);
3550
3551	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3552	oldpte = pte_load_clear(ptq);
3553	if (oldpte & PG_W)
3554		pmap->pm_stats.wired_count -= 1;
3555	pmap_resident_count_dec(pmap, 1);
3556	if (oldpte & PG_MANAGED) {
3557		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
3558		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3559			vm_page_dirty(m);
3560		if (oldpte & PG_A)
3561			vm_page_aflag_set(m, PGA_REFERENCED);
3562		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3563		pmap_pvh_free(&m->md, pmap, va);
3564		if (TAILQ_EMPTY(&m->md.pv_list) &&
3565		    (m->flags & PG_FICTITIOUS) == 0) {
3566			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3567			if (TAILQ_EMPTY(&pvh->pv_list))
3568				vm_page_aflag_clear(m, PGA_WRITEABLE);
3569		}
3570	}
3571	return (pmap_unuse_pt(pmap, va, ptepde, free));
3572}
3573
3574/*
3575 * Remove a single page from a process address space
3576 */
3577static void
3578pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
3579    struct spglist *free)
3580{
3581	struct rwlock *lock;
3582	pt_entry_t *pte, PG_V;
3583
3584	PG_V = pmap_valid_bit(pmap);
3585	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3586	if ((*pde & PG_V) == 0)
3587		return;
3588	pte = pmap_pde_to_pte(pde, va);
3589	if ((*pte & PG_V) == 0)
3590		return;
3591	lock = NULL;
3592	pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
3593	if (lock != NULL)
3594		rw_wunlock(lock);
3595	pmap_invalidate_page(pmap, va);
3596}
3597
3598/*
3599 *	Remove the given range of addresses from the specified map.
3600 *
3601 *	It is assumed that the start and end are properly
3602 *	rounded to the page size.
3603 */
3604void
3605pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3606{
3607	struct rwlock *lock;
3608	vm_offset_t va, va_next;
3609	pml4_entry_t *pml4e;
3610	pdp_entry_t *pdpe;
3611	pd_entry_t ptpaddr, *pde;
3612	pt_entry_t *pte, PG_G, PG_V;
3613	struct spglist free;
3614	int anyvalid;
3615
3616	PG_G = pmap_global_bit(pmap);
3617	PG_V = pmap_valid_bit(pmap);
3618
3619	/*
3620	 * Perform an unsynchronized read.  This is, however, safe.
3621	 */
3622	if (pmap->pm_stats.resident_count == 0)
3623		return;
3624
3625	anyvalid = 0;
3626	SLIST_INIT(&free);
3627
3628	rw_rlock(&pvh_global_lock);
3629	PMAP_LOCK(pmap);
3630
3631	/*
3632	 * special handling of removing one page.  a very
3633	 * common operation and easy to short circuit some
3634	 * code.
3635	 */
3636	if (sva + PAGE_SIZE == eva) {
3637		pde = pmap_pde(pmap, sva);
3638		if (pde && (*pde & PG_PS) == 0) {
3639			pmap_remove_page(pmap, sva, pde, &free);
3640			goto out;
3641		}
3642	}
3643
3644	lock = NULL;
3645	for (; sva < eva; sva = va_next) {
3646
3647		if (pmap->pm_stats.resident_count == 0)
3648			break;
3649
3650		pml4e = pmap_pml4e(pmap, sva);
3651		if ((*pml4e & PG_V) == 0) {
3652			va_next = (sva + NBPML4) & ~PML4MASK;
3653			if (va_next < sva)
3654				va_next = eva;
3655			continue;
3656		}
3657
3658		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3659		if ((*pdpe & PG_V) == 0) {
3660			va_next = (sva + NBPDP) & ~PDPMASK;
3661			if (va_next < sva)
3662				va_next = eva;
3663			continue;
3664		}
3665
3666		/*
3667		 * Calculate index for next page table.
3668		 */
3669		va_next = (sva + NBPDR) & ~PDRMASK;
3670		if (va_next < sva)
3671			va_next = eva;
3672
3673		pde = pmap_pdpe_to_pde(pdpe, sva);
3674		ptpaddr = *pde;
3675
3676		/*
3677		 * Weed out invalid mappings.
3678		 */
3679		if (ptpaddr == 0)
3680			continue;
3681
3682		/*
3683		 * Check for large page.
3684		 */
3685		if ((ptpaddr & PG_PS) != 0) {
3686			/*
3687			 * Are we removing the entire large page?  If not,
3688			 * demote the mapping and fall through.
3689			 */
3690			if (sva + NBPDR == va_next && eva >= va_next) {
3691				/*
3692				 * The TLB entry for a PG_G mapping is
3693				 * invalidated by pmap_remove_pde().
3694				 */
3695				if ((ptpaddr & PG_G) == 0)
3696					anyvalid = 1;
3697				pmap_remove_pde(pmap, pde, sva, &free, &lock);
3698				continue;
3699			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
3700			    &lock)) {
3701				/* The large page mapping was destroyed. */
3702				continue;
3703			} else
3704				ptpaddr = *pde;
3705		}
3706
3707		/*
3708		 * Limit our scan to either the end of the va represented
3709		 * by the current page table page, or to the end of the
3710		 * range being removed.
3711		 */
3712		if (va_next > eva)
3713			va_next = eva;
3714
3715		va = va_next;
3716		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3717		    sva += PAGE_SIZE) {
3718			if (*pte == 0) {
3719				if (va != va_next) {
3720					pmap_invalidate_range(pmap, va, sva);
3721					va = va_next;
3722				}
3723				continue;
3724			}
3725			if ((*pte & PG_G) == 0)
3726				anyvalid = 1;
3727			else if (va == va_next)
3728				va = sva;
3729			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
3730			    &lock)) {
3731				sva += PAGE_SIZE;
3732				break;
3733			}
3734		}
3735		if (va != va_next)
3736			pmap_invalidate_range(pmap, va, sva);
3737	}
3738	if (lock != NULL)
3739		rw_wunlock(lock);
3740out:
3741	if (anyvalid)
3742		pmap_invalidate_all(pmap);
3743	rw_runlock(&pvh_global_lock);
3744	PMAP_UNLOCK(pmap);
3745	pmap_free_zero_pages(&free);
3746}
3747
3748/*
3749 *	Routine:	pmap_remove_all
3750 *	Function:
3751 *		Removes this physical page from
3752 *		all physical maps in which it resides.
3753 *		Reflects back modify bits to the pager.
3754 *
3755 *	Notes:
3756 *		Original versions of this routine were very
3757 *		inefficient because they iteratively called
3758 *		pmap_remove (slow...)
3759 */
3760
3761void
3762pmap_remove_all(vm_page_t m)
3763{
3764	struct md_page *pvh;
3765	pv_entry_t pv;
3766	pmap_t pmap;
3767	pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
3768	pd_entry_t *pde;
3769	vm_offset_t va;
3770	struct spglist free;
3771
3772	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3773	    ("pmap_remove_all: page %p is not managed", m));
3774	SLIST_INIT(&free);
3775	rw_wlock(&pvh_global_lock);
3776	if ((m->flags & PG_FICTITIOUS) != 0)
3777		goto small_mappings;
3778	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3779	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3780		pmap = PV_PMAP(pv);
3781		PMAP_LOCK(pmap);
3782		va = pv->pv_va;
3783		pde = pmap_pde(pmap, va);
3784		(void)pmap_demote_pde(pmap, pde, va);
3785		PMAP_UNLOCK(pmap);
3786	}
3787small_mappings:
3788	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3789		pmap = PV_PMAP(pv);
3790		PMAP_LOCK(pmap);
3791		PG_A = pmap_accessed_bit(pmap);
3792		PG_M = pmap_modified_bit(pmap);
3793		PG_RW = pmap_rw_bit(pmap);
3794		pmap_resident_count_dec(pmap, 1);
3795		pde = pmap_pde(pmap, pv->pv_va);
3796		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3797		    " a 2mpage in page %p's pv list", m));
3798		pte = pmap_pde_to_pte(pde, pv->pv_va);
3799		tpte = pte_load_clear(pte);
3800		if (tpte & PG_W)
3801			pmap->pm_stats.wired_count--;
3802		if (tpte & PG_A)
3803			vm_page_aflag_set(m, PGA_REFERENCED);
3804
3805		/*
3806		 * Update the vm_page_t clean and reference bits.
3807		 */
3808		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3809			vm_page_dirty(m);
3810		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
3811		pmap_invalidate_page(pmap, pv->pv_va);
3812		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3813		m->md.pv_gen++;
3814		free_pv_entry(pmap, pv);
3815		PMAP_UNLOCK(pmap);
3816	}
3817	vm_page_aflag_clear(m, PGA_WRITEABLE);
3818	rw_wunlock(&pvh_global_lock);
3819	pmap_free_zero_pages(&free);
3820}
3821
3822/*
3823 * pmap_protect_pde: do the things to protect a 2mpage in a process
3824 */
3825static boolean_t
3826pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3827{
3828	pd_entry_t newpde, oldpde;
3829	vm_offset_t eva, va;
3830	vm_page_t m;
3831	boolean_t anychanged;
3832	pt_entry_t PG_G, PG_M, PG_RW;
3833
3834	PG_G = pmap_global_bit(pmap);
3835	PG_M = pmap_modified_bit(pmap);
3836	PG_RW = pmap_rw_bit(pmap);
3837
3838	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3839	KASSERT((sva & PDRMASK) == 0,
3840	    ("pmap_protect_pde: sva is not 2mpage aligned"));
3841	anychanged = FALSE;
3842retry:
3843	oldpde = newpde = *pde;
3844	if (oldpde & PG_MANAGED) {
3845		eva = sva + NBPDR;
3846		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3847		    va < eva; va += PAGE_SIZE, m++)
3848			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3849				vm_page_dirty(m);
3850	}
3851	if ((prot & VM_PROT_WRITE) == 0)
3852		newpde &= ~(PG_RW | PG_M);
3853	if ((prot & VM_PROT_EXECUTE) == 0)
3854		newpde |= pg_nx;
3855	if (newpde != oldpde) {
3856		if (!atomic_cmpset_long(pde, oldpde, newpde))
3857			goto retry;
3858		if (oldpde & PG_G)
3859			pmap_invalidate_page(pmap, sva);
3860		else
3861			anychanged = TRUE;
3862	}
3863	return (anychanged);
3864}
3865
3866/*
3867 *	Set the physical protection on the
3868 *	specified range of this map as requested.
3869 */
3870void
3871pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3872{
3873	vm_offset_t va_next;
3874	pml4_entry_t *pml4e;
3875	pdp_entry_t *pdpe;
3876	pd_entry_t ptpaddr, *pde;
3877	pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
3878	boolean_t anychanged, pv_lists_locked;
3879
3880	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
3881	if (prot == VM_PROT_NONE) {
3882		pmap_remove(pmap, sva, eva);
3883		return;
3884	}
3885
3886	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3887	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3888		return;
3889
3890	PG_G = pmap_global_bit(pmap);
3891	PG_M = pmap_modified_bit(pmap);
3892	PG_V = pmap_valid_bit(pmap);
3893	PG_RW = pmap_rw_bit(pmap);
3894	pv_lists_locked = FALSE;
3895resume:
3896	anychanged = FALSE;
3897
3898	PMAP_LOCK(pmap);
3899	for (; sva < eva; sva = va_next) {
3900
3901		pml4e = pmap_pml4e(pmap, sva);
3902		if ((*pml4e & PG_V) == 0) {
3903			va_next = (sva + NBPML4) & ~PML4MASK;
3904			if (va_next < sva)
3905				va_next = eva;
3906			continue;
3907		}
3908
3909		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3910		if ((*pdpe & PG_V) == 0) {
3911			va_next = (sva + NBPDP) & ~PDPMASK;
3912			if (va_next < sva)
3913				va_next = eva;
3914			continue;
3915		}
3916
3917		va_next = (sva + NBPDR) & ~PDRMASK;
3918		if (va_next < sva)
3919			va_next = eva;
3920
3921		pde = pmap_pdpe_to_pde(pdpe, sva);
3922		ptpaddr = *pde;
3923
3924		/*
3925		 * Weed out invalid mappings.
3926		 */
3927		if (ptpaddr == 0)
3928			continue;
3929
3930		/*
3931		 * Check for large page.
3932		 */
3933		if ((ptpaddr & PG_PS) != 0) {
3934			/*
3935			 * Are we protecting the entire large page?  If not,
3936			 * demote the mapping and fall through.
3937			 */
3938			if (sva + NBPDR == va_next && eva >= va_next) {
3939				/*
3940				 * The TLB entry for a PG_G mapping is
3941				 * invalidated by pmap_protect_pde().
3942				 */
3943				if (pmap_protect_pde(pmap, pde, sva, prot))
3944					anychanged = TRUE;
3945				continue;
3946			} else {
3947				if (!pv_lists_locked) {
3948					pv_lists_locked = TRUE;
3949					if (!rw_try_rlock(&pvh_global_lock)) {
3950						if (anychanged)
3951							pmap_invalidate_all(
3952							    pmap);
3953						PMAP_UNLOCK(pmap);
3954						rw_rlock(&pvh_global_lock);
3955						goto resume;
3956					}
3957				}
3958				if (!pmap_demote_pde(pmap, pde, sva)) {
3959					/*
3960					 * The large page mapping was
3961					 * destroyed.
3962					 */
3963					continue;
3964				}
3965			}
3966		}
3967
3968		if (va_next > eva)
3969			va_next = eva;
3970
3971		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3972		    sva += PAGE_SIZE) {
3973			pt_entry_t obits, pbits;
3974			vm_page_t m;
3975
3976retry:
3977			obits = pbits = *pte;
3978			if ((pbits & PG_V) == 0)
3979				continue;
3980
3981			if ((prot & VM_PROT_WRITE) == 0) {
3982				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3983				    (PG_MANAGED | PG_M | PG_RW)) {
3984					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3985					vm_page_dirty(m);
3986				}
3987				pbits &= ~(PG_RW | PG_M);
3988			}
3989			if ((prot & VM_PROT_EXECUTE) == 0)
3990				pbits |= pg_nx;
3991
3992			if (pbits != obits) {
3993				if (!atomic_cmpset_long(pte, obits, pbits))
3994					goto retry;
3995				if (obits & PG_G)
3996					pmap_invalidate_page(pmap, sva);
3997				else
3998					anychanged = TRUE;
3999			}
4000		}
4001	}
4002	if (anychanged)
4003		pmap_invalidate_all(pmap);
4004	if (pv_lists_locked)
4005		rw_runlock(&pvh_global_lock);
4006	PMAP_UNLOCK(pmap);
4007}
4008
4009/*
4010 * Tries to promote the 512, contiguous 4KB page mappings that are within a
4011 * single page table page (PTP) to a single 2MB page mapping.  For promotion
4012 * to occur, two conditions must be met: (1) the 4KB page mappings must map
4013 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4014 * identical characteristics.
4015 */
4016static void
4017pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
4018    struct rwlock **lockp)
4019{
4020	pd_entry_t newpde;
4021	pt_entry_t *firstpte, oldpte, pa, *pte;
4022	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
4023	vm_page_t mpte;
4024	int PG_PTE_CACHE;
4025
4026	PG_A = pmap_accessed_bit(pmap);
4027	PG_G = pmap_global_bit(pmap);
4028	PG_M = pmap_modified_bit(pmap);
4029	PG_V = pmap_valid_bit(pmap);
4030	PG_RW = pmap_rw_bit(pmap);
4031	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
4032
4033	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4034
4035	/*
4036	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
4037	 * either invalid, unused, or does not map the first 4KB physical page
4038	 * within a 2MB page.
4039	 */
4040	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
4041setpde:
4042	newpde = *firstpte;
4043	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
4044		atomic_add_long(&pmap_pde_p_failures, 1);
4045		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4046		    " in pmap %p", va, pmap);
4047		return;
4048	}
4049	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
4050		/*
4051		 * When PG_M is already clear, PG_RW can be cleared without
4052		 * a TLB invalidation.
4053		 */
4054		if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
4055			goto setpde;
4056		newpde &= ~PG_RW;
4057	}
4058
4059	/*
4060	 * Examine each of the other PTEs in the specified PTP.  Abort if this
4061	 * PTE maps an unexpected 4KB physical page or does not have identical
4062	 * characteristics to the first PTE.
4063	 */
4064	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
4065	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
4066setpte:
4067		oldpte = *pte;
4068		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
4069			atomic_add_long(&pmap_pde_p_failures, 1);
4070			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4071			    " in pmap %p", va, pmap);
4072			return;
4073		}
4074		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
4075			/*
4076			 * When PG_M is already clear, PG_RW can be cleared
4077			 * without a TLB invalidation.
4078			 */
4079			if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
4080				goto setpte;
4081			oldpte &= ~PG_RW;
4082			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
4083			    " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
4084			    (va & ~PDRMASK), pmap);
4085		}
4086		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
4087			atomic_add_long(&pmap_pde_p_failures, 1);
4088			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4089			    " in pmap %p", va, pmap);
4090			return;
4091		}
4092		pa -= PAGE_SIZE;
4093	}
4094
4095	/*
4096	 * Save the page table page in its current state until the PDE
4097	 * mapping the superpage is demoted by pmap_demote_pde() or
4098	 * destroyed by pmap_remove_pde().
4099	 */
4100	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4101	KASSERT(mpte >= vm_page_array &&
4102	    mpte < &vm_page_array[vm_page_array_size],
4103	    ("pmap_promote_pde: page table page is out of range"));
4104	KASSERT(mpte->pindex == pmap_pde_pindex(va),
4105	    ("pmap_promote_pde: page table page's pindex is wrong"));
4106	if (pmap_insert_pt_page(pmap, mpte)) {
4107		atomic_add_long(&pmap_pde_p_failures, 1);
4108		CTR2(KTR_PMAP,
4109		    "pmap_promote_pde: failure for va %#lx in pmap %p", va,
4110		    pmap);
4111		return;
4112	}
4113
4114	/*
4115	 * Promote the pv entries.
4116	 */
4117	if ((newpde & PG_MANAGED) != 0)
4118		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
4119
4120	/*
4121	 * Propagate the PAT index to its proper position.
4122	 */
4123	newpde = pmap_swap_pat(pmap, newpde);
4124
4125	/*
4126	 * Map the superpage.
4127	 */
4128	if (workaround_erratum383)
4129		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
4130	else
4131		pde_store(pde, PG_PS | newpde);
4132
4133	atomic_add_long(&pmap_pde_promotions, 1);
4134	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
4135	    " in pmap %p", va, pmap);
4136}
4137
4138/*
4139 *	Insert the given physical page (p) at
4140 *	the specified virtual address (v) in the
4141 *	target physical map with the protection requested.
4142 *
4143 *	If specified, the page will be wired down, meaning
4144 *	that the related pte can not be reclaimed.
4145 *
4146 *	NB:  This is the only routine which MAY NOT lazy-evaluate
4147 *	or lose information.  That is, this routine must actually
4148 *	insert this page into the given map NOW.
4149 */
4150int
4151pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4152    u_int flags, int8_t psind __unused)
4153{
4154	struct rwlock *lock;
4155	pd_entry_t *pde;
4156	pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
4157	pt_entry_t newpte, origpte;
4158	pv_entry_t pv;
4159	vm_paddr_t opa, pa;
4160	vm_page_t mpte, om;
4161	boolean_t nosleep;
4162
4163	PG_A = pmap_accessed_bit(pmap);
4164	PG_G = pmap_global_bit(pmap);
4165	PG_M = pmap_modified_bit(pmap);
4166	PG_V = pmap_valid_bit(pmap);
4167	PG_RW = pmap_rw_bit(pmap);
4168
4169	va = trunc_page(va);
4170	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
4171	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
4172	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
4173	    va));
4174	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
4175	    va >= kmi.clean_eva,
4176	    ("pmap_enter: managed mapping within the clean submap"));
4177	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
4178		VM_OBJECT_ASSERT_LOCKED(m->object);
4179	pa = VM_PAGE_TO_PHYS(m);
4180	newpte = (pt_entry_t)(pa | PG_A | PG_V);
4181	if ((flags & VM_PROT_WRITE) != 0)
4182		newpte |= PG_M;
4183	if ((prot & VM_PROT_WRITE) != 0)
4184		newpte |= PG_RW;
4185	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
4186	    ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
4187	if ((prot & VM_PROT_EXECUTE) == 0)
4188		newpte |= pg_nx;
4189	if ((flags & PMAP_ENTER_WIRED) != 0)
4190		newpte |= PG_W;
4191	if (va < VM_MAXUSER_ADDRESS)
4192		newpte |= PG_U;
4193	if (pmap == kernel_pmap)
4194		newpte |= PG_G;
4195	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
4196
4197	/*
4198	 * Set modified bit gratuitously for writeable mappings if
4199	 * the page is unmanaged. We do not want to take a fault
4200	 * to do the dirty bit accounting for these mappings.
4201	 */
4202	if ((m->oflags & VPO_UNMANAGED) != 0) {
4203		if ((newpte & PG_RW) != 0)
4204			newpte |= PG_M;
4205	}
4206
4207	mpte = NULL;
4208
4209	lock = NULL;
4210	rw_rlock(&pvh_global_lock);
4211	PMAP_LOCK(pmap);
4212
4213	/*
4214	 * In the case that a page table page is not
4215	 * resident, we are creating it here.
4216	 */
4217retry:
4218	pde = pmap_pde(pmap, va);
4219	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
4220	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
4221		pte = pmap_pde_to_pte(pde, va);
4222		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
4223			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4224			mpte->wire_count++;
4225		}
4226	} else if (va < VM_MAXUSER_ADDRESS) {
4227		/*
4228		 * Here if the pte page isn't mapped, or if it has been
4229		 * deallocated.
4230		 */
4231		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
4232		mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
4233		    nosleep ? NULL : &lock);
4234		if (mpte == NULL && nosleep) {
4235			if (lock != NULL)
4236				rw_wunlock(lock);
4237			rw_runlock(&pvh_global_lock);
4238			PMAP_UNLOCK(pmap);
4239			return (KERN_RESOURCE_SHORTAGE);
4240		}
4241		goto retry;
4242	} else
4243		panic("pmap_enter: invalid page directory va=%#lx", va);
4244
4245	origpte = *pte;
4246
4247	/*
4248	 * Is the specified virtual address already mapped?
4249	 */
4250	if ((origpte & PG_V) != 0) {
4251		/*
4252		 * Wiring change, just update stats. We don't worry about
4253		 * wiring PT pages as they remain resident as long as there
4254		 * are valid mappings in them. Hence, if a user page is wired,
4255		 * the PT page will be also.
4256		 */
4257		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
4258			pmap->pm_stats.wired_count++;
4259		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
4260			pmap->pm_stats.wired_count--;
4261
4262		/*
4263		 * Remove the extra PT page reference.
4264		 */
4265		if (mpte != NULL) {
4266			mpte->wire_count--;
4267			KASSERT(mpte->wire_count > 0,
4268			    ("pmap_enter: missing reference to page table page,"
4269			     " va: 0x%lx", va));
4270		}
4271
4272		/*
4273		 * Has the physical page changed?
4274		 */
4275		opa = origpte & PG_FRAME;
4276		if (opa == pa) {
4277			/*
4278			 * No, might be a protection or wiring change.
4279			 */
4280			if ((origpte & PG_MANAGED) != 0) {
4281				newpte |= PG_MANAGED;
4282				if ((newpte & PG_RW) != 0)
4283					vm_page_aflag_set(m, PGA_WRITEABLE);
4284			}
4285			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
4286				goto unchanged;
4287			goto validate;
4288		}
4289	} else {
4290		/*
4291		 * Increment the counters.
4292		 */
4293		if ((newpte & PG_W) != 0)
4294			pmap->pm_stats.wired_count++;
4295		pmap_resident_count_inc(pmap, 1);
4296	}
4297
4298	/*
4299	 * Enter on the PV list if part of our managed memory.
4300	 */
4301	if ((m->oflags & VPO_UNMANAGED) == 0) {
4302		newpte |= PG_MANAGED;
4303		pv = get_pv_entry(pmap, &lock);
4304		pv->pv_va = va;
4305		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
4306		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4307		m->md.pv_gen++;
4308		if ((newpte & PG_RW) != 0)
4309			vm_page_aflag_set(m, PGA_WRITEABLE);
4310	}
4311
4312	/*
4313	 * Update the PTE.
4314	 */
4315	if ((origpte & PG_V) != 0) {
4316validate:
4317		origpte = pte_load_store(pte, newpte);
4318		opa = origpte & PG_FRAME;
4319		if (opa != pa) {
4320			if ((origpte & PG_MANAGED) != 0) {
4321				om = PHYS_TO_VM_PAGE(opa);
4322				if ((origpte & (PG_M | PG_RW)) == (PG_M |
4323				    PG_RW))
4324					vm_page_dirty(om);
4325				if ((origpte & PG_A) != 0)
4326					vm_page_aflag_set(om, PGA_REFERENCED);
4327				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
4328				pmap_pvh_free(&om->md, pmap, va);
4329				if ((om->aflags & PGA_WRITEABLE) != 0 &&
4330				    TAILQ_EMPTY(&om->md.pv_list) &&
4331				    ((om->flags & PG_FICTITIOUS) != 0 ||
4332				    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
4333					vm_page_aflag_clear(om, PGA_WRITEABLE);
4334			}
4335		} else if ((newpte & PG_M) == 0 && (origpte & (PG_M |
4336		    PG_RW)) == (PG_M | PG_RW)) {
4337			if ((origpte & PG_MANAGED) != 0)
4338				vm_page_dirty(m);
4339
4340			/*
4341			 * Although the PTE may still have PG_RW set, TLB
4342			 * invalidation may nonetheless be required because
4343			 * the PTE no longer has PG_M set.
4344			 */
4345		} else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
4346			/*
4347			 * This PTE change does not require TLB invalidation.
4348			 */
4349			goto unchanged;
4350		}
4351		if ((origpte & PG_A) != 0)
4352			pmap_invalidate_page(pmap, va);
4353	} else
4354		pte_store(pte, newpte);
4355
4356unchanged:
4357
4358	/*
4359	 * If both the page table page and the reservation are fully
4360	 * populated, then attempt promotion.
4361	 */
4362	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
4363	    pmap_ps_enabled(pmap) &&
4364	    (m->flags & PG_FICTITIOUS) == 0 &&
4365	    vm_reserv_level_iffullpop(m) == 0)
4366		pmap_promote_pde(pmap, pde, va, &lock);
4367
4368	if (lock != NULL)
4369		rw_wunlock(lock);
4370	rw_runlock(&pvh_global_lock);
4371	PMAP_UNLOCK(pmap);
4372	return (KERN_SUCCESS);
4373}
4374
4375/*
4376 * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
4377 * otherwise.  Fails if (1) a page table page cannot be allocated without
4378 * blocking, (2) a mapping already exists at the specified virtual address, or
4379 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
4380 */
4381static boolean_t
4382pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4383    struct rwlock **lockp)
4384{
4385	pd_entry_t *pde, newpde;
4386	pt_entry_t PG_V;
4387	vm_page_t mpde;
4388	struct spglist free;
4389
4390	PG_V = pmap_valid_bit(pmap);
4391	rw_assert(&pvh_global_lock, RA_LOCKED);
4392	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4393
4394	if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
4395		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4396		    " in pmap %p", va, pmap);
4397		return (FALSE);
4398	}
4399	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
4400	pde = &pde[pmap_pde_index(va)];
4401	if ((*pde & PG_V) != 0) {
4402		KASSERT(mpde->wire_count > 1,
4403		    ("pmap_enter_pde: mpde's wire count is too low"));
4404		mpde->wire_count--;
4405		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4406		    " in pmap %p", va, pmap);
4407		return (FALSE);
4408	}
4409	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
4410	    PG_PS | PG_V;
4411	if ((m->oflags & VPO_UNMANAGED) == 0) {
4412		newpde |= PG_MANAGED;
4413
4414		/*
4415		 * Abort this mapping if its PV entry could not be created.
4416		 */
4417		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
4418		    lockp)) {
4419			SLIST_INIT(&free);
4420			if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
4421				pmap_invalidate_page(pmap, va);
4422				pmap_free_zero_pages(&free);
4423			}
4424			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4425			    " in pmap %p", va, pmap);
4426			return (FALSE);
4427		}
4428	}
4429	if ((prot & VM_PROT_EXECUTE) == 0)
4430		newpde |= pg_nx;
4431	if (va < VM_MAXUSER_ADDRESS)
4432		newpde |= PG_U;
4433
4434	/*
4435	 * Increment counters.
4436	 */
4437	pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4438
4439	/*
4440	 * Map the superpage.
4441	 */
4442	pde_store(pde, newpde);
4443
4444	atomic_add_long(&pmap_pde_mappings, 1);
4445	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
4446	    " in pmap %p", va, pmap);
4447	return (TRUE);
4448}
4449
4450/*
4451 * Maps a sequence of resident pages belonging to the same object.
4452 * The sequence begins with the given page m_start.  This page is
4453 * mapped at the given virtual address start.  Each subsequent page is
4454 * mapped at a virtual address that is offset from start by the same
4455 * amount as the page is offset from m_start within the object.  The
4456 * last page in the sequence is the page with the largest offset from
4457 * m_start that can be mapped at a virtual address less than the given
4458 * virtual address end.  Not every virtual page between start and end
4459 * is mapped; only those for which a resident page exists with the
4460 * corresponding offset from m_start are mapped.
4461 */
4462void
4463pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4464    vm_page_t m_start, vm_prot_t prot)
4465{
4466	struct rwlock *lock;
4467	vm_offset_t va;
4468	vm_page_t m, mpte;
4469	vm_pindex_t diff, psize;
4470
4471	VM_OBJECT_ASSERT_LOCKED(m_start->object);
4472
4473	psize = atop(end - start);
4474	mpte = NULL;
4475	m = m_start;
4476	lock = NULL;
4477	rw_rlock(&pvh_global_lock);
4478	PMAP_LOCK(pmap);
4479	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4480		va = start + ptoa(diff);
4481		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
4482		    m->psind == 1 && pmap_ps_enabled(pmap) &&
4483		    pmap_enter_pde(pmap, va, m, prot, &lock))
4484			m = &m[NBPDR / PAGE_SIZE - 1];
4485		else
4486			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
4487			    mpte, &lock);
4488		m = TAILQ_NEXT(m, listq);
4489	}
4490	if (lock != NULL)
4491		rw_wunlock(lock);
4492	rw_runlock(&pvh_global_lock);
4493	PMAP_UNLOCK(pmap);
4494}
4495
4496/*
4497 * this code makes some *MAJOR* assumptions:
4498 * 1. Current pmap & pmap exists.
4499 * 2. Not wired.
4500 * 3. Read access.
4501 * 4. No page table pages.
4502 * but is *MUCH* faster than pmap_enter...
4503 */
4504
4505void
4506pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4507{
4508	struct rwlock *lock;
4509
4510	lock = NULL;
4511	rw_rlock(&pvh_global_lock);
4512	PMAP_LOCK(pmap);
4513	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
4514	if (lock != NULL)
4515		rw_wunlock(lock);
4516	rw_runlock(&pvh_global_lock);
4517	PMAP_UNLOCK(pmap);
4518}
4519
4520static vm_page_t
4521pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
4522    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
4523{
4524	struct spglist free;
4525	pt_entry_t *pte, PG_V;
4526	vm_paddr_t pa;
4527
4528	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
4529	    (m->oflags & VPO_UNMANAGED) != 0,
4530	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
4531	PG_V = pmap_valid_bit(pmap);
4532	rw_assert(&pvh_global_lock, RA_LOCKED);
4533	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4534
4535	/*
4536	 * In the case that a page table page is not
4537	 * resident, we are creating it here.
4538	 */
4539	if (va < VM_MAXUSER_ADDRESS) {
4540		vm_pindex_t ptepindex;
4541		pd_entry_t *ptepa;
4542
4543		/*
4544		 * Calculate pagetable page index
4545		 */
4546		ptepindex = pmap_pde_pindex(va);
4547		if (mpte && (mpte->pindex == ptepindex)) {
4548			mpte->wire_count++;
4549		} else {
4550			/*
4551			 * Get the page directory entry
4552			 */
4553			ptepa = pmap_pde(pmap, va);
4554
4555			/*
4556			 * If the page table page is mapped, we just increment
4557			 * the hold count, and activate it.  Otherwise, we
4558			 * attempt to allocate a page table page.  If this
4559			 * attempt fails, we don't retry.  Instead, we give up.
4560			 */
4561			if (ptepa && (*ptepa & PG_V) != 0) {
4562				if (*ptepa & PG_PS)
4563					return (NULL);
4564				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
4565				mpte->wire_count++;
4566			} else {
4567				/*
4568				 * Pass NULL instead of the PV list lock
4569				 * pointer, because we don't intend to sleep.
4570				 */
4571				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
4572				if (mpte == NULL)
4573					return (mpte);
4574			}
4575		}
4576		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
4577		pte = &pte[pmap_pte_index(va)];
4578	} else {
4579		mpte = NULL;
4580		pte = vtopte(va);
4581	}
4582	if (*pte) {
4583		if (mpte != NULL) {
4584			mpte->wire_count--;
4585			mpte = NULL;
4586		}
4587		return (mpte);
4588	}
4589
4590	/*
4591	 * Enter on the PV list if part of our managed memory.
4592	 */
4593	if ((m->oflags & VPO_UNMANAGED) == 0 &&
4594	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
4595		if (mpte != NULL) {
4596			SLIST_INIT(&free);
4597			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
4598				pmap_invalidate_page(pmap, va);
4599				pmap_free_zero_pages(&free);
4600			}
4601			mpte = NULL;
4602		}
4603		return (mpte);
4604	}
4605
4606	/*
4607	 * Increment counters
4608	 */
4609	pmap_resident_count_inc(pmap, 1);
4610
4611	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
4612	if ((prot & VM_PROT_EXECUTE) == 0)
4613		pa |= pg_nx;
4614
4615	/*
4616	 * Now validate mapping with RO protection
4617	 */
4618	if ((m->oflags & VPO_UNMANAGED) != 0)
4619		pte_store(pte, pa | PG_V | PG_U);
4620	else
4621		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
4622	return (mpte);
4623}
4624
4625/*
4626 * Make a temporary mapping for a physical address.  This is only intended
4627 * to be used for panic dumps.
4628 */
4629void *
4630pmap_kenter_temporary(vm_paddr_t pa, int i)
4631{
4632	vm_offset_t va;
4633
4634	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
4635	pmap_kenter(va, pa);
4636	invlpg(va);
4637	return ((void *)crashdumpmap);
4638}
4639
4640/*
4641 * This code maps large physical mmap regions into the
4642 * processor address space.  Note that some shortcuts
4643 * are taken, but the code works.
4644 */
4645void
4646pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
4647    vm_pindex_t pindex, vm_size_t size)
4648{
4649	pd_entry_t *pde;
4650	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
4651	vm_paddr_t pa, ptepa;
4652	vm_page_t p, pdpg;
4653	int pat_mode;
4654
4655	PG_A = pmap_accessed_bit(pmap);
4656	PG_M = pmap_modified_bit(pmap);
4657	PG_V = pmap_valid_bit(pmap);
4658	PG_RW = pmap_rw_bit(pmap);
4659
4660	VM_OBJECT_ASSERT_WLOCKED(object);
4661	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4662	    ("pmap_object_init_pt: non-device object"));
4663	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
4664		if (!pmap_ps_enabled(pmap))
4665			return;
4666		if (!vm_object_populate(object, pindex, pindex + atop(size)))
4667			return;
4668		p = vm_page_lookup(object, pindex);
4669		KASSERT(p->valid == VM_PAGE_BITS_ALL,
4670		    ("pmap_object_init_pt: invalid page %p", p));
4671		pat_mode = p->md.pat_mode;
4672
4673		/*
4674		 * Abort the mapping if the first page is not physically
4675		 * aligned to a 2MB page boundary.
4676		 */
4677		ptepa = VM_PAGE_TO_PHYS(p);
4678		if (ptepa & (NBPDR - 1))
4679			return;
4680
4681		/*
4682		 * Skip the first page.  Abort the mapping if the rest of
4683		 * the pages are not physically contiguous or have differing
4684		 * memory attributes.
4685		 */
4686		p = TAILQ_NEXT(p, listq);
4687		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
4688		    pa += PAGE_SIZE) {
4689			KASSERT(p->valid == VM_PAGE_BITS_ALL,
4690			    ("pmap_object_init_pt: invalid page %p", p));
4691			if (pa != VM_PAGE_TO_PHYS(p) ||
4692			    pat_mode != p->md.pat_mode)
4693				return;
4694			p = TAILQ_NEXT(p, listq);
4695		}
4696
4697		/*
4698		 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
4699		 * "size" is a multiple of 2M, adding the PAT setting to "pa"
4700		 * will not affect the termination of this loop.
4701		 */
4702		PMAP_LOCK(pmap);
4703		for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
4704		    pa < ptepa + size; pa += NBPDR) {
4705			pdpg = pmap_allocpde(pmap, addr, NULL);
4706			if (pdpg == NULL) {
4707				/*
4708				 * The creation of mappings below is only an
4709				 * optimization.  If a page directory page
4710				 * cannot be allocated without blocking,
4711				 * continue on to the next mapping rather than
4712				 * blocking.
4713				 */
4714				addr += NBPDR;
4715				continue;
4716			}
4717			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
4718			pde = &pde[pmap_pde_index(addr)];
4719			if ((*pde & PG_V) == 0) {
4720				pde_store(pde, pa | PG_PS | PG_M | PG_A |
4721				    PG_U | PG_RW | PG_V);
4722				pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4723				atomic_add_long(&pmap_pde_mappings, 1);
4724			} else {
4725				/* Continue on if the PDE is already valid. */
4726				pdpg->wire_count--;
4727				KASSERT(pdpg->wire_count > 0,
4728				    ("pmap_object_init_pt: missing reference "
4729				    "to page directory page, va: 0x%lx", addr));
4730			}
4731			addr += NBPDR;
4732		}
4733		PMAP_UNLOCK(pmap);
4734	}
4735}
4736
4737/*
4738 *	Clear the wired attribute from the mappings for the specified range of
4739 *	addresses in the given pmap.  Every valid mapping within that range
4740 *	must have the wired attribute set.  In contrast, invalid mappings
4741 *	cannot have the wired attribute set, so they are ignored.
4742 *
4743 *	The wired attribute of the page table entry is not a hardware feature,
4744 *	so there is no need to invalidate any TLB entries.
4745 */
4746void
4747pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4748{
4749	vm_offset_t va_next;
4750	pml4_entry_t *pml4e;
4751	pdp_entry_t *pdpe;
4752	pd_entry_t *pde;
4753	pt_entry_t *pte, PG_V;
4754	boolean_t pv_lists_locked;
4755
4756	PG_V = pmap_valid_bit(pmap);
4757	pv_lists_locked = FALSE;
4758resume:
4759	PMAP_LOCK(pmap);
4760	for (; sva < eva; sva = va_next) {
4761		pml4e = pmap_pml4e(pmap, sva);
4762		if ((*pml4e & PG_V) == 0) {
4763			va_next = (sva + NBPML4) & ~PML4MASK;
4764			if (va_next < sva)
4765				va_next = eva;
4766			continue;
4767		}
4768		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
4769		if ((*pdpe & PG_V) == 0) {
4770			va_next = (sva + NBPDP) & ~PDPMASK;
4771			if (va_next < sva)
4772				va_next = eva;
4773			continue;
4774		}
4775		va_next = (sva + NBPDR) & ~PDRMASK;
4776		if (va_next < sva)
4777			va_next = eva;
4778		pde = pmap_pdpe_to_pde(pdpe, sva);
4779		if ((*pde & PG_V) == 0)
4780			continue;
4781		if ((*pde & PG_PS) != 0) {
4782			if ((*pde & PG_W) == 0)
4783				panic("pmap_unwire: pde %#jx is missing PG_W",
4784				    (uintmax_t)*pde);
4785
4786			/*
4787			 * Are we unwiring the entire large page?  If not,
4788			 * demote the mapping and fall through.
4789			 */
4790			if (sva + NBPDR == va_next && eva >= va_next) {
4791				atomic_clear_long(pde, PG_W);
4792				pmap->pm_stats.wired_count -= NBPDR /
4793				    PAGE_SIZE;
4794				continue;
4795			} else {
4796				if (!pv_lists_locked) {
4797					pv_lists_locked = TRUE;
4798					if (!rw_try_rlock(&pvh_global_lock)) {
4799						PMAP_UNLOCK(pmap);
4800						rw_rlock(&pvh_global_lock);
4801						/* Repeat sva. */
4802						goto resume;
4803					}
4804				}
4805				if (!pmap_demote_pde(pmap, pde, sva))
4806					panic("pmap_unwire: demotion failed");
4807			}
4808		}
4809		if (va_next > eva)
4810			va_next = eva;
4811		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
4812		    sva += PAGE_SIZE) {
4813			if ((*pte & PG_V) == 0)
4814				continue;
4815			if ((*pte & PG_W) == 0)
4816				panic("pmap_unwire: pte %#jx is missing PG_W",
4817				    (uintmax_t)*pte);
4818
4819			/*
4820			 * PG_W must be cleared atomically.  Although the pmap
4821			 * lock synchronizes access to PG_W, another processor
4822			 * could be setting PG_M and/or PG_A concurrently.
4823			 */
4824			atomic_clear_long(pte, PG_W);
4825			pmap->pm_stats.wired_count--;
4826		}
4827	}
4828	if (pv_lists_locked)
4829		rw_runlock(&pvh_global_lock);
4830	PMAP_UNLOCK(pmap);
4831}
4832
4833/*
4834 *	Copy the range specified by src_addr/len
4835 *	from the source map to the range dst_addr/len
4836 *	in the destination map.
4837 *
4838 *	This routine is only advisory and need not do anything.
4839 */
4840
4841void
4842pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4843    vm_offset_t src_addr)
4844{
4845	struct rwlock *lock;
4846	struct spglist free;
4847	vm_offset_t addr;
4848	vm_offset_t end_addr = src_addr + len;
4849	vm_offset_t va_next;
4850	pt_entry_t PG_A, PG_M, PG_V;
4851
4852	if (dst_addr != src_addr)
4853		return;
4854
4855	if (dst_pmap->pm_type != src_pmap->pm_type)
4856		return;
4857
4858	/*
4859	 * EPT page table entries that require emulation of A/D bits are
4860	 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
4861	 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
4862	 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
4863	 * implementations flag an EPT misconfiguration for exec-only
4864	 * mappings we skip this function entirely for emulated pmaps.
4865	 */
4866	if (pmap_emulate_ad_bits(dst_pmap))
4867		return;
4868
4869	lock = NULL;
4870	rw_rlock(&pvh_global_lock);
4871	if (dst_pmap < src_pmap) {
4872		PMAP_LOCK(dst_pmap);
4873		PMAP_LOCK(src_pmap);
4874	} else {
4875		PMAP_LOCK(src_pmap);
4876		PMAP_LOCK(dst_pmap);
4877	}
4878
4879	PG_A = pmap_accessed_bit(dst_pmap);
4880	PG_M = pmap_modified_bit(dst_pmap);
4881	PG_V = pmap_valid_bit(dst_pmap);
4882
4883	for (addr = src_addr; addr < end_addr; addr = va_next) {
4884		pt_entry_t *src_pte, *dst_pte;
4885		vm_page_t dstmpde, dstmpte, srcmpte;
4886		pml4_entry_t *pml4e;
4887		pdp_entry_t *pdpe;
4888		pd_entry_t srcptepaddr, *pde;
4889
4890		KASSERT(addr < UPT_MIN_ADDRESS,
4891		    ("pmap_copy: invalid to pmap_copy page tables"));
4892
4893		pml4e = pmap_pml4e(src_pmap, addr);
4894		if ((*pml4e & PG_V) == 0) {
4895			va_next = (addr + NBPML4) & ~PML4MASK;
4896			if (va_next < addr)
4897				va_next = end_addr;
4898			continue;
4899		}
4900
4901		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
4902		if ((*pdpe & PG_V) == 0) {
4903			va_next = (addr + NBPDP) & ~PDPMASK;
4904			if (va_next < addr)
4905				va_next = end_addr;
4906			continue;
4907		}
4908
4909		va_next = (addr + NBPDR) & ~PDRMASK;
4910		if (va_next < addr)
4911			va_next = end_addr;
4912
4913		pde = pmap_pdpe_to_pde(pdpe, addr);
4914		srcptepaddr = *pde;
4915		if (srcptepaddr == 0)
4916			continue;
4917
4918		if (srcptepaddr & PG_PS) {
4919			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
4920				continue;
4921			dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
4922			if (dstmpde == NULL)
4923				break;
4924			pde = (pd_entry_t *)
4925			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
4926			pde = &pde[pmap_pde_index(addr)];
4927			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
4928			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4929			    PG_PS_FRAME, &lock))) {
4930				*pde = srcptepaddr & ~PG_W;
4931				pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
4932			} else
4933				dstmpde->wire_count--;
4934			continue;
4935		}
4936
4937		srcptepaddr &= PG_FRAME;
4938		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
4939		KASSERT(srcmpte->wire_count > 0,
4940		    ("pmap_copy: source page table page is unused"));
4941
4942		if (va_next > end_addr)
4943			va_next = end_addr;
4944
4945		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
4946		src_pte = &src_pte[pmap_pte_index(addr)];
4947		dstmpte = NULL;
4948		while (addr < va_next) {
4949			pt_entry_t ptetemp;
4950			ptetemp = *src_pte;
4951			/*
4952			 * we only virtual copy managed pages
4953			 */
4954			if ((ptetemp & PG_MANAGED) != 0) {
4955				if (dstmpte != NULL &&
4956				    dstmpte->pindex == pmap_pde_pindex(addr))
4957					dstmpte->wire_count++;
4958				else if ((dstmpte = pmap_allocpte(dst_pmap,
4959				    addr, NULL)) == NULL)
4960					goto out;
4961				dst_pte = (pt_entry_t *)
4962				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
4963				dst_pte = &dst_pte[pmap_pte_index(addr)];
4964				if (*dst_pte == 0 &&
4965				    pmap_try_insert_pv_entry(dst_pmap, addr,
4966				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
4967				    &lock)) {
4968					/*
4969					 * Clear the wired, modified, and
4970					 * accessed (referenced) bits
4971					 * during the copy.
4972					 */
4973					*dst_pte = ptetemp & ~(PG_W | PG_M |
4974					    PG_A);
4975					pmap_resident_count_inc(dst_pmap, 1);
4976				} else {
4977					SLIST_INIT(&free);
4978					if (pmap_unwire_ptp(dst_pmap, addr,
4979					    dstmpte, &free)) {
4980						pmap_invalidate_page(dst_pmap,
4981						    addr);
4982						pmap_free_zero_pages(&free);
4983					}
4984					goto out;
4985				}
4986				if (dstmpte->wire_count >= srcmpte->wire_count)
4987					break;
4988			}
4989			addr += PAGE_SIZE;
4990			src_pte++;
4991		}
4992	}
4993out:
4994	if (lock != NULL)
4995		rw_wunlock(lock);
4996	rw_runlock(&pvh_global_lock);
4997	PMAP_UNLOCK(src_pmap);
4998	PMAP_UNLOCK(dst_pmap);
4999}
5000
5001/*
5002 *	pmap_zero_page zeros the specified hardware page by mapping
5003 *	the page into KVM and using bzero to clear its contents.
5004 */
5005void
5006pmap_zero_page(vm_page_t m)
5007{
5008	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5009
5010	pagezero((void *)va);
5011}
5012
5013/*
5014 *	pmap_zero_page_area zeros the specified hardware page by mapping
5015 *	the page into KVM and using bzero to clear its contents.
5016 *
5017 *	off and size may not cover an area beyond a single hardware page.
5018 */
5019void
5020pmap_zero_page_area(vm_page_t m, int off, int size)
5021{
5022	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5023
5024	if (off == 0 && size == PAGE_SIZE)
5025		pagezero((void *)va);
5026	else
5027		bzero((char *)va + off, size);
5028}
5029
5030/*
5031 *	pmap_zero_page_idle zeros the specified hardware page by mapping
5032 *	the page into KVM and using bzero to clear its contents.  This
5033 *	is intended to be called from the vm_pagezero process only and
5034 *	outside of Giant.
5035 */
5036void
5037pmap_zero_page_idle(vm_page_t m)
5038{
5039	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5040
5041	pagezero((void *)va);
5042}
5043
5044/*
5045 *	pmap_copy_page copies the specified (machine independent)
5046 *	page by mapping the page into virtual memory and using
5047 *	bcopy to copy the page, one machine dependent page at a
5048 *	time.
5049 */
5050void
5051pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
5052{
5053	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
5054	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
5055
5056	pagecopy((void *)src, (void *)dst);
5057}
5058
5059int unmapped_buf_allowed = 1;
5060
5061void
5062pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
5063    vm_offset_t b_offset, int xfersize)
5064{
5065	void *a_cp, *b_cp;
5066	vm_page_t m_a, m_b;
5067	vm_paddr_t p_a, p_b;
5068	pt_entry_t *pte;
5069	vm_offset_t a_pg_offset, b_pg_offset;
5070	int cnt;
5071	boolean_t pinned;
5072
5073	/*
5074	 * NB:  The sequence of updating a page table followed by accesses
5075	 * to the corresponding pages used in the !DMAP case is subject to
5076	 * the situation described in the "AMD64 Architecture Programmer's
5077	 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
5078	 * Coherency Considerations".  Therefore, issuing the INVLPG right
5079	 * after modifying the PTE bits is crucial.
5080	 */
5081	pinned = FALSE;
5082	while (xfersize > 0) {
5083		a_pg_offset = a_offset & PAGE_MASK;
5084		m_a = ma[a_offset >> PAGE_SHIFT];
5085		p_a = m_a->phys_addr;
5086		b_pg_offset = b_offset & PAGE_MASK;
5087		m_b = mb[b_offset >> PAGE_SHIFT];
5088		p_b = m_b->phys_addr;
5089		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
5090		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
5091		if (__predict_false(p_a < DMAP_MIN_ADDRESS ||
5092		    p_a > DMAP_MIN_ADDRESS + dmaplimit)) {
5093			mtx_lock(&cpage_lock);
5094			sched_pin();
5095			pinned = TRUE;
5096			pte = vtopte(cpage_a);
5097			*pte = p_a | X86_PG_A | X86_PG_V |
5098			    pmap_cache_bits(kernel_pmap, m_a->md.pat_mode, 0);
5099			invlpg(cpage_a);
5100			a_cp = (char *)cpage_a + a_pg_offset;
5101		} else {
5102			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
5103		}
5104		if (__predict_false(p_b < DMAP_MIN_ADDRESS ||
5105		    p_b > DMAP_MIN_ADDRESS + dmaplimit)) {
5106			if (!pinned) {
5107				mtx_lock(&cpage_lock);
5108				sched_pin();
5109				pinned = TRUE;
5110			}
5111			pte = vtopte(cpage_b);
5112			*pte = p_b | X86_PG_A | X86_PG_M | X86_PG_RW |
5113			    X86_PG_V | pmap_cache_bits(kernel_pmap,
5114			    m_b->md.pat_mode, 0);
5115			invlpg(cpage_b);
5116			b_cp = (char *)cpage_b + b_pg_offset;
5117		} else {
5118			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
5119		}
5120		bcopy(a_cp, b_cp, cnt);
5121		if (__predict_false(pinned)) {
5122			sched_unpin();
5123			mtx_unlock(&cpage_lock);
5124			pinned = FALSE;
5125		}
5126		a_offset += cnt;
5127		b_offset += cnt;
5128		xfersize -= cnt;
5129	}
5130}
5131
5132/*
5133 * Returns true if the pmap's pv is one of the first
5134 * 16 pvs linked to from this page.  This count may
5135 * be changed upwards or downwards in the future; it
5136 * is only necessary that true be returned for a small
5137 * subset of pmaps for proper page aging.
5138 */
5139boolean_t
5140pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
5141{
5142	struct md_page *pvh;
5143	struct rwlock *lock;
5144	pv_entry_t pv;
5145	int loops = 0;
5146	boolean_t rv;
5147
5148	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5149	    ("pmap_page_exists_quick: page %p is not managed", m));
5150	rv = FALSE;
5151	rw_rlock(&pvh_global_lock);
5152	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5153	rw_rlock(lock);
5154	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5155		if (PV_PMAP(pv) == pmap) {
5156			rv = TRUE;
5157			break;
5158		}
5159		loops++;
5160		if (loops >= 16)
5161			break;
5162	}
5163	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5164		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5165		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5166			if (PV_PMAP(pv) == pmap) {
5167				rv = TRUE;
5168				break;
5169			}
5170			loops++;
5171			if (loops >= 16)
5172				break;
5173		}
5174	}
5175	rw_runlock(lock);
5176	rw_runlock(&pvh_global_lock);
5177	return (rv);
5178}
5179
5180/*
5181 *	pmap_page_wired_mappings:
5182 *
5183 *	Return the number of managed mappings to the given physical page
5184 *	that are wired.
5185 */
5186int
5187pmap_page_wired_mappings(vm_page_t m)
5188{
5189	struct rwlock *lock;
5190	struct md_page *pvh;
5191	pmap_t pmap;
5192	pt_entry_t *pte;
5193	pv_entry_t pv;
5194	int count, md_gen, pvh_gen;
5195
5196	if ((m->oflags & VPO_UNMANAGED) != 0)
5197		return (0);
5198	rw_rlock(&pvh_global_lock);
5199	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5200	rw_rlock(lock);
5201restart:
5202	count = 0;
5203	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5204		pmap = PV_PMAP(pv);
5205		if (!PMAP_TRYLOCK(pmap)) {
5206			md_gen = m->md.pv_gen;
5207			rw_runlock(lock);
5208			PMAP_LOCK(pmap);
5209			rw_rlock(lock);
5210			if (md_gen != m->md.pv_gen) {
5211				PMAP_UNLOCK(pmap);
5212				goto restart;
5213			}
5214		}
5215		pte = pmap_pte(pmap, pv->pv_va);
5216		if ((*pte & PG_W) != 0)
5217			count++;
5218		PMAP_UNLOCK(pmap);
5219	}
5220	if ((m->flags & PG_FICTITIOUS) == 0) {
5221		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5222		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5223			pmap = PV_PMAP(pv);
5224			if (!PMAP_TRYLOCK(pmap)) {
5225				md_gen = m->md.pv_gen;
5226				pvh_gen = pvh->pv_gen;
5227				rw_runlock(lock);
5228				PMAP_LOCK(pmap);
5229				rw_rlock(lock);
5230				if (md_gen != m->md.pv_gen ||
5231				    pvh_gen != pvh->pv_gen) {
5232					PMAP_UNLOCK(pmap);
5233					goto restart;
5234				}
5235			}
5236			pte = pmap_pde(pmap, pv->pv_va);
5237			if ((*pte & PG_W) != 0)
5238				count++;
5239			PMAP_UNLOCK(pmap);
5240		}
5241	}
5242	rw_runlock(lock);
5243	rw_runlock(&pvh_global_lock);
5244	return (count);
5245}
5246
5247/*
5248 * Returns TRUE if the given page is mapped individually or as part of
5249 * a 2mpage.  Otherwise, returns FALSE.
5250 */
5251boolean_t
5252pmap_page_is_mapped(vm_page_t m)
5253{
5254	struct rwlock *lock;
5255	boolean_t rv;
5256
5257	if ((m->oflags & VPO_UNMANAGED) != 0)
5258		return (FALSE);
5259	rw_rlock(&pvh_global_lock);
5260	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5261	rw_rlock(lock);
5262	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5263	    ((m->flags & PG_FICTITIOUS) == 0 &&
5264	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
5265	rw_runlock(lock);
5266	rw_runlock(&pvh_global_lock);
5267	return (rv);
5268}
5269
5270/*
5271 * Destroy all managed, non-wired mappings in the given user-space
5272 * pmap.  This pmap cannot be active on any processor besides the
5273 * caller.
5274 *
5275 * This function cannot be applied to the kernel pmap.  Moreover, it
5276 * is not intended for general use.  It is only to be used during
5277 * process termination.  Consequently, it can be implemented in ways
5278 * that make it faster than pmap_remove().  First, it can more quickly
5279 * destroy mappings by iterating over the pmap's collection of PV
5280 * entries, rather than searching the page table.  Second, it doesn't
5281 * have to test and clear the page table entries atomically, because
5282 * no processor is currently accessing the user address space.  In
5283 * particular, a page table entry's dirty bit won't change state once
5284 * this function starts.
5285 */
5286void
5287pmap_remove_pages(pmap_t pmap)
5288{
5289	pd_entry_t ptepde;
5290	pt_entry_t *pte, tpte;
5291	pt_entry_t PG_M, PG_RW, PG_V;
5292	struct spglist free;
5293	vm_page_t m, mpte, mt;
5294	pv_entry_t pv;
5295	struct md_page *pvh;
5296	struct pv_chunk *pc, *npc;
5297	struct rwlock *lock;
5298	int64_t bit;
5299	uint64_t inuse, bitmask;
5300	int allfree, field, freed, idx;
5301	boolean_t superpage;
5302	vm_paddr_t pa;
5303
5304	/*
5305	 * Assert that the given pmap is only active on the current
5306	 * CPU.  Unfortunately, we cannot block another CPU from
5307	 * activating the pmap while this function is executing.
5308	 */
5309	KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
5310#ifdef INVARIANTS
5311	{
5312		cpuset_t other_cpus;
5313
5314		other_cpus = all_cpus;
5315		critical_enter();
5316		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
5317		CPU_AND(&other_cpus, &pmap->pm_active);
5318		critical_exit();
5319		KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
5320	}
5321#endif
5322
5323	lock = NULL;
5324	PG_M = pmap_modified_bit(pmap);
5325	PG_V = pmap_valid_bit(pmap);
5326	PG_RW = pmap_rw_bit(pmap);
5327
5328	SLIST_INIT(&free);
5329	rw_rlock(&pvh_global_lock);
5330	PMAP_LOCK(pmap);
5331	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5332		allfree = 1;
5333		freed = 0;
5334		for (field = 0; field < _NPCM; field++) {
5335			inuse = ~pc->pc_map[field] & pc_freemask[field];
5336			while (inuse != 0) {
5337				bit = bsfq(inuse);
5338				bitmask = 1UL << bit;
5339				idx = field * 64 + bit;
5340				pv = &pc->pc_pventry[idx];
5341				inuse &= ~bitmask;
5342
5343				pte = pmap_pdpe(pmap, pv->pv_va);
5344				ptepde = *pte;
5345				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
5346				tpte = *pte;
5347				if ((tpte & (PG_PS | PG_V)) == PG_V) {
5348					superpage = FALSE;
5349					ptepde = tpte;
5350					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
5351					    PG_FRAME);
5352					pte = &pte[pmap_pte_index(pv->pv_va)];
5353					tpte = *pte;
5354				} else {
5355					/*
5356					 * Keep track whether 'tpte' is a
5357					 * superpage explicitly instead of
5358					 * relying on PG_PS being set.
5359					 *
5360					 * This is because PG_PS is numerically
5361					 * identical to PG_PTE_PAT and thus a
5362					 * regular page could be mistaken for
5363					 * a superpage.
5364					 */
5365					superpage = TRUE;
5366				}
5367
5368				if ((tpte & PG_V) == 0) {
5369					panic("bad pte va %lx pte %lx",
5370					    pv->pv_va, tpte);
5371				}
5372
5373/*
5374 * We cannot remove wired pages from a process' mapping at this time
5375 */
5376				if (tpte & PG_W) {
5377					allfree = 0;
5378					continue;
5379				}
5380
5381				if (superpage)
5382					pa = tpte & PG_PS_FRAME;
5383				else
5384					pa = tpte & PG_FRAME;
5385
5386				m = PHYS_TO_VM_PAGE(pa);
5387				KASSERT(m->phys_addr == pa,
5388				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5389				    m, (uintmax_t)m->phys_addr,
5390				    (uintmax_t)tpte));
5391
5392				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5393				    m < &vm_page_array[vm_page_array_size],
5394				    ("pmap_remove_pages: bad tpte %#jx",
5395				    (uintmax_t)tpte));
5396
5397				pte_clear(pte);
5398
5399				/*
5400				 * Update the vm_page_t clean/reference bits.
5401				 */
5402				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5403					if (superpage) {
5404						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5405							vm_page_dirty(mt);
5406					} else
5407						vm_page_dirty(m);
5408				}
5409
5410				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5411
5412				/* Mark free */
5413				pc->pc_map[field] |= bitmask;
5414				if (superpage) {
5415					pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
5416					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
5417					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5418					pvh->pv_gen++;
5419					if (TAILQ_EMPTY(&pvh->pv_list)) {
5420						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5421							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
5422							    TAILQ_EMPTY(&mt->md.pv_list))
5423								vm_page_aflag_clear(mt, PGA_WRITEABLE);
5424					}
5425					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
5426					if (mpte != NULL) {
5427						pmap_remove_pt_page(pmap, mpte);
5428						pmap_resident_count_dec(pmap, 1);
5429						KASSERT(mpte->wire_count == NPTEPG,
5430						    ("pmap_remove_pages: pte page wire count error"));
5431						mpte->wire_count = 0;
5432						pmap_add_delayed_free_list(mpte, &free, FALSE);
5433						atomic_subtract_int(&cnt.v_wire_count, 1);
5434					}
5435				} else {
5436					pmap_resident_count_dec(pmap, 1);
5437					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5438					m->md.pv_gen++;
5439					if ((m->aflags & PGA_WRITEABLE) != 0 &&
5440					    TAILQ_EMPTY(&m->md.pv_list) &&
5441					    (m->flags & PG_FICTITIOUS) == 0) {
5442						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5443						if (TAILQ_EMPTY(&pvh->pv_list))
5444							vm_page_aflag_clear(m, PGA_WRITEABLE);
5445					}
5446				}
5447				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
5448				freed++;
5449			}
5450		}
5451		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5452		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5453		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5454		if (allfree) {
5455			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5456			free_pv_chunk(pc);
5457		}
5458	}
5459	if (lock != NULL)
5460		rw_wunlock(lock);
5461	pmap_invalidate_all(pmap);
5462	rw_runlock(&pvh_global_lock);
5463	PMAP_UNLOCK(pmap);
5464	pmap_free_zero_pages(&free);
5465}
5466
5467static boolean_t
5468pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
5469{
5470	struct rwlock *lock;
5471	pv_entry_t pv;
5472	struct md_page *pvh;
5473	pt_entry_t *pte, mask;
5474	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
5475	pmap_t pmap;
5476	int md_gen, pvh_gen;
5477	boolean_t rv;
5478
5479	rv = FALSE;
5480	rw_rlock(&pvh_global_lock);
5481	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5482	rw_rlock(lock);
5483restart:
5484	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5485		pmap = PV_PMAP(pv);
5486		if (!PMAP_TRYLOCK(pmap)) {
5487			md_gen = m->md.pv_gen;
5488			rw_runlock(lock);
5489			PMAP_LOCK(pmap);
5490			rw_rlock(lock);
5491			if (md_gen != m->md.pv_gen) {
5492				PMAP_UNLOCK(pmap);
5493				goto restart;
5494			}
5495		}
5496		pte = pmap_pte(pmap, pv->pv_va);
5497		mask = 0;
5498		if (modified) {
5499			PG_M = pmap_modified_bit(pmap);
5500			PG_RW = pmap_rw_bit(pmap);
5501			mask |= PG_RW | PG_M;
5502		}
5503		if (accessed) {
5504			PG_A = pmap_accessed_bit(pmap);
5505			PG_V = pmap_valid_bit(pmap);
5506			mask |= PG_V | PG_A;
5507		}
5508		rv = (*pte & mask) == mask;
5509		PMAP_UNLOCK(pmap);
5510		if (rv)
5511			goto out;
5512	}
5513	if ((m->flags & PG_FICTITIOUS) == 0) {
5514		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5515		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5516			pmap = PV_PMAP(pv);
5517			if (!PMAP_TRYLOCK(pmap)) {
5518				md_gen = m->md.pv_gen;
5519				pvh_gen = pvh->pv_gen;
5520				rw_runlock(lock);
5521				PMAP_LOCK(pmap);
5522				rw_rlock(lock);
5523				if (md_gen != m->md.pv_gen ||
5524				    pvh_gen != pvh->pv_gen) {
5525					PMAP_UNLOCK(pmap);
5526					goto restart;
5527				}
5528			}
5529			pte = pmap_pde(pmap, pv->pv_va);
5530			mask = 0;
5531			if (modified) {
5532				PG_M = pmap_modified_bit(pmap);
5533				PG_RW = pmap_rw_bit(pmap);
5534				mask |= PG_RW | PG_M;
5535			}
5536			if (accessed) {
5537				PG_A = pmap_accessed_bit(pmap);
5538				PG_V = pmap_valid_bit(pmap);
5539				mask |= PG_V | PG_A;
5540			}
5541			rv = (*pte & mask) == mask;
5542			PMAP_UNLOCK(pmap);
5543			if (rv)
5544				goto out;
5545		}
5546	}
5547out:
5548	rw_runlock(lock);
5549	rw_runlock(&pvh_global_lock);
5550	return (rv);
5551}
5552
5553/*
5554 *	pmap_is_modified:
5555 *
5556 *	Return whether or not the specified physical page was modified
5557 *	in any physical maps.
5558 */
5559boolean_t
5560pmap_is_modified(vm_page_t m)
5561{
5562
5563	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5564	    ("pmap_is_modified: page %p is not managed", m));
5565
5566	/*
5567	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5568	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
5569	 * is clear, no PTEs can have PG_M set.
5570	 */
5571	VM_OBJECT_ASSERT_WLOCKED(m->object);
5572	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5573		return (FALSE);
5574	return (pmap_page_test_mappings(m, FALSE, TRUE));
5575}
5576
5577/*
5578 *	pmap_is_prefaultable:
5579 *
5580 *	Return whether or not the specified virtual address is eligible
5581 *	for prefault.
5582 */
5583boolean_t
5584pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
5585{
5586	pd_entry_t *pde;
5587	pt_entry_t *pte, PG_V;
5588	boolean_t rv;
5589
5590	PG_V = pmap_valid_bit(pmap);
5591	rv = FALSE;
5592	PMAP_LOCK(pmap);
5593	pde = pmap_pde(pmap, addr);
5594	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
5595		pte = pmap_pde_to_pte(pde, addr);
5596		rv = (*pte & PG_V) == 0;
5597	}
5598	PMAP_UNLOCK(pmap);
5599	return (rv);
5600}
5601
5602/*
5603 *	pmap_is_referenced:
5604 *
5605 *	Return whether or not the specified physical page was referenced
5606 *	in any physical maps.
5607 */
5608boolean_t
5609pmap_is_referenced(vm_page_t m)
5610{
5611
5612	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5613	    ("pmap_is_referenced: page %p is not managed", m));
5614	return (pmap_page_test_mappings(m, TRUE, FALSE));
5615}
5616
5617/*
5618 * Clear the write and modified bits in each of the given page's mappings.
5619 */
5620void
5621pmap_remove_write(vm_page_t m)
5622{
5623	struct md_page *pvh;
5624	pmap_t pmap;
5625	struct rwlock *lock;
5626	pv_entry_t next_pv, pv;
5627	pd_entry_t *pde;
5628	pt_entry_t oldpte, *pte, PG_M, PG_RW;
5629	vm_offset_t va;
5630	int pvh_gen, md_gen;
5631
5632	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5633	    ("pmap_remove_write: page %p is not managed", m));
5634
5635	/*
5636	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5637	 * set by another thread while the object is locked.  Thus,
5638	 * if PGA_WRITEABLE is clear, no page table entries need updating.
5639	 */
5640	VM_OBJECT_ASSERT_WLOCKED(m->object);
5641	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5642		return;
5643	rw_rlock(&pvh_global_lock);
5644	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5645	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5646retry_pv_loop:
5647	rw_wlock(lock);
5648	if ((m->flags & PG_FICTITIOUS) != 0)
5649		goto small_mappings;
5650	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5651		pmap = PV_PMAP(pv);
5652		if (!PMAP_TRYLOCK(pmap)) {
5653			pvh_gen = pvh->pv_gen;
5654			rw_wunlock(lock);
5655			PMAP_LOCK(pmap);
5656			rw_wlock(lock);
5657			if (pvh_gen != pvh->pv_gen) {
5658				PMAP_UNLOCK(pmap);
5659				rw_wunlock(lock);
5660				goto retry_pv_loop;
5661			}
5662		}
5663		PG_RW = pmap_rw_bit(pmap);
5664		va = pv->pv_va;
5665		pde = pmap_pde(pmap, va);
5666		if ((*pde & PG_RW) != 0)
5667			(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
5668		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5669		    ("inconsistent pv lock %p %p for page %p",
5670		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5671		PMAP_UNLOCK(pmap);
5672	}
5673small_mappings:
5674	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5675		pmap = PV_PMAP(pv);
5676		if (!PMAP_TRYLOCK(pmap)) {
5677			pvh_gen = pvh->pv_gen;
5678			md_gen = m->md.pv_gen;
5679			rw_wunlock(lock);
5680			PMAP_LOCK(pmap);
5681			rw_wlock(lock);
5682			if (pvh_gen != pvh->pv_gen ||
5683			    md_gen != m->md.pv_gen) {
5684				PMAP_UNLOCK(pmap);
5685				rw_wunlock(lock);
5686				goto retry_pv_loop;
5687			}
5688		}
5689		PG_M = pmap_modified_bit(pmap);
5690		PG_RW = pmap_rw_bit(pmap);
5691		pde = pmap_pde(pmap, pv->pv_va);
5692		KASSERT((*pde & PG_PS) == 0,
5693		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
5694		    m));
5695		pte = pmap_pde_to_pte(pde, pv->pv_va);
5696retry:
5697		oldpte = *pte;
5698		if (oldpte & PG_RW) {
5699			if (!atomic_cmpset_long(pte, oldpte, oldpte &
5700			    ~(PG_RW | PG_M)))
5701				goto retry;
5702			if ((oldpte & PG_M) != 0)
5703				vm_page_dirty(m);
5704			pmap_invalidate_page(pmap, pv->pv_va);
5705		}
5706		PMAP_UNLOCK(pmap);
5707	}
5708	rw_wunlock(lock);
5709	vm_page_aflag_clear(m, PGA_WRITEABLE);
5710	rw_runlock(&pvh_global_lock);
5711}
5712
5713static __inline boolean_t
5714safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
5715{
5716
5717	if (!pmap_emulate_ad_bits(pmap))
5718		return (TRUE);
5719
5720	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
5721
5722	/*
5723	 * RWX = 010 or 110 will cause an unconditional EPT misconfiguration
5724	 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared
5725	 * if the EPT_PG_WRITE bit is set.
5726	 */
5727	if ((pte & EPT_PG_WRITE) != 0)
5728		return (FALSE);
5729
5730	/*
5731	 * RWX = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
5732	 */
5733	if ((pte & EPT_PG_EXECUTE) == 0 ||
5734	    ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
5735		return (TRUE);
5736	else
5737		return (FALSE);
5738}
5739
5740#define	PMAP_TS_REFERENCED_MAX	5
5741
5742/*
5743 *	pmap_ts_referenced:
5744 *
5745 *	Return a count of reference bits for a page, clearing those bits.
5746 *	It is not necessary for every reference bit to be cleared, but it
5747 *	is necessary that 0 only be returned when there are truly no
5748 *	reference bits set.
5749 *
5750 *	XXX: The exact number of bits to check and clear is a matter that
5751 *	should be tested and standardized at some point in the future for
5752 *	optimal aging of shared pages.
5753 */
5754int
5755pmap_ts_referenced(vm_page_t m)
5756{
5757	struct md_page *pvh;
5758	pv_entry_t pv, pvf;
5759	pmap_t pmap;
5760	struct rwlock *lock;
5761	pd_entry_t oldpde, *pde;
5762	pt_entry_t *pte, PG_A;
5763	vm_offset_t va;
5764	vm_paddr_t pa;
5765	int cleared, md_gen, not_cleared, pvh_gen;
5766	struct spglist free;
5767	boolean_t demoted;
5768
5769	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5770	    ("pmap_ts_referenced: page %p is not managed", m));
5771	SLIST_INIT(&free);
5772	cleared = 0;
5773	pa = VM_PAGE_TO_PHYS(m);
5774	lock = PHYS_TO_PV_LIST_LOCK(pa);
5775	pvh = pa_to_pvh(pa);
5776	rw_rlock(&pvh_global_lock);
5777	rw_wlock(lock);
5778retry:
5779	not_cleared = 0;
5780	if ((m->flags & PG_FICTITIOUS) != 0 ||
5781	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
5782		goto small_mappings;
5783	pv = pvf;
5784	do {
5785		if (pvf == NULL)
5786			pvf = pv;
5787		pmap = PV_PMAP(pv);
5788		if (!PMAP_TRYLOCK(pmap)) {
5789			pvh_gen = pvh->pv_gen;
5790			rw_wunlock(lock);
5791			PMAP_LOCK(pmap);
5792			rw_wlock(lock);
5793			if (pvh_gen != pvh->pv_gen) {
5794				PMAP_UNLOCK(pmap);
5795				goto retry;
5796			}
5797		}
5798		PG_A = pmap_accessed_bit(pmap);
5799		va = pv->pv_va;
5800		pde = pmap_pde(pmap, pv->pv_va);
5801		oldpde = *pde;
5802		if ((*pde & PG_A) != 0) {
5803			/*
5804			 * Since this reference bit is shared by 512 4KB
5805			 * pages, it should not be cleared every time it is
5806			 * tested.  Apply a simple "hash" function on the
5807			 * physical page number, the virtual superpage number,
5808			 * and the pmap address to select one 4KB page out of
5809			 * the 512 on which testing the reference bit will
5810			 * result in clearing that reference bit.  This
5811			 * function is designed to avoid the selection of the
5812			 * same 4KB page for every 2MB page mapping.
5813			 *
5814			 * On demotion, a mapping that hasn't been referenced
5815			 * is simply destroyed.  To avoid the possibility of a
5816			 * subsequent page fault on a demoted wired mapping,
5817			 * always leave its reference bit set.  Moreover,
5818			 * since the superpage is wired, the current state of
5819			 * its reference bit won't affect page replacement.
5820			 */
5821			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
5822			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
5823			    (*pde & PG_W) == 0) {
5824				if (safe_to_clear_referenced(pmap, oldpde)) {
5825					atomic_clear_long(pde, PG_A);
5826					pmap_invalidate_page(pmap, pv->pv_va);
5827					demoted = FALSE;
5828				} else if (pmap_demote_pde_locked(pmap, pde,
5829				    pv->pv_va, &lock)) {
5830					/*
5831					 * Remove the mapping to a single page
5832					 * so that a subsequent access may
5833					 * repromote.  Since the underlying
5834					 * page table page is fully populated,
5835					 * this removal never frees a page
5836					 * table page.
5837					 */
5838					demoted = TRUE;
5839					va += VM_PAGE_TO_PHYS(m) - (oldpde &
5840					    PG_PS_FRAME);
5841					pte = pmap_pde_to_pte(pde, va);
5842					pmap_remove_pte(pmap, pte, va, *pde,
5843					    NULL, &lock);
5844					pmap_invalidate_page(pmap, va);
5845				} else
5846					demoted = TRUE;
5847
5848				if (demoted) {
5849					/*
5850					 * The superpage mapping was removed
5851					 * entirely and therefore 'pv' is no
5852					 * longer valid.
5853					 */
5854					if (pvf == pv)
5855						pvf = NULL;
5856					pv = NULL;
5857				}
5858				cleared++;
5859				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5860				    ("inconsistent pv lock %p %p for page %p",
5861				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5862			} else
5863				not_cleared++;
5864		}
5865		PMAP_UNLOCK(pmap);
5866		/* Rotate the PV list if it has more than one entry. */
5867		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
5868			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5869			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
5870			pvh->pv_gen++;
5871		}
5872		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
5873			goto out;
5874	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
5875small_mappings:
5876	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
5877		goto out;
5878	pv = pvf;
5879	do {
5880		if (pvf == NULL)
5881			pvf = pv;
5882		pmap = PV_PMAP(pv);
5883		if (!PMAP_TRYLOCK(pmap)) {
5884			pvh_gen = pvh->pv_gen;
5885			md_gen = m->md.pv_gen;
5886			rw_wunlock(lock);
5887			PMAP_LOCK(pmap);
5888			rw_wlock(lock);
5889			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5890				PMAP_UNLOCK(pmap);
5891				goto retry;
5892			}
5893		}
5894		PG_A = pmap_accessed_bit(pmap);
5895		pde = pmap_pde(pmap, pv->pv_va);
5896		KASSERT((*pde & PG_PS) == 0,
5897		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
5898		    m));
5899		pte = pmap_pde_to_pte(pde, pv->pv_va);
5900		if ((*pte & PG_A) != 0) {
5901			if (safe_to_clear_referenced(pmap, *pte)) {
5902				atomic_clear_long(pte, PG_A);
5903				pmap_invalidate_page(pmap, pv->pv_va);
5904				cleared++;
5905			} else if ((*pte & PG_W) == 0) {
5906				/*
5907				 * Wired pages cannot be paged out so
5908				 * doing accessed bit emulation for
5909				 * them is wasted effort. We do the
5910				 * hard work for unwired pages only.
5911				 */
5912				pmap_remove_pte(pmap, pte, pv->pv_va,
5913				    *pde, &free, &lock);
5914				pmap_invalidate_page(pmap, pv->pv_va);
5915				cleared++;
5916				if (pvf == pv)
5917					pvf = NULL;
5918				pv = NULL;
5919				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5920				    ("inconsistent pv lock %p %p for page %p",
5921				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5922			} else
5923				not_cleared++;
5924		}
5925		PMAP_UNLOCK(pmap);
5926		/* Rotate the PV list if it has more than one entry. */
5927		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
5928			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5929			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5930			m->md.pv_gen++;
5931		}
5932	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
5933	    not_cleared < PMAP_TS_REFERENCED_MAX);
5934out:
5935	rw_wunlock(lock);
5936	rw_runlock(&pvh_global_lock);
5937	pmap_free_zero_pages(&free);
5938	return (cleared + not_cleared);
5939}
5940
5941/*
5942 *	Apply the given advice to the specified range of addresses within the
5943 *	given pmap.  Depending on the advice, clear the referenced and/or
5944 *	modified flags in each mapping and set the mapped page's dirty field.
5945 */
5946void
5947pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
5948{
5949	struct rwlock *lock;
5950	pml4_entry_t *pml4e;
5951	pdp_entry_t *pdpe;
5952	pd_entry_t oldpde, *pde;
5953	pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
5954	vm_offset_t va_next;
5955	vm_page_t m;
5956	boolean_t anychanged, pv_lists_locked;
5957
5958	if (advice != MADV_DONTNEED && advice != MADV_FREE)
5959		return;
5960
5961	/*
5962	 * A/D bit emulation requires an alternate code path when clearing
5963	 * the modified and accessed bits below. Since this function is
5964	 * advisory in nature we skip it entirely for pmaps that require
5965	 * A/D bit emulation.
5966	 */
5967	if (pmap_emulate_ad_bits(pmap))
5968		return;
5969
5970	PG_A = pmap_accessed_bit(pmap);
5971	PG_G = pmap_global_bit(pmap);
5972	PG_M = pmap_modified_bit(pmap);
5973	PG_V = pmap_valid_bit(pmap);
5974	PG_RW = pmap_rw_bit(pmap);
5975
5976	pv_lists_locked = FALSE;
5977resume:
5978	anychanged = FALSE;
5979	PMAP_LOCK(pmap);
5980	for (; sva < eva; sva = va_next) {
5981		pml4e = pmap_pml4e(pmap, sva);
5982		if ((*pml4e & PG_V) == 0) {
5983			va_next = (sva + NBPML4) & ~PML4MASK;
5984			if (va_next < sva)
5985				va_next = eva;
5986			continue;
5987		}
5988		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
5989		if ((*pdpe & PG_V) == 0) {
5990			va_next = (sva + NBPDP) & ~PDPMASK;
5991			if (va_next < sva)
5992				va_next = eva;
5993			continue;
5994		}
5995		va_next = (sva + NBPDR) & ~PDRMASK;
5996		if (va_next < sva)
5997			va_next = eva;
5998		pde = pmap_pdpe_to_pde(pdpe, sva);
5999		oldpde = *pde;
6000		if ((oldpde & PG_V) == 0)
6001			continue;
6002		else if ((oldpde & PG_PS) != 0) {
6003			if ((oldpde & PG_MANAGED) == 0)
6004				continue;
6005			if (!pv_lists_locked) {
6006				pv_lists_locked = TRUE;
6007				if (!rw_try_rlock(&pvh_global_lock)) {
6008					if (anychanged)
6009						pmap_invalidate_all(pmap);
6010					PMAP_UNLOCK(pmap);
6011					rw_rlock(&pvh_global_lock);
6012					goto resume;
6013				}
6014			}
6015			lock = NULL;
6016			if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
6017				if (lock != NULL)
6018					rw_wunlock(lock);
6019
6020				/*
6021				 * The large page mapping was destroyed.
6022				 */
6023				continue;
6024			}
6025
6026			/*
6027			 * Unless the page mappings are wired, remove the
6028			 * mapping to a single page so that a subsequent
6029			 * access may repromote.  Since the underlying page
6030			 * table page is fully populated, this removal never
6031			 * frees a page table page.
6032			 */
6033			if ((oldpde & PG_W) == 0) {
6034				pte = pmap_pde_to_pte(pde, sva);
6035				KASSERT((*pte & PG_V) != 0,
6036				    ("pmap_advise: invalid PTE"));
6037				pmap_remove_pte(pmap, pte, sva, *pde, NULL,
6038				    &lock);
6039				anychanged = TRUE;
6040			}
6041			if (lock != NULL)
6042				rw_wunlock(lock);
6043		}
6044		if (va_next > eva)
6045			va_next = eva;
6046		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
6047		    sva += PAGE_SIZE) {
6048			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
6049			    PG_V))
6050				continue;
6051			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6052				if (advice == MADV_DONTNEED) {
6053					/*
6054					 * Future calls to pmap_is_modified()
6055					 * can be avoided by making the page
6056					 * dirty now.
6057					 */
6058					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
6059					vm_page_dirty(m);
6060				}
6061				atomic_clear_long(pte, PG_M | PG_A);
6062			} else if ((*pte & PG_A) != 0)
6063				atomic_clear_long(pte, PG_A);
6064			else
6065				continue;
6066			if ((*pte & PG_G) != 0)
6067				pmap_invalidate_page(pmap, sva);
6068			else
6069				anychanged = TRUE;
6070		}
6071	}
6072	if (anychanged)
6073		pmap_invalidate_all(pmap);
6074	if (pv_lists_locked)
6075		rw_runlock(&pvh_global_lock);
6076	PMAP_UNLOCK(pmap);
6077}
6078
6079/*
6080 *	Clear the modify bits on the specified physical page.
6081 */
6082void
6083pmap_clear_modify(vm_page_t m)
6084{
6085	struct md_page *pvh;
6086	pmap_t pmap;
6087	pv_entry_t next_pv, pv;
6088	pd_entry_t oldpde, *pde;
6089	pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
6090	struct rwlock *lock;
6091	vm_offset_t va;
6092	int md_gen, pvh_gen;
6093
6094	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6095	    ("pmap_clear_modify: page %p is not managed", m));
6096	VM_OBJECT_ASSERT_WLOCKED(m->object);
6097	KASSERT(!vm_page_xbusied(m),
6098	    ("pmap_clear_modify: page %p is exclusive busied", m));
6099
6100	/*
6101	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
6102	 * If the object containing the page is locked and the page is not
6103	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
6104	 */
6105	if ((m->aflags & PGA_WRITEABLE) == 0)
6106		return;
6107	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6108	rw_rlock(&pvh_global_lock);
6109	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6110	rw_wlock(lock);
6111restart:
6112	if ((m->flags & PG_FICTITIOUS) != 0)
6113		goto small_mappings;
6114	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
6115		pmap = PV_PMAP(pv);
6116		if (!PMAP_TRYLOCK(pmap)) {
6117			pvh_gen = pvh->pv_gen;
6118			rw_wunlock(lock);
6119			PMAP_LOCK(pmap);
6120			rw_wlock(lock);
6121			if (pvh_gen != pvh->pv_gen) {
6122				PMAP_UNLOCK(pmap);
6123				goto restart;
6124			}
6125		}
6126		PG_M = pmap_modified_bit(pmap);
6127		PG_V = pmap_valid_bit(pmap);
6128		PG_RW = pmap_rw_bit(pmap);
6129		va = pv->pv_va;
6130		pde = pmap_pde(pmap, va);
6131		oldpde = *pde;
6132		if ((oldpde & PG_RW) != 0) {
6133			if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
6134				if ((oldpde & PG_W) == 0) {
6135					/*
6136					 * Write protect the mapping to a
6137					 * single page so that a subsequent
6138					 * write access may repromote.
6139					 */
6140					va += VM_PAGE_TO_PHYS(m) - (oldpde &
6141					    PG_PS_FRAME);
6142					pte = pmap_pde_to_pte(pde, va);
6143					oldpte = *pte;
6144					if ((oldpte & PG_V) != 0) {
6145						while (!atomic_cmpset_long(pte,
6146						    oldpte,
6147						    oldpte & ~(PG_M | PG_RW)))
6148							oldpte = *pte;
6149						vm_page_dirty(m);
6150						pmap_invalidate_page(pmap, va);
6151					}
6152				}
6153			}
6154		}
6155		PMAP_UNLOCK(pmap);
6156	}
6157small_mappings:
6158	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6159		pmap = PV_PMAP(pv);
6160		if (!PMAP_TRYLOCK(pmap)) {
6161			md_gen = m->md.pv_gen;
6162			pvh_gen = pvh->pv_gen;
6163			rw_wunlock(lock);
6164			PMAP_LOCK(pmap);
6165			rw_wlock(lock);
6166			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6167				PMAP_UNLOCK(pmap);
6168				goto restart;
6169			}
6170		}
6171		PG_M = pmap_modified_bit(pmap);
6172		PG_RW = pmap_rw_bit(pmap);
6173		pde = pmap_pde(pmap, pv->pv_va);
6174		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
6175		    " a 2mpage in page %p's pv list", m));
6176		pte = pmap_pde_to_pte(pde, pv->pv_va);
6177		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6178			atomic_clear_long(pte, PG_M);
6179			pmap_invalidate_page(pmap, pv->pv_va);
6180		}
6181		PMAP_UNLOCK(pmap);
6182	}
6183	rw_wunlock(lock);
6184	rw_runlock(&pvh_global_lock);
6185}
6186
6187/*
6188 * Miscellaneous support routines follow
6189 */
6190
6191/* Adjust the cache mode for a 4KB page mapped via a PTE. */
6192static __inline void
6193pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
6194{
6195	u_int opte, npte;
6196
6197	/*
6198	 * The cache mode bits are all in the low 32-bits of the
6199	 * PTE, so we can just spin on updating the low 32-bits.
6200	 */
6201	do {
6202		opte = *(u_int *)pte;
6203		npte = opte & ~mask;
6204		npte |= cache_bits;
6205	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
6206}
6207
6208/* Adjust the cache mode for a 2MB page mapped via a PDE. */
6209static __inline void
6210pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
6211{
6212	u_int opde, npde;
6213
6214	/*
6215	 * The cache mode bits are all in the low 32-bits of the
6216	 * PDE, so we can just spin on updating the low 32-bits.
6217	 */
6218	do {
6219		opde = *(u_int *)pde;
6220		npde = opde & ~mask;
6221		npde |= cache_bits;
6222	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
6223}
6224
6225/*
6226 * Map a set of physical memory pages into the kernel virtual
6227 * address space. Return a pointer to where it is mapped. This
6228 * routine is intended to be used for mapping device memory,
6229 * NOT real memory.
6230 */
6231void *
6232pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
6233{
6234	vm_offset_t va, offset;
6235	vm_size_t tmpsize;
6236
6237	/*
6238	 * If the specified range of physical addresses fits within the direct
6239	 * map window, use the direct map.
6240	 */
6241	if (pa < dmaplimit && pa + size < dmaplimit) {
6242		va = PHYS_TO_DMAP(pa);
6243		if (!pmap_change_attr(va, size, mode))
6244			return ((void *)va);
6245	}
6246	offset = pa & PAGE_MASK;
6247	size = round_page(offset + size);
6248	va = kva_alloc(size);
6249	if (!va)
6250		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
6251	pa = trunc_page(pa);
6252	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
6253		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
6254	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
6255	pmap_invalidate_cache_range(va, va + tmpsize, FALSE);
6256	return ((void *)(va + offset));
6257}
6258
6259void *
6260pmap_mapdev(vm_paddr_t pa, vm_size_t size)
6261{
6262
6263	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
6264}
6265
6266void *
6267pmap_mapbios(vm_paddr_t pa, vm_size_t size)
6268{
6269
6270	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
6271}
6272
6273void
6274pmap_unmapdev(vm_offset_t va, vm_size_t size)
6275{
6276	vm_offset_t base, offset;
6277
6278	/* If we gave a direct map region in pmap_mapdev, do nothing */
6279	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
6280		return;
6281	base = trunc_page(va);
6282	offset = va & PAGE_MASK;
6283	size = round_page(offset + size);
6284	kva_free(base, size);
6285}
6286
6287/*
6288 * Tries to demote a 1GB page mapping.
6289 */
6290static boolean_t
6291pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
6292{
6293	pdp_entry_t newpdpe, oldpdpe;
6294	pd_entry_t *firstpde, newpde, *pde;
6295	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
6296	vm_paddr_t mpdepa;
6297	vm_page_t mpde;
6298
6299	PG_A = pmap_accessed_bit(pmap);
6300	PG_M = pmap_modified_bit(pmap);
6301	PG_V = pmap_valid_bit(pmap);
6302	PG_RW = pmap_rw_bit(pmap);
6303
6304	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6305	oldpdpe = *pdpe;
6306	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
6307	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
6308	if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
6309	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
6310		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
6311		    " in pmap %p", va, pmap);
6312		return (FALSE);
6313	}
6314	mpdepa = VM_PAGE_TO_PHYS(mpde);
6315	firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
6316	newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
6317	KASSERT((oldpdpe & PG_A) != 0,
6318	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
6319	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
6320	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
6321	newpde = oldpdpe;
6322
6323	/*
6324	 * Initialize the page directory page.
6325	 */
6326	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
6327		*pde = newpde;
6328		newpde += NBPDR;
6329	}
6330
6331	/*
6332	 * Demote the mapping.
6333	 */
6334	*pdpe = newpdpe;
6335
6336	/*
6337	 * Invalidate a stale recursive mapping of the page directory page.
6338	 */
6339	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
6340
6341	pmap_pdpe_demotions++;
6342	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
6343	    " in pmap %p", va, pmap);
6344	return (TRUE);
6345}
6346
6347/*
6348 * Sets the memory attribute for the specified page.
6349 */
6350void
6351pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
6352{
6353
6354	m->md.pat_mode = ma;
6355
6356	/*
6357	 * If "m" is a normal page, update its direct mapping.  This update
6358	 * can be relied upon to perform any cache operations that are
6359	 * required for data coherence.
6360	 */
6361	if ((m->flags & PG_FICTITIOUS) == 0 &&
6362	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
6363	    m->md.pat_mode))
6364		panic("memory attribute change on the direct map failed");
6365}
6366
6367/*
6368 * Changes the specified virtual address range's memory type to that given by
6369 * the parameter "mode".  The specified virtual address range must be
6370 * completely contained within either the direct map or the kernel map.  If
6371 * the virtual address range is contained within the kernel map, then the
6372 * memory type for each of the corresponding ranges of the direct map is also
6373 * changed.  (The corresponding ranges of the direct map are those ranges that
6374 * map the same physical pages as the specified virtual address range.)  These
6375 * changes to the direct map are necessary because Intel describes the
6376 * behavior of their processors as "undefined" if two or more mappings to the
6377 * same physical page have different memory types.
6378 *
6379 * Returns zero if the change completed successfully, and either EINVAL or
6380 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
6381 * of the virtual address range was not mapped, and ENOMEM is returned if
6382 * there was insufficient memory available to complete the change.  In the
6383 * latter case, the memory type may have been changed on some part of the
6384 * virtual address range or the direct map.
6385 */
6386int
6387pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
6388{
6389	int error;
6390
6391	PMAP_LOCK(kernel_pmap);
6392	error = pmap_change_attr_locked(va, size, mode);
6393	PMAP_UNLOCK(kernel_pmap);
6394	return (error);
6395}
6396
6397static int
6398pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
6399{
6400	vm_offset_t base, offset, tmpva;
6401	vm_paddr_t pa_start, pa_end;
6402	pdp_entry_t *pdpe;
6403	pd_entry_t *pde;
6404	pt_entry_t *pte;
6405	int cache_bits_pte, cache_bits_pde, error;
6406	boolean_t changed;
6407
6408	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6409	base = trunc_page(va);
6410	offset = va & PAGE_MASK;
6411	size = round_page(offset + size);
6412
6413	/*
6414	 * Only supported on kernel virtual addresses, including the direct
6415	 * map but excluding the recursive map.
6416	 */
6417	if (base < DMAP_MIN_ADDRESS)
6418		return (EINVAL);
6419
6420	cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
6421	cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
6422	changed = FALSE;
6423
6424	/*
6425	 * Pages that aren't mapped aren't supported.  Also break down 2MB pages
6426	 * into 4KB pages if required.
6427	 */
6428	for (tmpva = base; tmpva < base + size; ) {
6429		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6430		if (*pdpe == 0)
6431			return (EINVAL);
6432		if (*pdpe & PG_PS) {
6433			/*
6434			 * If the current 1GB page already has the required
6435			 * memory type, then we need not demote this page. Just
6436			 * increment tmpva to the next 1GB page frame.
6437			 */
6438			if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
6439				tmpva = trunc_1gpage(tmpva) + NBPDP;
6440				continue;
6441			}
6442
6443			/*
6444			 * If the current offset aligns with a 1GB page frame
6445			 * and there is at least 1GB left within the range, then
6446			 * we need not break down this page into 2MB pages.
6447			 */
6448			if ((tmpva & PDPMASK) == 0 &&
6449			    tmpva + PDPMASK < base + size) {
6450				tmpva += NBPDP;
6451				continue;
6452			}
6453			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
6454				return (ENOMEM);
6455		}
6456		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6457		if (*pde == 0)
6458			return (EINVAL);
6459		if (*pde & PG_PS) {
6460			/*
6461			 * If the current 2MB page already has the required
6462			 * memory type, then we need not demote this page. Just
6463			 * increment tmpva to the next 2MB page frame.
6464			 */
6465			if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
6466				tmpva = trunc_2mpage(tmpva) + NBPDR;
6467				continue;
6468			}
6469
6470			/*
6471			 * If the current offset aligns with a 2MB page frame
6472			 * and there is at least 2MB left within the range, then
6473			 * we need not break down this page into 4KB pages.
6474			 */
6475			if ((tmpva & PDRMASK) == 0 &&
6476			    tmpva + PDRMASK < base + size) {
6477				tmpva += NBPDR;
6478				continue;
6479			}
6480			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
6481				return (ENOMEM);
6482		}
6483		pte = pmap_pde_to_pte(pde, tmpva);
6484		if (*pte == 0)
6485			return (EINVAL);
6486		tmpva += PAGE_SIZE;
6487	}
6488	error = 0;
6489
6490	/*
6491	 * Ok, all the pages exist, so run through them updating their
6492	 * cache mode if required.
6493	 */
6494	pa_start = pa_end = 0;
6495	for (tmpva = base; tmpva < base + size; ) {
6496		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6497		if (*pdpe & PG_PS) {
6498			if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
6499				pmap_pde_attr(pdpe, cache_bits_pde,
6500				    X86_PG_PDE_CACHE);
6501				changed = TRUE;
6502			}
6503			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6504				if (pa_start == pa_end) {
6505					/* Start physical address run. */
6506					pa_start = *pdpe & PG_PS_FRAME;
6507					pa_end = pa_start + NBPDP;
6508				} else if (pa_end == (*pdpe & PG_PS_FRAME))
6509					pa_end += NBPDP;
6510				else {
6511					/* Run ended, update direct map. */
6512					error = pmap_change_attr_locked(
6513					    PHYS_TO_DMAP(pa_start),
6514					    pa_end - pa_start, mode);
6515					if (error != 0)
6516						break;
6517					/* Start physical address run. */
6518					pa_start = *pdpe & PG_PS_FRAME;
6519					pa_end = pa_start + NBPDP;
6520				}
6521			}
6522			tmpva = trunc_1gpage(tmpva) + NBPDP;
6523			continue;
6524		}
6525		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6526		if (*pde & PG_PS) {
6527			if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
6528				pmap_pde_attr(pde, cache_bits_pde,
6529				    X86_PG_PDE_CACHE);
6530				changed = TRUE;
6531			}
6532			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6533				if (pa_start == pa_end) {
6534					/* Start physical address run. */
6535					pa_start = *pde & PG_PS_FRAME;
6536					pa_end = pa_start + NBPDR;
6537				} else if (pa_end == (*pde & PG_PS_FRAME))
6538					pa_end += NBPDR;
6539				else {
6540					/* Run ended, update direct map. */
6541					error = pmap_change_attr_locked(
6542					    PHYS_TO_DMAP(pa_start),
6543					    pa_end - pa_start, mode);
6544					if (error != 0)
6545						break;
6546					/* Start physical address run. */
6547					pa_start = *pde & PG_PS_FRAME;
6548					pa_end = pa_start + NBPDR;
6549				}
6550			}
6551			tmpva = trunc_2mpage(tmpva) + NBPDR;
6552		} else {
6553			pte = pmap_pde_to_pte(pde, tmpva);
6554			if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
6555				pmap_pte_attr(pte, cache_bits_pte,
6556				    X86_PG_PTE_CACHE);
6557				changed = TRUE;
6558			}
6559			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6560				if (pa_start == pa_end) {
6561					/* Start physical address run. */
6562					pa_start = *pte & PG_FRAME;
6563					pa_end = pa_start + PAGE_SIZE;
6564				} else if (pa_end == (*pte & PG_FRAME))
6565					pa_end += PAGE_SIZE;
6566				else {
6567					/* Run ended, update direct map. */
6568					error = pmap_change_attr_locked(
6569					    PHYS_TO_DMAP(pa_start),
6570					    pa_end - pa_start, mode);
6571					if (error != 0)
6572						break;
6573					/* Start physical address run. */
6574					pa_start = *pte & PG_FRAME;
6575					pa_end = pa_start + PAGE_SIZE;
6576				}
6577			}
6578			tmpva += PAGE_SIZE;
6579		}
6580	}
6581	if (error == 0 && pa_start != pa_end)
6582		error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
6583		    pa_end - pa_start, mode);
6584
6585	/*
6586	 * Flush CPU caches if required to make sure any data isn't cached that
6587	 * shouldn't be, etc.
6588	 */
6589	if (changed) {
6590		pmap_invalidate_range(kernel_pmap, base, tmpva);
6591		pmap_invalidate_cache_range(base, tmpva, FALSE);
6592	}
6593	return (error);
6594}
6595
6596/*
6597 * Demotes any mapping within the direct map region that covers more than the
6598 * specified range of physical addresses.  This range's size must be a power
6599 * of two and its starting address must be a multiple of its size.  Since the
6600 * demotion does not change any attributes of the mapping, a TLB invalidation
6601 * is not mandatory.  The caller may, however, request a TLB invalidation.
6602 */
6603void
6604pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
6605{
6606	pdp_entry_t *pdpe;
6607	pd_entry_t *pde;
6608	vm_offset_t va;
6609	boolean_t changed;
6610
6611	if (len == 0)
6612		return;
6613	KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
6614	KASSERT((base & (len - 1)) == 0,
6615	    ("pmap_demote_DMAP: base is not a multiple of len"));
6616	if (len < NBPDP && base < dmaplimit) {
6617		va = PHYS_TO_DMAP(base);
6618		changed = FALSE;
6619		PMAP_LOCK(kernel_pmap);
6620		pdpe = pmap_pdpe(kernel_pmap, va);
6621		if ((*pdpe & X86_PG_V) == 0)
6622			panic("pmap_demote_DMAP: invalid PDPE");
6623		if ((*pdpe & PG_PS) != 0) {
6624			if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
6625				panic("pmap_demote_DMAP: PDPE failed");
6626			changed = TRUE;
6627		}
6628		if (len < NBPDR) {
6629			pde = pmap_pdpe_to_pde(pdpe, va);
6630			if ((*pde & X86_PG_V) == 0)
6631				panic("pmap_demote_DMAP: invalid PDE");
6632			if ((*pde & PG_PS) != 0) {
6633				if (!pmap_demote_pde(kernel_pmap, pde, va))
6634					panic("pmap_demote_DMAP: PDE failed");
6635				changed = TRUE;
6636			}
6637		}
6638		if (changed && invalidate)
6639			pmap_invalidate_page(kernel_pmap, va);
6640		PMAP_UNLOCK(kernel_pmap);
6641	}
6642}
6643
6644/*
6645 * perform the pmap work for mincore
6646 */
6647int
6648pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
6649{
6650	pd_entry_t *pdep;
6651	pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
6652	vm_paddr_t pa;
6653	int val;
6654
6655	PG_A = pmap_accessed_bit(pmap);
6656	PG_M = pmap_modified_bit(pmap);
6657	PG_V = pmap_valid_bit(pmap);
6658	PG_RW = pmap_rw_bit(pmap);
6659
6660	PMAP_LOCK(pmap);
6661retry:
6662	pdep = pmap_pde(pmap, addr);
6663	if (pdep != NULL && (*pdep & PG_V)) {
6664		if (*pdep & PG_PS) {
6665			pte = *pdep;
6666			/* Compute the physical address of the 4KB page. */
6667			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
6668			    PG_FRAME;
6669			val = MINCORE_SUPER;
6670		} else {
6671			pte = *pmap_pde_to_pte(pdep, addr);
6672			pa = pte & PG_FRAME;
6673			val = 0;
6674		}
6675	} else {
6676		pte = 0;
6677		pa = 0;
6678		val = 0;
6679	}
6680	if ((pte & PG_V) != 0) {
6681		val |= MINCORE_INCORE;
6682		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6683			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
6684		if ((pte & PG_A) != 0)
6685			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
6686	}
6687	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
6688	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
6689	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
6690		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
6691		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
6692			goto retry;
6693	} else
6694		PA_UNLOCK_COND(*locked_pa);
6695	PMAP_UNLOCK(pmap);
6696	return (val);
6697}
6698
6699void
6700pmap_activate(struct thread *td)
6701{
6702	pmap_t	pmap, oldpmap;
6703	u_int	cpuid;
6704
6705	critical_enter();
6706	pmap = vmspace_pmap(td->td_proc->p_vmspace);
6707	oldpmap = PCPU_GET(curpmap);
6708	cpuid = PCPU_GET(cpuid);
6709#ifdef SMP
6710	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
6711	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
6712	CPU_SET_ATOMIC(cpuid, &pmap->pm_save);
6713#else
6714	CPU_CLR(cpuid, &oldpmap->pm_active);
6715	CPU_SET(cpuid, &pmap->pm_active);
6716	CPU_SET(cpuid, &pmap->pm_save);
6717#endif
6718	td->td_pcb->pcb_cr3 = pmap->pm_cr3;
6719	load_cr3(pmap->pm_cr3);
6720	PCPU_SET(curpmap, pmap);
6721	critical_exit();
6722}
6723
6724void
6725pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
6726{
6727}
6728
6729/*
6730 *	Increase the starting virtual address of the given mapping if a
6731 *	different alignment might result in more superpage mappings.
6732 */
6733void
6734pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
6735    vm_offset_t *addr, vm_size_t size)
6736{
6737	vm_offset_t superpage_offset;
6738
6739	if (size < NBPDR)
6740		return;
6741	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
6742		offset += ptoa(object->pg_color);
6743	superpage_offset = offset & PDRMASK;
6744	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
6745	    (*addr & PDRMASK) == superpage_offset)
6746		return;
6747	if ((*addr & PDRMASK) < superpage_offset)
6748		*addr = (*addr & ~PDRMASK) + superpage_offset;
6749	else
6750		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
6751}
6752
6753#ifdef INVARIANTS
6754static unsigned long num_dirty_emulations;
6755SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
6756	     &num_dirty_emulations, 0, NULL);
6757
6758static unsigned long num_accessed_emulations;
6759SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
6760	     &num_accessed_emulations, 0, NULL);
6761
6762static unsigned long num_superpage_accessed_emulations;
6763SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
6764	     &num_superpage_accessed_emulations, 0, NULL);
6765
6766static unsigned long ad_emulation_superpage_promotions;
6767SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
6768	     &ad_emulation_superpage_promotions, 0, NULL);
6769#endif	/* INVARIANTS */
6770
6771int
6772pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
6773{
6774	int rv;
6775	struct rwlock *lock;
6776	vm_page_t m, mpte;
6777	pd_entry_t *pde;
6778	pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
6779	boolean_t pv_lists_locked;
6780
6781	KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
6782	    ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
6783
6784	if (!pmap_emulate_ad_bits(pmap))
6785		return (-1);
6786
6787	PG_A = pmap_accessed_bit(pmap);
6788	PG_M = pmap_modified_bit(pmap);
6789	PG_V = pmap_valid_bit(pmap);
6790	PG_RW = pmap_rw_bit(pmap);
6791
6792	rv = -1;
6793	lock = NULL;
6794	pv_lists_locked = FALSE;
6795retry:
6796	PMAP_LOCK(pmap);
6797
6798	pde = pmap_pde(pmap, va);
6799	if (pde == NULL || (*pde & PG_V) == 0)
6800		goto done;
6801
6802	if ((*pde & PG_PS) != 0) {
6803		if (ftype == VM_PROT_READ) {
6804#ifdef INVARIANTS
6805			atomic_add_long(&num_superpage_accessed_emulations, 1);
6806#endif
6807			*pde |= PG_A;
6808			rv = 0;
6809		}
6810		goto done;
6811	}
6812
6813	pte = pmap_pde_to_pte(pde, va);
6814	if ((*pte & PG_V) == 0)
6815		goto done;
6816
6817	if (ftype == VM_PROT_WRITE) {
6818		if ((*pte & PG_RW) == 0)
6819			goto done;
6820		/*
6821		 * Set the modified and accessed bits simultaneously.
6822		 *
6823		 * Intel EPT PTEs that do software emulation of A/D bits map
6824		 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
6825		 * An EPT misconfiguration is triggered if the PTE is writable
6826		 * but not readable (WR=10). This is avoided by setting PG_A
6827		 * and PG_M simultaneously.
6828		 */
6829		*pte |= PG_M | PG_A;
6830	} else {
6831		*pte |= PG_A;
6832	}
6833
6834	/* try to promote the mapping */
6835	if (va < VM_MAXUSER_ADDRESS)
6836		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
6837	else
6838		mpte = NULL;
6839
6840	m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
6841
6842	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
6843	    pmap_ps_enabled(pmap) &&
6844	    (m->flags & PG_FICTITIOUS) == 0 &&
6845	    vm_reserv_level_iffullpop(m) == 0) {
6846		if (!pv_lists_locked) {
6847			pv_lists_locked = TRUE;
6848			if (!rw_try_rlock(&pvh_global_lock)) {
6849				PMAP_UNLOCK(pmap);
6850				rw_rlock(&pvh_global_lock);
6851				goto retry;
6852			}
6853		}
6854		pmap_promote_pde(pmap, pde, va, &lock);
6855#ifdef INVARIANTS
6856		atomic_add_long(&ad_emulation_superpage_promotions, 1);
6857#endif
6858	}
6859#ifdef INVARIANTS
6860	if (ftype == VM_PROT_WRITE)
6861		atomic_add_long(&num_dirty_emulations, 1);
6862	else
6863		atomic_add_long(&num_accessed_emulations, 1);
6864#endif
6865	rv = 0;		/* success */
6866done:
6867	if (lock != NULL)
6868		rw_wunlock(lock);
6869	if (pv_lists_locked)
6870		rw_runlock(&pvh_global_lock);
6871	PMAP_UNLOCK(pmap);
6872	return (rv);
6873}
6874
6875void
6876pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
6877{
6878	pml4_entry_t *pml4;
6879	pdp_entry_t *pdp;
6880	pd_entry_t *pde;
6881	pt_entry_t *pte, PG_V;
6882	int idx;
6883
6884	idx = 0;
6885	PG_V = pmap_valid_bit(pmap);
6886	PMAP_LOCK(pmap);
6887
6888	pml4 = pmap_pml4e(pmap, va);
6889	ptr[idx++] = *pml4;
6890	if ((*pml4 & PG_V) == 0)
6891		goto done;
6892
6893	pdp = pmap_pml4e_to_pdpe(pml4, va);
6894	ptr[idx++] = *pdp;
6895	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
6896		goto done;
6897
6898	pde = pmap_pdpe_to_pde(pdp, va);
6899	ptr[idx++] = *pde;
6900	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
6901		goto done;
6902
6903	pte = pmap_pde_to_pte(pde, va);
6904	ptr[idx++] = *pte;
6905
6906done:
6907	PMAP_UNLOCK(pmap);
6908	*num = idx;
6909}
6910
6911#include "opt_ddb.h"
6912#ifdef DDB
6913#include <ddb/ddb.h>
6914
6915DB_SHOW_COMMAND(pte, pmap_print_pte)
6916{
6917	pmap_t pmap;
6918	pml4_entry_t *pml4;
6919	pdp_entry_t *pdp;
6920	pd_entry_t *pde;
6921	pt_entry_t *pte, PG_V;
6922	vm_offset_t va;
6923
6924	if (have_addr) {
6925		va = (vm_offset_t)addr;
6926		pmap = PCPU_GET(curpmap); /* XXX */
6927	} else {
6928		db_printf("show pte addr\n");
6929		return;
6930	}
6931	PG_V = pmap_valid_bit(pmap);
6932	pml4 = pmap_pml4e(pmap, va);
6933	db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
6934	if ((*pml4 & PG_V) == 0) {
6935		db_printf("\n");
6936		return;
6937	}
6938	pdp = pmap_pml4e_to_pdpe(pml4, va);
6939	db_printf(" pdpe %#016lx", *pdp);
6940	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
6941		db_printf("\n");
6942		return;
6943	}
6944	pde = pmap_pdpe_to_pde(pdp, va);
6945	db_printf(" pde %#016lx", *pde);
6946	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
6947		db_printf("\n");
6948		return;
6949	}
6950	pte = pmap_pde_to_pte(pde, va);
6951	db_printf(" pte %#016lx\n", *pte);
6952}
6953
6954DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
6955{
6956	vm_paddr_t a;
6957
6958	if (have_addr) {
6959		a = (vm_paddr_t)addr;
6960		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
6961	} else {
6962		db_printf("show phys2dmap addr\n");
6963	}
6964}
6965#endif
6966