pmap.c revision 276386
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * the Systems Programming Group of the University of Utah Computer
15 * Science Department and William Jolitz of UUNET Technologies Inc.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 *    must display the following acknowledgement:
27 *	This product includes software developed by the University of
28 *	California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 *    may be used to endorse or promote products derived from this software
31 *    without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * SUCH DAMAGE.
44 *
45 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
46 */
47/*-
48 * Copyright (c) 2003 Networks Associates Technology, Inc.
49 * All rights reserved.
50 *
51 * This software was developed for the FreeBSD Project by Jake Burkholder,
52 * Safeport Network Services, and Network Associates Laboratories, the
53 * Security Research Division of Network Associates, Inc. under
54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55 * CHATS research program.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 *    notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 *    notice, this list of conditions and the following disclaimer in the
64 *    documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 */
78
79#define	AMD64_NPT_AWARE
80
81#include <sys/cdefs.h>
82__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/pmap.c 276386 2014-12-30 00:00:42Z neel $");
83
84/*
85 *	Manages physical address maps.
86 *
87 *	Since the information managed by this module is
88 *	also stored by the logical address mapping module,
89 *	this module may throw away valid virtual-to-physical
90 *	mappings at almost any time.  However, invalidations
91 *	of virtual-to-physical mappings must be done as
92 *	requested.
93 *
94 *	In order to cope with hardware architectures which
95 *	make virtual-to-physical map invalidates expensive,
96 *	this module may delay invalidate or reduced protection
97 *	operations until such time as they are actually
98 *	necessary.  This module is given full information as
99 *	to which processors are currently using which maps,
100 *	and to when physical maps must be made correct.
101 */
102
103#include "opt_pmap.h"
104#include "opt_vm.h"
105
106#include <sys/param.h>
107#include <sys/bus.h>
108#include <sys/systm.h>
109#include <sys/kernel.h>
110#include <sys/ktr.h>
111#include <sys/lock.h>
112#include <sys/malloc.h>
113#include <sys/mman.h>
114#include <sys/mutex.h>
115#include <sys/proc.h>
116#include <sys/rwlock.h>
117#include <sys/sx.h>
118#include <sys/vmmeter.h>
119#include <sys/sched.h>
120#include <sys/sysctl.h>
121#include <sys/_unrhdr.h>
122#include <sys/smp.h>
123
124#include <vm/vm.h>
125#include <vm/vm_param.h>
126#include <vm/vm_kern.h>
127#include <vm/vm_page.h>
128#include <vm/vm_map.h>
129#include <vm/vm_object.h>
130#include <vm/vm_extern.h>
131#include <vm/vm_pageout.h>
132#include <vm/vm_pager.h>
133#include <vm/vm_radix.h>
134#include <vm/vm_reserv.h>
135#include <vm/uma.h>
136
137#include <machine/intr_machdep.h>
138#include <machine/apicvar.h>
139#include <machine/cpu.h>
140#include <machine/cputypes.h>
141#include <machine/md_var.h>
142#include <machine/pcb.h>
143#include <machine/specialreg.h>
144#ifdef SMP
145#include <machine/smp.h>
146#endif
147
148static __inline boolean_t
149pmap_type_guest(pmap_t pmap)
150{
151
152	return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
153}
154
155static __inline boolean_t
156pmap_emulate_ad_bits(pmap_t pmap)
157{
158
159	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
160}
161
162static __inline pt_entry_t
163pmap_valid_bit(pmap_t pmap)
164{
165	pt_entry_t mask;
166
167	switch (pmap->pm_type) {
168	case PT_X86:
169	case PT_RVI:
170		mask = X86_PG_V;
171		break;
172	case PT_EPT:
173		if (pmap_emulate_ad_bits(pmap))
174			mask = EPT_PG_EMUL_V;
175		else
176			mask = EPT_PG_READ;
177		break;
178	default:
179		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
180	}
181
182	return (mask);
183}
184
185static __inline pt_entry_t
186pmap_rw_bit(pmap_t pmap)
187{
188	pt_entry_t mask;
189
190	switch (pmap->pm_type) {
191	case PT_X86:
192	case PT_RVI:
193		mask = X86_PG_RW;
194		break;
195	case PT_EPT:
196		if (pmap_emulate_ad_bits(pmap))
197			mask = EPT_PG_EMUL_RW;
198		else
199			mask = EPT_PG_WRITE;
200		break;
201	default:
202		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
203	}
204
205	return (mask);
206}
207
208static __inline pt_entry_t
209pmap_global_bit(pmap_t pmap)
210{
211	pt_entry_t mask;
212
213	switch (pmap->pm_type) {
214	case PT_X86:
215		mask = X86_PG_G;
216		break;
217	case PT_RVI:
218	case PT_EPT:
219		mask = 0;
220		break;
221	default:
222		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
223	}
224
225	return (mask);
226}
227
228static __inline pt_entry_t
229pmap_accessed_bit(pmap_t pmap)
230{
231	pt_entry_t mask;
232
233	switch (pmap->pm_type) {
234	case PT_X86:
235	case PT_RVI:
236		mask = X86_PG_A;
237		break;
238	case PT_EPT:
239		if (pmap_emulate_ad_bits(pmap))
240			mask = EPT_PG_READ;
241		else
242			mask = EPT_PG_A;
243		break;
244	default:
245		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
246	}
247
248	return (mask);
249}
250
251static __inline pt_entry_t
252pmap_modified_bit(pmap_t pmap)
253{
254	pt_entry_t mask;
255
256	switch (pmap->pm_type) {
257	case PT_X86:
258	case PT_RVI:
259		mask = X86_PG_M;
260		break;
261	case PT_EPT:
262		if (pmap_emulate_ad_bits(pmap))
263			mask = EPT_PG_WRITE;
264		else
265			mask = EPT_PG_M;
266		break;
267	default:
268		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
269	}
270
271	return (mask);
272}
273
274#if !defined(DIAGNOSTIC)
275#ifdef __GNUC_GNU_INLINE__
276#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
277#else
278#define PMAP_INLINE	extern inline
279#endif
280#else
281#define PMAP_INLINE
282#endif
283
284#ifdef PV_STATS
285#define PV_STAT(x)	do { x ; } while (0)
286#else
287#define PV_STAT(x)	do { } while (0)
288#endif
289
290#define	pa_index(pa)	((pa) >> PDRSHIFT)
291#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
292
293#define	NPV_LIST_LOCKS	MAXCPU
294
295#define	PHYS_TO_PV_LIST_LOCK(pa)	\
296			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
297
298#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
299	struct rwlock **_lockp = (lockp);		\
300	struct rwlock *_new_lock;			\
301							\
302	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
303	if (_new_lock != *_lockp) {			\
304		if (*_lockp != NULL)			\
305			rw_wunlock(*_lockp);		\
306		*_lockp = _new_lock;			\
307		rw_wlock(*_lockp);			\
308	}						\
309} while (0)
310
311#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
312			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
313
314#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
315	struct rwlock **_lockp = (lockp);		\
316							\
317	if (*_lockp != NULL) {				\
318		rw_wunlock(*_lockp);			\
319		*_lockp = NULL;				\
320	}						\
321} while (0)
322
323#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
324			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
325
326struct pmap kernel_pmap_store;
327
328vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
329vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
330
331int nkpt;
332SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
333    "Number of kernel page table pages allocated on bootup");
334
335static int ndmpdp;
336vm_paddr_t dmaplimit;
337vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
338pt_entry_t pg_nx;
339
340static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
341
342static int pat_works = 1;
343SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
344    "Is page attribute table fully functional?");
345
346static int pg_ps_enabled = 1;
347SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
348    "Are large page mappings enabled?");
349
350#define	PAT_INDEX_SIZE	8
351static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
352
353static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
354static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
355u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
356u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
357
358static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
359static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
360static int		ndmpdpphys;	/* number of DMPDPphys pages */
361
362static struct rwlock_padalign pvh_global_lock;
363
364/*
365 * Data for the pv entry allocation mechanism
366 */
367static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
368static struct mtx pv_chunks_mutex;
369static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
370static struct md_page *pv_table;
371
372/*
373 * All those kernel PT submaps that BSD is so fond of
374 */
375pt_entry_t *CMAP1 = 0;
376caddr_t CADDR1 = 0;
377
378static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
379
380static struct unrhdr pcid_unr;
381static struct mtx pcid_mtx;
382int pmap_pcid_enabled = 0;
383SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
384    0, "Is TLB Context ID enabled ?");
385int invpcid_works = 0;
386SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
387    "Is the invpcid instruction available ?");
388
389static int
390pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
391{
392	int i;
393	uint64_t res;
394
395	res = 0;
396	CPU_FOREACH(i) {
397		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
398	}
399	return (sysctl_handle_64(oidp, &res, 0, req));
400}
401SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
402    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
403    "Count of saved TLB context on switch");
404
405/* pmap_copy_pages() over non-DMAP */
406static struct mtx cpage_lock;
407static vm_offset_t cpage_a;
408static vm_offset_t cpage_b;
409
410/*
411 * Crashdump maps.
412 */
413static caddr_t crashdumpmap;
414
415static void	free_pv_chunk(struct pv_chunk *pc);
416static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
417static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
418static int	popcnt_pc_map_elem(uint64_t elem);
419static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
420static void	reserve_pv_entries(pmap_t pmap, int needed,
421		    struct rwlock **lockp);
422static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
423		    struct rwlock **lockp);
424static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
425		    struct rwlock **lockp);
426static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
427		    struct rwlock **lockp);
428static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
429static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
430		    vm_offset_t va);
431
432static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
433static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
434static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
435    vm_offset_t va, struct rwlock **lockp);
436static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
437    vm_offset_t va);
438static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
439    vm_prot_t prot, struct rwlock **lockp);
440static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
441    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
442static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
443static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
444static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
445static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
446static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
447static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
448    struct rwlock **lockp);
449static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
450    vm_prot_t prot);
451static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
452static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
453    struct spglist *free, struct rwlock **lockp);
454static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
455    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
456static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
457static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
458    struct spglist *free);
459static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
460    vm_page_t m, struct rwlock **lockp);
461static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
462    pd_entry_t newpde);
463static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
464
465static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
466		struct rwlock **lockp);
467static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
468		struct rwlock **lockp);
469static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
470		struct rwlock **lockp);
471
472static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
473    struct spglist *free);
474static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
475static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
476
477/*
478 * Move the kernel virtual free pointer to the next
479 * 2MB.  This is used to help improve performance
480 * by using a large (2MB) page for much of the kernel
481 * (.text, .data, .bss)
482 */
483static vm_offset_t
484pmap_kmem_choose(vm_offset_t addr)
485{
486	vm_offset_t newaddr = addr;
487
488	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
489	return (newaddr);
490}
491
492/********************/
493/* Inline functions */
494/********************/
495
496/* Return a non-clipped PD index for a given VA */
497static __inline vm_pindex_t
498pmap_pde_pindex(vm_offset_t va)
499{
500	return (va >> PDRSHIFT);
501}
502
503
504/* Return various clipped indexes for a given VA */
505static __inline vm_pindex_t
506pmap_pte_index(vm_offset_t va)
507{
508
509	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
510}
511
512static __inline vm_pindex_t
513pmap_pde_index(vm_offset_t va)
514{
515
516	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
517}
518
519static __inline vm_pindex_t
520pmap_pdpe_index(vm_offset_t va)
521{
522
523	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
524}
525
526static __inline vm_pindex_t
527pmap_pml4e_index(vm_offset_t va)
528{
529
530	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
531}
532
533/* Return a pointer to the PML4 slot that corresponds to a VA */
534static __inline pml4_entry_t *
535pmap_pml4e(pmap_t pmap, vm_offset_t va)
536{
537
538	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
539}
540
541/* Return a pointer to the PDP slot that corresponds to a VA */
542static __inline pdp_entry_t *
543pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
544{
545	pdp_entry_t *pdpe;
546
547	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
548	return (&pdpe[pmap_pdpe_index(va)]);
549}
550
551/* Return a pointer to the PDP slot that corresponds to a VA */
552static __inline pdp_entry_t *
553pmap_pdpe(pmap_t pmap, vm_offset_t va)
554{
555	pml4_entry_t *pml4e;
556	pt_entry_t PG_V;
557
558	PG_V = pmap_valid_bit(pmap);
559	pml4e = pmap_pml4e(pmap, va);
560	if ((*pml4e & PG_V) == 0)
561		return (NULL);
562	return (pmap_pml4e_to_pdpe(pml4e, va));
563}
564
565/* Return a pointer to the PD slot that corresponds to a VA */
566static __inline pd_entry_t *
567pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
568{
569	pd_entry_t *pde;
570
571	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
572	return (&pde[pmap_pde_index(va)]);
573}
574
575/* Return a pointer to the PD slot that corresponds to a VA */
576static __inline pd_entry_t *
577pmap_pde(pmap_t pmap, vm_offset_t va)
578{
579	pdp_entry_t *pdpe;
580	pt_entry_t PG_V;
581
582	PG_V = pmap_valid_bit(pmap);
583	pdpe = pmap_pdpe(pmap, va);
584	if (pdpe == NULL || (*pdpe & PG_V) == 0)
585		return (NULL);
586	return (pmap_pdpe_to_pde(pdpe, va));
587}
588
589/* Return a pointer to the PT slot that corresponds to a VA */
590static __inline pt_entry_t *
591pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
592{
593	pt_entry_t *pte;
594
595	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
596	return (&pte[pmap_pte_index(va)]);
597}
598
599/* Return a pointer to the PT slot that corresponds to a VA */
600static __inline pt_entry_t *
601pmap_pte(pmap_t pmap, vm_offset_t va)
602{
603	pd_entry_t *pde;
604	pt_entry_t PG_V;
605
606	PG_V = pmap_valid_bit(pmap);
607	pde = pmap_pde(pmap, va);
608	if (pde == NULL || (*pde & PG_V) == 0)
609		return (NULL);
610	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
611		return ((pt_entry_t *)pde);
612	return (pmap_pde_to_pte(pde, va));
613}
614
615static __inline void
616pmap_resident_count_inc(pmap_t pmap, int count)
617{
618
619	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
620	pmap->pm_stats.resident_count += count;
621}
622
623static __inline void
624pmap_resident_count_dec(pmap_t pmap, int count)
625{
626
627	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
628	KASSERT(pmap->pm_stats.resident_count >= count,
629	    ("pmap %p resident count underflow %ld %d", pmap,
630	    pmap->pm_stats.resident_count, count));
631	pmap->pm_stats.resident_count -= count;
632}
633
634PMAP_INLINE pt_entry_t *
635vtopte(vm_offset_t va)
636{
637	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
638
639	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
640
641	return (PTmap + ((va >> PAGE_SHIFT) & mask));
642}
643
644static __inline pd_entry_t *
645vtopde(vm_offset_t va)
646{
647	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
648
649	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
650
651	return (PDmap + ((va >> PDRSHIFT) & mask));
652}
653
654static u_int64_t
655allocpages(vm_paddr_t *firstaddr, int n)
656{
657	u_int64_t ret;
658
659	ret = *firstaddr;
660	bzero((void *)ret, n * PAGE_SIZE);
661	*firstaddr += n * PAGE_SIZE;
662	return (ret);
663}
664
665CTASSERT(powerof2(NDMPML4E));
666
667/* number of kernel PDP slots */
668#define	NKPDPE(ptpgs)		howmany((ptpgs), NPDEPG)
669
670static void
671nkpt_init(vm_paddr_t addr)
672{
673	int pt_pages;
674
675#ifdef NKPT
676	pt_pages = NKPT;
677#else
678	pt_pages = howmany(addr, 1 << PDRSHIFT);
679	pt_pages += NKPDPE(pt_pages);
680
681	/*
682	 * Add some slop beyond the bare minimum required for bootstrapping
683	 * the kernel.
684	 *
685	 * This is quite important when allocating KVA for kernel modules.
686	 * The modules are required to be linked in the negative 2GB of
687	 * the address space.  If we run out of KVA in this region then
688	 * pmap_growkernel() will need to allocate page table pages to map
689	 * the entire 512GB of KVA space which is an unnecessary tax on
690	 * physical memory.
691	 */
692	pt_pages += 8;		/* 16MB additional slop for kernel modules */
693#endif
694	nkpt = pt_pages;
695}
696
697static void
698create_pagetables(vm_paddr_t *firstaddr)
699{
700	int i, j, ndm1g, nkpdpe;
701	pt_entry_t *pt_p;
702	pd_entry_t *pd_p;
703	pdp_entry_t *pdp_p;
704	pml4_entry_t *p4_p;
705
706	/* Allocate page table pages for the direct map */
707	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
708	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
709		ndmpdp = 4;
710	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
711	if (ndmpdpphys > NDMPML4E) {
712		/*
713		 * Each NDMPML4E allows 512 GB, so limit to that,
714		 * and then readjust ndmpdp and ndmpdpphys.
715		 */
716		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
717		Maxmem = atop(NDMPML4E * NBPML4);
718		ndmpdpphys = NDMPML4E;
719		ndmpdp = NDMPML4E * NPDEPG;
720	}
721	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
722	ndm1g = 0;
723	if ((amd_feature & AMDID_PAGE1GB) != 0)
724		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
725	if (ndm1g < ndmpdp)
726		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
727	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
728
729	/* Allocate pages */
730	KPML4phys = allocpages(firstaddr, 1);
731	KPDPphys = allocpages(firstaddr, NKPML4E);
732
733	/*
734	 * Allocate the initial number of kernel page table pages required to
735	 * bootstrap.  We defer this until after all memory-size dependent
736	 * allocations are done (e.g. direct map), so that we don't have to
737	 * build in too much slop in our estimate.
738	 *
739	 * Note that when NKPML4E > 1, we have an empty page underneath
740	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
741	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
742	 */
743	nkpt_init(*firstaddr);
744	nkpdpe = NKPDPE(nkpt);
745
746	KPTphys = allocpages(firstaddr, nkpt);
747	KPDphys = allocpages(firstaddr, nkpdpe);
748
749	/* Fill in the underlying page table pages */
750	/* Nominally read-only (but really R/W) from zero to physfree */
751	/* XXX not fully used, underneath 2M pages */
752	pt_p = (pt_entry_t *)KPTphys;
753	for (i = 0; ptoa(i) < *firstaddr; i++)
754		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
755
756	/* Now map the page tables at their location within PTmap */
757	pd_p = (pd_entry_t *)KPDphys;
758	for (i = 0; i < nkpt; i++)
759		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
760
761	/* Map from zero to end of allocations under 2M pages */
762	/* This replaces some of the KPTphys entries above */
763	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
764		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
765		    X86_PG_G;
766
767	/* And connect up the PD to the PDP (leaving room for L4 pages) */
768	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
769	for (i = 0; i < nkpdpe; i++)
770		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
771		    PG_U;
772
773	/*
774	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
775	 * the end of physical memory is not aligned to a 1GB page boundary,
776	 * then the residual physical memory is mapped with 2MB pages.  Later,
777	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
778	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
779	 * that are partially used.
780	 */
781	pd_p = (pd_entry_t *)DMPDphys;
782	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
783		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
784		/* Preset PG_M and PG_A because demotion expects it. */
785		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
786		    X86_PG_M | X86_PG_A;
787	}
788	pdp_p = (pdp_entry_t *)DMPDPphys;
789	for (i = 0; i < ndm1g; i++) {
790		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
791		/* Preset PG_M and PG_A because demotion expects it. */
792		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
793		    X86_PG_M | X86_PG_A;
794	}
795	for (j = 0; i < ndmpdp; i++, j++) {
796		pdp_p[i] = DMPDphys + ptoa(j);
797		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
798	}
799
800	/* And recursively map PML4 to itself in order to get PTmap */
801	p4_p = (pml4_entry_t *)KPML4phys;
802	p4_p[PML4PML4I] = KPML4phys;
803	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;
804
805	/* Connect the Direct Map slot(s) up to the PML4. */
806	for (i = 0; i < ndmpdpphys; i++) {
807		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
808		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
809	}
810
811	/* Connect the KVA slots up to the PML4 */
812	for (i = 0; i < NKPML4E; i++) {
813		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
814		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
815	}
816}
817
818/*
819 *	Bootstrap the system enough to run with virtual memory.
820 *
821 *	On amd64 this is called after mapping has already been enabled
822 *	and just syncs the pmap module with what has already been done.
823 *	[We can't call it easily with mapping off since the kernel is not
824 *	mapped with PA == VA, hence we would have to relocate every address
825 *	from the linked base (virtual) address "KERNBASE" to the actual
826 *	(physical) address starting relative to 0]
827 */
828void
829pmap_bootstrap(vm_paddr_t *firstaddr)
830{
831	vm_offset_t va;
832	pt_entry_t *pte;
833
834	/*
835	 * Create an initial set of page tables to run the kernel in.
836	 */
837	create_pagetables(firstaddr);
838
839	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
840	virtual_avail = pmap_kmem_choose(virtual_avail);
841
842	virtual_end = VM_MAX_KERNEL_ADDRESS;
843
844
845	/* XXX do %cr0 as well */
846	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
847	load_cr3(KPML4phys);
848	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
849		load_cr4(rcr4() | CR4_SMEP);
850
851	/*
852	 * Initialize the kernel pmap (which is statically allocated).
853	 */
854	PMAP_LOCK_INIT(kernel_pmap);
855	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
856	kernel_pmap->pm_cr3 = KPML4phys;
857	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
858	CPU_FILL(&kernel_pmap->pm_save);	/* always superset of pm_active */
859	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
860	kernel_pmap->pm_flags = pmap_flags;
861
862 	/*
863	 * Initialize the global pv list lock.
864	 */
865	rw_init(&pvh_global_lock, "pmap pv global");
866
867	/*
868	 * Reserve some special page table entries/VA space for temporary
869	 * mapping of pages.
870	 */
871#define	SYSMAP(c, p, v, n)	\
872	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
873
874	va = virtual_avail;
875	pte = vtopte(va);
876
877	/*
878	 * Crashdump maps.  The first page is reused as CMAP1 for the
879	 * memory test.
880	 */
881	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
882	CADDR1 = crashdumpmap;
883
884	virtual_avail = va;
885
886	/* Initialize the PAT MSR. */
887	pmap_init_pat();
888
889	/* Initialize TLB Context Id. */
890	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
891	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
892		load_cr4(rcr4() | CR4_PCIDE);
893		mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF);
894		init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx);
895		/* Check for INVPCID support */
896		invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
897		    != 0;
898		kernel_pmap->pm_pcid = 0;
899#ifndef SMP
900		pmap_pcid_enabled = 0;
901#endif
902	} else
903		pmap_pcid_enabled = 0;
904}
905
906/*
907 * Setup the PAT MSR.
908 */
909void
910pmap_init_pat(void)
911{
912	int pat_table[PAT_INDEX_SIZE];
913	uint64_t pat_msr;
914	u_long cr0, cr4;
915	int i;
916
917	/* Bail if this CPU doesn't implement PAT. */
918	if ((cpu_feature & CPUID_PAT) == 0)
919		panic("no PAT??");
920
921	/* Set default PAT index table. */
922	for (i = 0; i < PAT_INDEX_SIZE; i++)
923		pat_table[i] = -1;
924	pat_table[PAT_WRITE_BACK] = 0;
925	pat_table[PAT_WRITE_THROUGH] = 1;
926	pat_table[PAT_UNCACHEABLE] = 3;
927	pat_table[PAT_WRITE_COMBINING] = 3;
928	pat_table[PAT_WRITE_PROTECTED] = 3;
929	pat_table[PAT_UNCACHED] = 3;
930
931	/* Initialize default PAT entries. */
932	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
933	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
934	    PAT_VALUE(2, PAT_UNCACHED) |
935	    PAT_VALUE(3, PAT_UNCACHEABLE) |
936	    PAT_VALUE(4, PAT_WRITE_BACK) |
937	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
938	    PAT_VALUE(6, PAT_UNCACHED) |
939	    PAT_VALUE(7, PAT_UNCACHEABLE);
940
941	if (pat_works) {
942		/*
943		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
944		 * Program 5 and 6 as WP and WC.
945		 * Leave 4 and 7 as WB and UC.
946		 */
947		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
948		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
949		    PAT_VALUE(6, PAT_WRITE_COMBINING);
950		pat_table[PAT_UNCACHED] = 2;
951		pat_table[PAT_WRITE_PROTECTED] = 5;
952		pat_table[PAT_WRITE_COMBINING] = 6;
953	} else {
954		/*
955		 * Just replace PAT Index 2 with WC instead of UC-.
956		 */
957		pat_msr &= ~PAT_MASK(2);
958		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
959		pat_table[PAT_WRITE_COMBINING] = 2;
960	}
961
962	/* Disable PGE. */
963	cr4 = rcr4();
964	load_cr4(cr4 & ~CR4_PGE);
965
966	/* Disable caches (CD = 1, NW = 0). */
967	cr0 = rcr0();
968	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
969
970	/* Flushes caches and TLBs. */
971	wbinvd();
972	invltlb();
973
974	/* Update PAT and index table. */
975	wrmsr(MSR_PAT, pat_msr);
976	for (i = 0; i < PAT_INDEX_SIZE; i++)
977		pat_index[i] = pat_table[i];
978
979	/* Flush caches and TLBs again. */
980	wbinvd();
981	invltlb();
982
983	/* Restore caches and PGE. */
984	load_cr0(cr0);
985	load_cr4(cr4);
986}
987
988/*
989 *	Initialize a vm_page's machine-dependent fields.
990 */
991void
992pmap_page_init(vm_page_t m)
993{
994
995	TAILQ_INIT(&m->md.pv_list);
996	m->md.pat_mode = PAT_WRITE_BACK;
997}
998
999/*
1000 *	Initialize the pmap module.
1001 *	Called by vm_init, to initialize any structures that the pmap
1002 *	system needs to map virtual memory.
1003 */
1004void
1005pmap_init(void)
1006{
1007	vm_page_t mpte;
1008	vm_size_t s;
1009	int i, pv_npg;
1010
1011	/*
1012	 * Initialize the vm page array entries for the kernel pmap's
1013	 * page table pages.
1014	 */
1015	for (i = 0; i < nkpt; i++) {
1016		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
1017		KASSERT(mpte >= vm_page_array &&
1018		    mpte < &vm_page_array[vm_page_array_size],
1019		    ("pmap_init: page table page is out of range"));
1020		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
1021		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
1022	}
1023
1024	/*
1025	 * If the kernel is running on a virtual machine, then it must assume
1026	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
1027	 * be prepared for the hypervisor changing the vendor and family that
1028	 * are reported by CPUID.  Consequently, the workaround for AMD Family
1029	 * 10h Erratum 383 is enabled if the processor's feature set does not
1030	 * include at least one feature that is only supported by older Intel
1031	 * or newer AMD processors.
1032	 */
1033	if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
1034	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
1035	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
1036	    AMDID2_FMA4)) == 0)
1037		workaround_erratum383 = 1;
1038
1039	/*
1040	 * Are large page mappings enabled?
1041	 */
1042	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
1043	if (pg_ps_enabled) {
1044		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1045		    ("pmap_init: can't assign to pagesizes[1]"));
1046		pagesizes[1] = NBPDR;
1047	}
1048
1049	/*
1050	 * Initialize the pv chunk list mutex.
1051	 */
1052	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1053
1054	/*
1055	 * Initialize the pool of pv list locks.
1056	 */
1057	for (i = 0; i < NPV_LIST_LOCKS; i++)
1058		rw_init(&pv_list_locks[i], "pmap pv list");
1059
1060	/*
1061	 * Calculate the size of the pv head table for superpages.
1062	 */
1063	for (i = 0; phys_avail[i + 1]; i += 2);
1064	pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
1065
1066	/*
1067	 * Allocate memory for the pv head table for superpages.
1068	 */
1069	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1070	s = round_page(s);
1071	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
1072	    M_WAITOK | M_ZERO);
1073	for (i = 0; i < pv_npg; i++)
1074		TAILQ_INIT(&pv_table[i].pv_list);
1075
1076	mtx_init(&cpage_lock, "cpage", NULL, MTX_DEF);
1077	cpage_a = kva_alloc(PAGE_SIZE);
1078	cpage_b = kva_alloc(PAGE_SIZE);
1079}
1080
1081static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
1082    "2MB page mapping counters");
1083
1084static u_long pmap_pde_demotions;
1085SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
1086    &pmap_pde_demotions, 0, "2MB page demotions");
1087
1088static u_long pmap_pde_mappings;
1089SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
1090    &pmap_pde_mappings, 0, "2MB page mappings");
1091
1092static u_long pmap_pde_p_failures;
1093SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
1094    &pmap_pde_p_failures, 0, "2MB page promotion failures");
1095
1096static u_long pmap_pde_promotions;
1097SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
1098    &pmap_pde_promotions, 0, "2MB page promotions");
1099
1100static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
1101    "1GB page mapping counters");
1102
1103static u_long pmap_pdpe_demotions;
1104SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
1105    &pmap_pdpe_demotions, 0, "1GB page demotions");
1106
1107/***************************************************
1108 * Low level helper routines.....
1109 ***************************************************/
1110
1111static pt_entry_t
1112pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
1113{
1114	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
1115
1116	switch (pmap->pm_type) {
1117	case PT_X86:
1118	case PT_RVI:
1119		/* Verify that both PAT bits are not set at the same time */
1120		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
1121		    ("Invalid PAT bits in entry %#lx", entry));
1122
1123		/* Swap the PAT bits if one of them is set */
1124		if ((entry & x86_pat_bits) != 0)
1125			entry ^= x86_pat_bits;
1126		break;
1127	case PT_EPT:
1128		/*
1129		 * Nothing to do - the memory attributes are represented
1130		 * the same way for regular pages and superpages.
1131		 */
1132		break;
1133	default:
1134		panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
1135	}
1136
1137	return (entry);
1138}
1139
1140/*
1141 * Determine the appropriate bits to set in a PTE or PDE for a specified
1142 * caching mode.
1143 */
1144static int
1145pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
1146{
1147	int cache_bits, pat_flag, pat_idx;
1148
1149	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
1150		panic("Unknown caching mode %d\n", mode);
1151
1152	switch (pmap->pm_type) {
1153	case PT_X86:
1154	case PT_RVI:
1155		/* The PAT bit is different for PTE's and PDE's. */
1156		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
1157
1158		/* Map the caching mode to a PAT index. */
1159		pat_idx = pat_index[mode];
1160
1161		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
1162		cache_bits = 0;
1163		if (pat_idx & 0x4)
1164			cache_bits |= pat_flag;
1165		if (pat_idx & 0x2)
1166			cache_bits |= PG_NC_PCD;
1167		if (pat_idx & 0x1)
1168			cache_bits |= PG_NC_PWT;
1169		break;
1170
1171	case PT_EPT:
1172		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
1173		break;
1174
1175	default:
1176		panic("unsupported pmap type %d", pmap->pm_type);
1177	}
1178
1179	return (cache_bits);
1180}
1181
1182static int
1183pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
1184{
1185	int mask;
1186
1187	switch (pmap->pm_type) {
1188	case PT_X86:
1189	case PT_RVI:
1190		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
1191		break;
1192	case PT_EPT:
1193		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
1194		break;
1195	default:
1196		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
1197	}
1198
1199	return (mask);
1200}
1201
1202static __inline boolean_t
1203pmap_ps_enabled(pmap_t pmap)
1204{
1205
1206	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
1207}
1208
1209static void
1210pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
1211{
1212
1213	switch (pmap->pm_type) {
1214	case PT_X86:
1215		break;
1216	case PT_RVI:
1217	case PT_EPT:
1218		/*
1219		 * XXX
1220		 * This is a little bogus since the generation number is
1221		 * supposed to be bumped up when a region of the address
1222		 * space is invalidated in the page tables.
1223		 *
1224		 * In this case the old PDE entry is valid but yet we want
1225		 * to make sure that any mappings using the old entry are
1226		 * invalidated in the TLB.
1227		 *
1228		 * The reason this works as expected is because we rendezvous
1229		 * "all" host cpus and force any vcpu context to exit as a
1230		 * side-effect.
1231		 */
1232		atomic_add_acq_long(&pmap->pm_eptgen, 1);
1233		break;
1234	default:
1235		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
1236	}
1237	pde_store(pde, newpde);
1238}
1239
1240/*
1241 * After changing the page size for the specified virtual address in the page
1242 * table, flush the corresponding entries from the processor's TLB.  Only the
1243 * calling processor's TLB is affected.
1244 *
1245 * The calling thread must be pinned to a processor.
1246 */
1247static void
1248pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
1249{
1250	pt_entry_t PG_G;
1251
1252	if (pmap_type_guest(pmap))
1253		return;
1254
1255	KASSERT(pmap->pm_type == PT_X86,
1256	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
1257
1258	PG_G = pmap_global_bit(pmap);
1259
1260	if ((newpde & PG_PS) == 0)
1261		/* Demotion: flush a specific 2MB page mapping. */
1262		invlpg(va);
1263	else if ((newpde & PG_G) == 0)
1264		/*
1265		 * Promotion: flush every 4KB page mapping from the TLB
1266		 * because there are too many to flush individually.
1267		 */
1268		invltlb();
1269	else {
1270		/*
1271		 * Promotion: flush every 4KB page mapping from the TLB,
1272		 * including any global (PG_G) mappings.
1273		 */
1274		invltlb_globpcid();
1275	}
1276}
1277#ifdef SMP
1278
1279static void
1280pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
1281{
1282	struct invpcid_descr d;
1283	uint64_t cr3;
1284
1285	if (invpcid_works) {
1286		d.pcid = pmap->pm_pcid;
1287		d.pad = 0;
1288		d.addr = va;
1289		invpcid(&d, INVPCID_ADDR);
1290		return;
1291	}
1292
1293	cr3 = rcr3();
1294	critical_enter();
1295	load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
1296	invlpg(va);
1297	load_cr3(cr3 | CR3_PCID_SAVE);
1298	critical_exit();
1299}
1300
1301/*
1302 * For SMP, these functions have to use the IPI mechanism for coherence.
1303 *
1304 * N.B.: Before calling any of the following TLB invalidation functions,
1305 * the calling processor must ensure that all stores updating a non-
1306 * kernel page table are globally performed.  Otherwise, another
1307 * processor could cache an old, pre-update entry without being
1308 * invalidated.  This can happen one of two ways: (1) The pmap becomes
1309 * active on another processor after its pm_active field is checked by
1310 * one of the following functions but before a store updating the page
1311 * table is globally performed. (2) The pmap becomes active on another
1312 * processor before its pm_active field is checked but due to
1313 * speculative loads one of the following functions stills reads the
1314 * pmap as inactive on the other processor.
1315 *
1316 * The kernel page table is exempt because its pm_active field is
1317 * immutable.  The kernel page table is always active on every
1318 * processor.
1319 */
1320
1321/*
1322 * Interrupt the cpus that are executing in the guest context.
1323 * This will force the vcpu to exit and the cached EPT mappings
1324 * will be invalidated by the host before the next vmresume.
1325 */
1326static __inline void
1327pmap_invalidate_ept(pmap_t pmap)
1328{
1329	int ipinum;
1330
1331	sched_pin();
1332	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1333	    ("pmap_invalidate_ept: absurd pm_active"));
1334
1335	/*
1336	 * The TLB mappings associated with a vcpu context are not
1337	 * flushed each time a different vcpu is chosen to execute.
1338	 *
1339	 * This is in contrast with a process's vtop mappings that
1340	 * are flushed from the TLB on each context switch.
1341	 *
1342	 * Therefore we need to do more than just a TLB shootdown on
1343	 * the active cpus in 'pmap->pm_active'. To do this we keep
1344	 * track of the number of invalidations performed on this pmap.
1345	 *
1346	 * Each vcpu keeps a cache of this counter and compares it
1347	 * just before a vmresume. If the counter is out-of-date an
1348	 * invept will be done to flush stale mappings from the TLB.
1349	 */
1350	atomic_add_acq_long(&pmap->pm_eptgen, 1);
1351
1352	/*
1353	 * Force the vcpu to exit and trap back into the hypervisor.
1354	 */
1355	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
1356	ipi_selected(pmap->pm_active, ipinum);
1357	sched_unpin();
1358}
1359
1360void
1361pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1362{
1363	cpuset_t other_cpus;
1364	u_int cpuid;
1365
1366	if (pmap_type_guest(pmap)) {
1367		pmap_invalidate_ept(pmap);
1368		return;
1369	}
1370
1371	KASSERT(pmap->pm_type == PT_X86,
1372	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
1373
1374	sched_pin();
1375	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1376		if (!pmap_pcid_enabled) {
1377			invlpg(va);
1378		} else {
1379			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1380				if (pmap == PCPU_GET(curpmap))
1381					invlpg(va);
1382				else
1383					pmap_invalidate_page_pcid(pmap, va);
1384			} else {
1385				invltlb_globpcid();
1386			}
1387		}
1388		smp_invlpg(pmap, va);
1389	} else {
1390		cpuid = PCPU_GET(cpuid);
1391		other_cpus = all_cpus;
1392		CPU_CLR(cpuid, &other_cpus);
1393		if (CPU_ISSET(cpuid, &pmap->pm_active))
1394			invlpg(va);
1395		else if (pmap_pcid_enabled) {
1396			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
1397				pmap_invalidate_page_pcid(pmap, va);
1398			else
1399				invltlb_globpcid();
1400		}
1401		if (pmap_pcid_enabled)
1402			CPU_AND(&other_cpus, &pmap->pm_save);
1403		else
1404			CPU_AND(&other_cpus, &pmap->pm_active);
1405		if (!CPU_EMPTY(&other_cpus))
1406			smp_masked_invlpg(other_cpus, pmap, va);
1407	}
1408	sched_unpin();
1409}
1410
1411static void
1412pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1413{
1414	struct invpcid_descr d;
1415	uint64_t cr3;
1416	vm_offset_t addr;
1417
1418	if (invpcid_works) {
1419		d.pcid = pmap->pm_pcid;
1420		d.pad = 0;
1421		for (addr = sva; addr < eva; addr += PAGE_SIZE) {
1422			d.addr = addr;
1423			invpcid(&d, INVPCID_ADDR);
1424		}
1425		return;
1426	}
1427
1428	cr3 = rcr3();
1429	critical_enter();
1430	load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
1431	for (addr = sva; addr < eva; addr += PAGE_SIZE)
1432		invlpg(addr);
1433	load_cr3(cr3 | CR3_PCID_SAVE);
1434	critical_exit();
1435}
1436
1437void
1438pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1439{
1440	cpuset_t other_cpus;
1441	vm_offset_t addr;
1442	u_int cpuid;
1443
1444	if (pmap_type_guest(pmap)) {
1445		pmap_invalidate_ept(pmap);
1446		return;
1447	}
1448
1449	KASSERT(pmap->pm_type == PT_X86,
1450	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
1451
1452	sched_pin();
1453	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1454		if (!pmap_pcid_enabled) {
1455			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1456				invlpg(addr);
1457		} else {
1458			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1459				if (pmap == PCPU_GET(curpmap)) {
1460					for (addr = sva; addr < eva;
1461					    addr += PAGE_SIZE)
1462						invlpg(addr);
1463				} else {
1464					pmap_invalidate_range_pcid(pmap,
1465					    sva, eva);
1466				}
1467			} else {
1468				invltlb_globpcid();
1469			}
1470		}
1471		smp_invlpg_range(pmap, sva, eva);
1472	} else {
1473		cpuid = PCPU_GET(cpuid);
1474		other_cpus = all_cpus;
1475		CPU_CLR(cpuid, &other_cpus);
1476		if (CPU_ISSET(cpuid, &pmap->pm_active)) {
1477			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1478				invlpg(addr);
1479		} else if (pmap_pcid_enabled) {
1480			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
1481				pmap_invalidate_range_pcid(pmap, sva, eva);
1482			else
1483				invltlb_globpcid();
1484		}
1485		if (pmap_pcid_enabled)
1486			CPU_AND(&other_cpus, &pmap->pm_save);
1487		else
1488			CPU_AND(&other_cpus, &pmap->pm_active);
1489		if (!CPU_EMPTY(&other_cpus))
1490			smp_masked_invlpg_range(other_cpus, pmap, sva, eva);
1491	}
1492	sched_unpin();
1493}
1494
1495void
1496pmap_invalidate_all(pmap_t pmap)
1497{
1498	cpuset_t other_cpus;
1499	struct invpcid_descr d;
1500	uint64_t cr3;
1501	u_int cpuid;
1502
1503	if (pmap_type_guest(pmap)) {
1504		pmap_invalidate_ept(pmap);
1505		return;
1506	}
1507
1508	KASSERT(pmap->pm_type == PT_X86,
1509	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
1510
1511	sched_pin();
1512	cpuid = PCPU_GET(cpuid);
1513	if (pmap == kernel_pmap ||
1514	    (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) ||
1515	    !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1516		if (invpcid_works) {
1517			bzero(&d, sizeof(d));
1518			invpcid(&d, INVPCID_CTXGLOB);
1519		} else {
1520			invltlb_globpcid();
1521		}
1522		if (!CPU_ISSET(cpuid, &pmap->pm_active))
1523			CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
1524		smp_invltlb(pmap);
1525	} else {
1526		other_cpus = all_cpus;
1527		CPU_CLR(cpuid, &other_cpus);
1528
1529		/*
1530		 * This logic is duplicated in the Xinvltlb shootdown
1531		 * IPI handler.
1532		 */
1533		if (pmap_pcid_enabled) {
1534			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1535				if (invpcid_works) {
1536					d.pcid = pmap->pm_pcid;
1537					d.pad = 0;
1538					d.addr = 0;
1539					invpcid(&d, INVPCID_CTX);
1540				} else {
1541					cr3 = rcr3();
1542					critical_enter();
1543
1544					/*
1545					 * Bit 63 is clear, pcid TLB
1546					 * entries are invalidated.
1547					 */
1548					load_cr3(pmap->pm_cr3);
1549					load_cr3(cr3 | CR3_PCID_SAVE);
1550					critical_exit();
1551				}
1552			} else {
1553				invltlb_globpcid();
1554			}
1555		} else if (CPU_ISSET(cpuid, &pmap->pm_active))
1556			invltlb();
1557		if (!CPU_ISSET(cpuid, &pmap->pm_active))
1558			CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
1559		if (pmap_pcid_enabled)
1560			CPU_AND(&other_cpus, &pmap->pm_save);
1561		else
1562			CPU_AND(&other_cpus, &pmap->pm_active);
1563		if (!CPU_EMPTY(&other_cpus))
1564			smp_masked_invltlb(other_cpus, pmap);
1565	}
1566	sched_unpin();
1567}
1568
1569void
1570pmap_invalidate_cache(void)
1571{
1572
1573	sched_pin();
1574	wbinvd();
1575	smp_cache_flush();
1576	sched_unpin();
1577}
1578
1579struct pde_action {
1580	cpuset_t invalidate;	/* processors that invalidate their TLB */
1581	pmap_t pmap;
1582	vm_offset_t va;
1583	pd_entry_t *pde;
1584	pd_entry_t newpde;
1585	u_int store;		/* processor that updates the PDE */
1586};
1587
1588static void
1589pmap_update_pde_action(void *arg)
1590{
1591	struct pde_action *act = arg;
1592
1593	if (act->store == PCPU_GET(cpuid))
1594		pmap_update_pde_store(act->pmap, act->pde, act->newpde);
1595}
1596
1597static void
1598pmap_update_pde_teardown(void *arg)
1599{
1600	struct pde_action *act = arg;
1601
1602	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1603		pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
1604}
1605
1606/*
1607 * Change the page size for the specified virtual address in a way that
1608 * prevents any possibility of the TLB ever having two entries that map the
1609 * same virtual address using different page sizes.  This is the recommended
1610 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1611 * machine check exception for a TLB state that is improperly diagnosed as a
1612 * hardware error.
1613 */
1614static void
1615pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1616{
1617	struct pde_action act;
1618	cpuset_t active, other_cpus;
1619	u_int cpuid;
1620
1621	sched_pin();
1622	cpuid = PCPU_GET(cpuid);
1623	other_cpus = all_cpus;
1624	CPU_CLR(cpuid, &other_cpus);
1625	if (pmap == kernel_pmap || pmap_type_guest(pmap))
1626		active = all_cpus;
1627	else {
1628		active = pmap->pm_active;
1629		CPU_AND_ATOMIC(&pmap->pm_save, &active);
1630	}
1631	if (CPU_OVERLAP(&active, &other_cpus)) {
1632		act.store = cpuid;
1633		act.invalidate = active;
1634		act.va = va;
1635		act.pmap = pmap;
1636		act.pde = pde;
1637		act.newpde = newpde;
1638		CPU_SET(cpuid, &active);
1639		smp_rendezvous_cpus(active,
1640		    smp_no_rendevous_barrier, pmap_update_pde_action,
1641		    pmap_update_pde_teardown, &act);
1642	} else {
1643		pmap_update_pde_store(pmap, pde, newpde);
1644		if (CPU_ISSET(cpuid, &active))
1645			pmap_update_pde_invalidate(pmap, va, newpde);
1646	}
1647	sched_unpin();
1648}
1649#else /* !SMP */
1650/*
1651 * Normal, non-SMP, invalidation functions.
1652 * We inline these within pmap.c for speed.
1653 */
1654PMAP_INLINE void
1655pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1656{
1657
1658	switch (pmap->pm_type) {
1659	case PT_X86:
1660		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1661			invlpg(va);
1662		break;
1663	case PT_RVI:
1664	case PT_EPT:
1665		pmap->pm_eptgen++;
1666		break;
1667	default:
1668		panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type);
1669	}
1670}
1671
1672PMAP_INLINE void
1673pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1674{
1675	vm_offset_t addr;
1676
1677	switch (pmap->pm_type) {
1678	case PT_X86:
1679		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1680			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1681				invlpg(addr);
1682		break;
1683	case PT_RVI:
1684	case PT_EPT:
1685		pmap->pm_eptgen++;
1686		break;
1687	default:
1688		panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
1689	}
1690}
1691
1692PMAP_INLINE void
1693pmap_invalidate_all(pmap_t pmap)
1694{
1695
1696	switch (pmap->pm_type) {
1697	case PT_X86:
1698		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1699			invltlb();
1700		break;
1701	case PT_RVI:
1702	case PT_EPT:
1703		pmap->pm_eptgen++;
1704		break;
1705	default:
1706		panic("pmap_invalidate_all: unknown type %d", pmap->pm_type);
1707	}
1708}
1709
1710PMAP_INLINE void
1711pmap_invalidate_cache(void)
1712{
1713
1714	wbinvd();
1715}
1716
1717static void
1718pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1719{
1720
1721	pmap_update_pde_store(pmap, pde, newpde);
1722	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1723		pmap_update_pde_invalidate(pmap, va, newpde);
1724	else
1725		CPU_ZERO(&pmap->pm_save);
1726}
1727#endif /* !SMP */
1728
1729#define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
1730
1731void
1732pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
1733{
1734
1735	if (force) {
1736		sva &= ~(vm_offset_t)cpu_clflush_line_size;
1737	} else {
1738		KASSERT((sva & PAGE_MASK) == 0,
1739		    ("pmap_invalidate_cache_range: sva not page-aligned"));
1740		KASSERT((eva & PAGE_MASK) == 0,
1741		    ("pmap_invalidate_cache_range: eva not page-aligned"));
1742	}
1743
1744	if ((cpu_feature & CPUID_SS) != 0 && !force)
1745		; /* If "Self Snoop" is supported and allowed, do nothing. */
1746	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1747	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1748
1749		/*
1750		 * XXX: Some CPUs fault, hang, or trash the local APIC
1751		 * registers if we use CLFLUSH on the local APIC
1752		 * range.  The local APIC is always uncached, so we
1753		 * don't need to flush for that range anyway.
1754		 */
1755		if (pmap_kextract(sva) == lapic_paddr)
1756			return;
1757
1758		/*
1759		 * Otherwise, do per-cache line flush.  Use the mfence
1760		 * instruction to insure that previous stores are
1761		 * included in the write-back.  The processor
1762		 * propagates flush to other processors in the cache
1763		 * coherence domain.
1764		 */
1765		mfence();
1766		for (; sva < eva; sva += cpu_clflush_line_size)
1767			clflush(sva);
1768		mfence();
1769	} else {
1770
1771		/*
1772		 * No targeted cache flush methods are supported by CPU,
1773		 * or the supplied range is bigger than 2MB.
1774		 * Globally invalidate cache.
1775		 */
1776		pmap_invalidate_cache();
1777	}
1778}
1779
1780/*
1781 * Remove the specified set of pages from the data and instruction caches.
1782 *
1783 * In contrast to pmap_invalidate_cache_range(), this function does not
1784 * rely on the CPU's self-snoop feature, because it is intended for use
1785 * when moving pages into a different cache domain.
1786 */
1787void
1788pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1789{
1790	vm_offset_t daddr, eva;
1791	int i;
1792
1793	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1794	    (cpu_feature & CPUID_CLFSH) == 0)
1795		pmap_invalidate_cache();
1796	else {
1797		mfence();
1798		for (i = 0; i < count; i++) {
1799			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
1800			eva = daddr + PAGE_SIZE;
1801			for (; daddr < eva; daddr += cpu_clflush_line_size)
1802				clflush(daddr);
1803		}
1804		mfence();
1805	}
1806}
1807
1808/*
1809 *	Routine:	pmap_extract
1810 *	Function:
1811 *		Extract the physical page address associated
1812 *		with the given map/virtual_address pair.
1813 */
1814vm_paddr_t
1815pmap_extract(pmap_t pmap, vm_offset_t va)
1816{
1817	pdp_entry_t *pdpe;
1818	pd_entry_t *pde;
1819	pt_entry_t *pte, PG_V;
1820	vm_paddr_t pa;
1821
1822	pa = 0;
1823	PG_V = pmap_valid_bit(pmap);
1824	PMAP_LOCK(pmap);
1825	pdpe = pmap_pdpe(pmap, va);
1826	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1827		if ((*pdpe & PG_PS) != 0)
1828			pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
1829		else {
1830			pde = pmap_pdpe_to_pde(pdpe, va);
1831			if ((*pde & PG_V) != 0) {
1832				if ((*pde & PG_PS) != 0) {
1833					pa = (*pde & PG_PS_FRAME) |
1834					    (va & PDRMASK);
1835				} else {
1836					pte = pmap_pde_to_pte(pde, va);
1837					pa = (*pte & PG_FRAME) |
1838					    (va & PAGE_MASK);
1839				}
1840			}
1841		}
1842	}
1843	PMAP_UNLOCK(pmap);
1844	return (pa);
1845}
1846
1847/*
1848 *	Routine:	pmap_extract_and_hold
1849 *	Function:
1850 *		Atomically extract and hold the physical page
1851 *		with the given pmap and virtual address pair
1852 *		if that mapping permits the given protection.
1853 */
1854vm_page_t
1855pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1856{
1857	pd_entry_t pde, *pdep;
1858	pt_entry_t pte, PG_RW, PG_V;
1859	vm_paddr_t pa;
1860	vm_page_t m;
1861
1862	pa = 0;
1863	m = NULL;
1864	PG_RW = pmap_rw_bit(pmap);
1865	PG_V = pmap_valid_bit(pmap);
1866	PMAP_LOCK(pmap);
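	/*
	 * If vm_page_pa_tryrelock() must drop the pmap lock to acquire the
	 * page lock, the mapping may have changed, so restart the lookup.
	 */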
1867retry:
1868	pdep = pmap_pde(pmap, va);
1869	if (pdep != NULL && (pde = *pdep)) {
1870		if (pde & PG_PS) {
1871			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1872				if (vm_page_pa_tryrelock(pmap, (pde &
1873				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1874					goto retry;
1875				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1876				    (va & PDRMASK));
1877				vm_page_hold(m);
1878			}
1879		} else {
1880			pte = *pmap_pde_to_pte(pdep, va);
1881			if ((pte & PG_V) &&
1882			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1883				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1884				    &pa))
1885					goto retry;
1886				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1887				vm_page_hold(m);
1888			}
1889		}
1890	}
1891	PA_UNLOCK_COND(pa);
1892	PMAP_UNLOCK(pmap);
1893	return (m);
1894}
1895
1896vm_paddr_t
1897pmap_kextract(vm_offset_t va)
1898{
1899	pd_entry_t pde;
1900	vm_paddr_t pa;
1901
1902	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1903		pa = DMAP_TO_PHYS(va);
1904	} else {
1905		pde = *vtopde(va);
1906		if (pde & PG_PS) {
1907			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
1908		} else {
1909			/*
1910			 * Beware of a concurrent promotion that changes the
1911			 * PDE at this point!  For example, vtopte() must not
1912			 * be used to access the PTE because it would use the
1913			 * new PDE.  It is, however, safe to use the old PDE
1914			 * because the page table page is preserved by the
1915			 * promotion.
1916			 */
1917			pa = *pmap_pde_to_pte(&pde, va);
1918			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
1919		}
1920	}
1921	return (pa);
1922}
1923
1924/***************************************************
1925 * Low level mapping routines.....
1926 ***************************************************/
1927
1928/*
1929 * Add a wired page to the kva.
1930 * Note: not SMP coherent.
1931 */
1932PMAP_INLINE void
1933pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1934{
1935	pt_entry_t *pte;
1936
1937	pte = vtopte(va);
1938	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
1939}
1940
1941static __inline void
1942pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1943{
1944	pt_entry_t *pte;
1945	int cache_bits;
1946
1947	pte = vtopte(va);
1948	cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
1949	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
1950}
1951
1952/*
1953 * Remove a page from the kernel pagetables.
1954 * Note: not SMP coherent.
1955 */
1956PMAP_INLINE void
1957pmap_kremove(vm_offset_t va)
1958{
1959	pt_entry_t *pte;
1960
1961	pte = vtopte(va);
1962	pte_clear(pte);
1963}
1964
1965/*
1966 *	Used to map a range of physical addresses into kernel
1967 *	virtual address space.
1968 *
1969 *	The value passed in '*virt' is a suggested virtual address for
1970 *	the mapping. Architectures which can support a direct-mapped
1971 *	physical to virtual region can return the appropriate address
1972 *	within that region, leaving '*virt' unchanged. Other
1973 *	architectures should map the pages starting at '*virt' and
1974 *	update '*virt' with the first usable address after the mapped
1975 *	region.
1976 */
1977vm_offset_t
1978pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1979{
1980	return PHYS_TO_DMAP(start);
1981}
1982
1983
1984/*
1985 * Add a list of wired pages to the kva.
1986 * This routine is only used for temporary
1987 * kernel mappings that do not need to have
1988 * page modification or references recorded.
1989 * Note that old mappings are simply written
1990 * over.  The page *must* be wired.
1991 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1992 */
1993void
1994pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1995{
1996	pt_entry_t *endpte, oldpte, pa, *pte;
1997	vm_page_t m;
1998	int cache_bits;
1999
2000	oldpte = 0;
2001	pte = vtopte(sva);
2002	endpte = pte + count;
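	/*
	 * Rewrite only those PTEs whose physical frame or cache attributes
	 * differ from the requested mapping, and accumulate the old PTE
	 * bits so that a single ranged invalidation can be issued below if
	 * any previously valid entry was overwritten.
	 */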
2003	while (pte < endpte) {
2004		m = *ma++;
2005		cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
2006		pa = VM_PAGE_TO_PHYS(m) | cache_bits;
2007		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
2008			oldpte |= *pte;
2009			pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
2010		}
2011		pte++;
2012	}
2013	if (__predict_false((oldpte & X86_PG_V) != 0))
2014		pmap_invalidate_range(kernel_pmap, sva, sva + count *
2015		    PAGE_SIZE);
2016}
2017
2018/*
2019 * This routine tears out page mappings from the
2020 * kernel -- it is meant only for temporary mappings.
2021 * Note: SMP coherent.  Uses a ranged shootdown IPI.
2022 */
2023void
2024pmap_qremove(vm_offset_t sva, int count)
2025{
2026	vm_offset_t va;
2027
2028	va = sva;
2029	while (count-- > 0) {
2030		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
2031		pmap_kremove(va);
2032		va += PAGE_SIZE;
2033	}
2034	pmap_invalidate_range(kernel_pmap, sva, va);
2035}
2036
2037/***************************************************
2038 * Page table page management routines.....
2039 ***************************************************/
2040static __inline void
2041pmap_free_zero_pages(struct spglist *free)
2042{
2043	vm_page_t m;
2044
2045	while ((m = SLIST_FIRST(free)) != NULL) {
2046		SLIST_REMOVE_HEAD(free, plinks.s.ss);
2047		/* Preserve the page's PG_ZERO setting. */
2048		vm_page_free_toq(m);
2049	}
2050}
2051
2052/*
2053 * Schedule the specified unused page table page to be freed.  Specifically,
2054 * add the page to the specified list of pages that will be released to the
2055 * physical memory manager after the TLB has been updated.
2056 */
2057static __inline void
2058pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
2059    boolean_t set_PG_ZERO)
2060{
2061
2062	if (set_PG_ZERO)
2063		m->flags |= PG_ZERO;
2064	else
2065		m->flags &= ~PG_ZERO;
2066	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2067}
2068
2069/*
2070 * Inserts the specified page table page into the specified pmap's collection
2071 * of idle page table pages.  Each of a pmap's page table pages is responsible
2072 * for mapping a distinct range of virtual addresses.  The pmap's collection is
2073 * ordered by this virtual address range.
2074 */
2075static __inline int
2076pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
2077{
2078
2079	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2080	return (vm_radix_insert(&pmap->pm_root, mpte));
2081}
2082
2083/*
2084 * Looks for a page table page mapping the specified virtual address in the
2085 * specified pmap's collection of idle page table pages.  Returns NULL if there
2086 * is no page table page corresponding to the specified virtual address.
2087 */
2088static __inline vm_page_t
2089pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
2090{
2091
2092	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2093	return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
2094}
2095
2096/*
2097 * Removes the specified page table page from the specified pmap's collection
2098 * of idle page table pages.  The specified page table page must be a member of
2099 * the pmap's collection.
2100 */
2101static __inline void
2102pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
2103{
2104
2105	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2106	vm_radix_remove(&pmap->pm_root, mpte->pindex);
2107}
2108
2109/*
2110 * Decrements a page table page's wire count, which is used to record the
2111 * number of valid page table entries within the page.  If the wire count
2112 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
2113 * page table page was unmapped and FALSE otherwise.
2114 */
2115static inline boolean_t
2116pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2117{
2118
2119	--m->wire_count;
2120	if (m->wire_count == 0) {
2121		_pmap_unwire_ptp(pmap, va, m, free);
2122		return (TRUE);
2123	} else
2124		return (FALSE);
2125}
2126
2127static void
2128_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2129{
2130
2131	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2132	/*
2133	 * unmap the page table page
2134	 */
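	/*
	 * A page table page's pindex encodes its level: indices below
	 * NUPDE identify page table (PTE) pages, indices in
	 * [NUPDE, NUPDE + NUPDPE) identify page directory (PD) pages, and
	 * larger indices identify PDP pages.
	 */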
2135	if (m->pindex >= (NUPDE + NUPDPE)) {
2136		/* PDP page */
2137		pml4_entry_t *pml4;
2138		pml4 = pmap_pml4e(pmap, va);
2139		*pml4 = 0;
2140	} else if (m->pindex >= NUPDE) {
2141		/* PD page */
2142		pdp_entry_t *pdp;
2143		pdp = pmap_pdpe(pmap, va);
2144		*pdp = 0;
2145	} else {
2146		/* PTE page */
2147		pd_entry_t *pd;
2148		pd = pmap_pde(pmap, va);
2149		*pd = 0;
2150	}
2151	pmap_resident_count_dec(pmap, 1);
2152	if (m->pindex < NUPDE) {
2153		/* We just released a PT, unhold the matching PD */
2154		vm_page_t pdpg;
2155
2156		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
2157		pmap_unwire_ptp(pmap, va, pdpg, free);
2158	}
2159	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
2160		/* We just released a PD, unhold the matching PDP */
2161		vm_page_t pdppg;
2162
2163		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
2164		pmap_unwire_ptp(pmap, va, pdppg, free);
2165	}
2166
2167	/*
2168	 * This is a release store so that the ordinary store unmapping
2169	 * the page table page is globally performed before TLB shoot-
2170	 * down is begun.
2171	 */
2172	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
2173
2174	/*
2175	 * Put page on a list so that it is released after
2176	 * *ALL* TLB shootdown is done
2177	 */
2178	pmap_add_delayed_free_list(m, free, TRUE);
2179}
2180
2181/*
2182 * After removing a page table entry, this routine is used to
2183 * conditionally free the page, and manage the hold/wire counts.
2184 */
2185static int
2186pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2187    struct spglist *free)
2188{
2189	vm_page_t mpte;
2190
2191	if (va >= VM_MAXUSER_ADDRESS)
2192		return (0);
2193	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2194	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
2195	return (pmap_unwire_ptp(pmap, va, mpte, free));
2196}
2197
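/*
 * Initialize a pmap that reuses the bootstrap kernel page table
 * (KPML4phys).  Unlike pmap_pinit(), no page table page is allocated
 * here, and the pmap is immediately installed as curpmap.
 */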
2198void
2199pmap_pinit0(pmap_t pmap)
2200{
2201
2202	PMAP_LOCK_INIT(pmap);
2203	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
2204	pmap->pm_cr3 = KPML4phys;
2205	pmap->pm_root.rt_root = 0;
2206	CPU_ZERO(&pmap->pm_active);
2207	CPU_ZERO(&pmap->pm_save);
2208	PCPU_SET(curpmap, pmap);
2209	TAILQ_INIT(&pmap->pm_pvchunk);
2210	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2211	pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
2212	pmap->pm_flags = pmap_flags;
2213}
2214
2215/*
2216 * Initialize a preallocated and zeroed pmap structure,
2217 * such as one in a vmspace structure.
2218 */
2219int
2220pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
2221{
2222	vm_page_t pml4pg;
2223	vm_paddr_t pml4phys;
2224	int i;
2225
2226	/*
2227	 * allocate the page directory page
2228	 */
2229	while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2230	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
2231		VM_WAIT;
2232
2233	pml4phys = VM_PAGE_TO_PHYS(pml4pg);
2234	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
2235	pmap->pm_pcid = -1;
2236	pmap->pm_cr3 = ~0;	/* initialize to an invalid value */
2237
2238	if ((pml4pg->flags & PG_ZERO) == 0)
2239		pagezero(pmap->pm_pml4);
2240
2241	/*
2242	 * Do not install the host kernel mappings in the nested page
2243	 * tables. These mappings are meaningless in the guest physical
2244	 * address space.
2245	 */
2246	if ((pmap->pm_type = pm_type) == PT_X86) {
2247		pmap->pm_cr3 = pml4phys;
2248
2249		/* Wire in kernel global address entries. */
2250		for (i = 0; i < NKPML4E; i++) {
2251			pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) |
2252			    X86_PG_RW | X86_PG_V | PG_U;
2253		}
2254		for (i = 0; i < ndmpdpphys; i++) {
2255			pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) |
2256			    X86_PG_RW | X86_PG_V | PG_U;
2257		}
2258
2259		/* install self-referential address mapping entry(s) */
2260		pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
2261		    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2262
2263		if (pmap_pcid_enabled) {
2264			pmap->pm_pcid = alloc_unr(&pcid_unr);
2265			if (pmap->pm_pcid != -1)
2266				pmap->pm_cr3 |= pmap->pm_pcid;
2267		}
2268	}
2269
2270	pmap->pm_root.rt_root = 0;
2271	CPU_ZERO(&pmap->pm_active);
2272	TAILQ_INIT(&pmap->pm_pvchunk);
2273	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2274	pmap->pm_flags = flags;
2275	pmap->pm_eptgen = 0;
2276	CPU_ZERO(&pmap->pm_save);
2277
2278	return (1);
2279}
2280
2281int
2282pmap_pinit(pmap_t pmap)
2283{
2284
2285	return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
2286}
2287
2288/*
2289 * This routine is called if the desired page table page does not exist.
2290 *
2291 * If page table page allocation fails, this routine may sleep before
2292 * returning NULL.  It sleeps only if a lock pointer was given.
2293 *
2294 * Note: If a page allocation fails at page table level two or three,
2295 * one or two pages may be held during the wait, only to be released
2296 * afterwards.  This conservative approach makes it easy to argue
2297 * that no race conditions can occur.
2298 */
2299static vm_page_t
2300_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2301{
2302	vm_page_t m, pdppg, pdpg;
2303	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
2304
2305	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2306
2307	PG_A = pmap_accessed_bit(pmap);
2308	PG_M = pmap_modified_bit(pmap);
2309	PG_V = pmap_valid_bit(pmap);
2310	PG_RW = pmap_rw_bit(pmap);
2311
2312	/*
2313	 * Allocate a page table page.
2314	 */
2315	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
2316	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2317		if (lockp != NULL) {
2318			RELEASE_PV_LIST_LOCK(lockp);
2319			PMAP_UNLOCK(pmap);
2320			rw_runlock(&pvh_global_lock);
2321			VM_WAIT;
2322			rw_rlock(&pvh_global_lock);
2323			PMAP_LOCK(pmap);
2324		}
2325
2326		/*
2327		 * Indicate the need to retry.  While waiting, the page table
2328		 * page may have been allocated.
2329		 */
2330		return (NULL);
2331	}
2332	if ((m->flags & PG_ZERO) == 0)
2333		pmap_zero_page(m);
2334
2335	/*
2336	 * Map the pagetable page into the process address space, if
2337	 * it isn't already there.
2338	 */
2339
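	/*
	 * As in _pmap_unwire_ptp(), the pindex determines the level at which
	 * the new page is installed.  A missing intermediate level is
	 * allocated by recursing with the pindex of the parent page table
	 * page.
	 */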
2340	if (ptepindex >= (NUPDE + NUPDPE)) {
2341		pml4_entry_t *pml4;
2342		vm_pindex_t pml4index;
2343
2344		/* Wire up a new PDPE page */
2345		pml4index = ptepindex - (NUPDE + NUPDPE);
2346		pml4 = &pmap->pm_pml4[pml4index];
2347		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2348
2349	} else if (ptepindex >= NUPDE) {
2350		vm_pindex_t pml4index;
2351		vm_pindex_t pdpindex;
2352		pml4_entry_t *pml4;
2353		pdp_entry_t *pdp;
2354
2355		/* Wire up a new PDE page */
2356		pdpindex = ptepindex - NUPDE;
2357		pml4index = pdpindex >> NPML4EPGSHIFT;
2358
2359		pml4 = &pmap->pm_pml4[pml4index];
2360		if ((*pml4 & PG_V) == 0) {
2361			/* Have to allocate a new pdp, recurse */
2362			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
2363			    lockp) == NULL) {
2364				--m->wire_count;
2365				atomic_subtract_int(&cnt.v_wire_count, 1);
2366				vm_page_free_zero(m);
2367				return (NULL);
2368			}
2369		} else {
2370			/* Add reference to pdp page */
2371			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
2372			pdppg->wire_count++;
2373		}
2374		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2375
2376		/* Now find the pdp page */
2377		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2378		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2379
2380	} else {
2381		vm_pindex_t pml4index;
2382		vm_pindex_t pdpindex;
2383		pml4_entry_t *pml4;
2384		pdp_entry_t *pdp;
2385		pd_entry_t *pd;
2386
2387		/* Wire up a new PTE page */
2388		pdpindex = ptepindex >> NPDPEPGSHIFT;
2389		pml4index = pdpindex >> NPML4EPGSHIFT;
2390
2391		/* First, find the pdp and check that it's valid. */
2392		pml4 = &pmap->pm_pml4[pml4index];
2393		if ((*pml4 & PG_V) == 0) {
2394			/* Have to allocate a new pd, recurse */
2395			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2396			    lockp) == NULL) {
2397				--m->wire_count;
2398				atomic_subtract_int(&cnt.v_wire_count, 1);
2399				vm_page_free_zero(m);
2400				return (NULL);
2401			}
2402			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2403			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2404		} else {
2405			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2406			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2407			if ((*pdp & PG_V) == 0) {
2408				/* Have to allocate a new pd, recurse */
2409				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2410				    lockp) == NULL) {
2411					--m->wire_count;
2412					atomic_subtract_int(&cnt.v_wire_count,
2413					    1);
2414					vm_page_free_zero(m);
2415					return (NULL);
2416				}
2417			} else {
2418				/* Add reference to the pd page */
2419				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
2420				pdpg->wire_count++;
2421			}
2422		}
2423		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
2424
2425		/* Now we know where the page directory page is */
2426		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
2427		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2428	}
2429
2430	pmap_resident_count_inc(pmap, 1);
2431
2432	return (m);
2433}
2434
2435static vm_page_t
2436pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2437{
2438	vm_pindex_t pdpindex, ptepindex;
2439	pdp_entry_t *pdpe, PG_V;
2440	vm_page_t pdpg;
2441
2442	PG_V = pmap_valid_bit(pmap);
2443
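	/*
	 * _pmap_allocpte() may sleep and return NULL when a lock pointer is
	 * supplied, so retry the lookup until the page directory page is
	 * either found or successfully allocated.
	 */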
2444retry:
2445	pdpe = pmap_pdpe(pmap, va);
2446	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
2447		/* Add a reference to the pd page. */
2448		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
2449		pdpg->wire_count++;
2450	} else {
2451		/* Allocate a pd page. */
2452		ptepindex = pmap_pde_pindex(va);
2453		pdpindex = ptepindex >> NPDPEPGSHIFT;
2454		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
2455		if (pdpg == NULL && lockp != NULL)
2456			goto retry;
2457	}
2458	return (pdpg);
2459}
2460
2461static vm_page_t
2462pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2463{
2464	vm_pindex_t ptepindex;
2465	pd_entry_t *pd, PG_V;
2466	vm_page_t m;
2467
2468	PG_V = pmap_valid_bit(pmap);
2469
2470	/*
2471	 * Calculate pagetable page index
2472	 */
2473	ptepindex = pmap_pde_pindex(va);
2474retry:
2475	/*
2476	 * Get the page directory entry
2477	 */
2478	pd = pmap_pde(pmap, va);
2479
2480	/*
2481	 * This supports switching from a 2MB page to a
2482	 * normal 4K page.
2483	 */
2484	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
2485		if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
2486			/*
2487			 * Invalidation of the 2MB page mapping may have caused
2488			 * the deallocation of the underlying PD page.
2489			 */
2490			pd = NULL;
2491		}
2492	}
2493
2494	/*
2495	 * If the page table page is mapped, we just increment the
2496	 * hold count, and activate it.
2497	 */
2498	if (pd != NULL && (*pd & PG_V) != 0) {
2499		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
2500		m->wire_count++;
2501	} else {
2502		/*
2503		 * Here if the pte page isn't mapped, or if it has been
2504		 * deallocated.
2505		 */
2506		m = _pmap_allocpte(pmap, ptepindex, lockp);
2507		if (m == NULL && lockp != NULL)
2508			goto retry;
2509	}
2510	return (m);
2511}
2512
2513
2514/***************************************************
2515 * Pmap allocation/deallocation routines.
2516 ***************************************************/
2517
2518/*
2519 * Release any resources held by the given physical map.
2520 * Called when a pmap initialized by pmap_pinit is being released.
2521 * Should only be called if the map contains no valid mappings.
2522 */
2523void
2524pmap_release(pmap_t pmap)
2525{
2526	vm_page_t m;
2527	int i;
2528
2529	KASSERT(pmap->pm_stats.resident_count == 0,
2530	    ("pmap_release: pmap resident count %ld != 0",
2531	    pmap->pm_stats.resident_count));
2532	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2533	    ("pmap_release: pmap has reserved page table page(s)"));
2534
2535	if (pmap_pcid_enabled) {
2536		/*
2537		 * Invalidate any remaining TLB entries, to allow the
2538		 * PCID to be reused.
2539		 */
2540		pmap_invalidate_all(pmap);
2541	}
2542
2543	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
2544
2545	for (i = 0; i < NKPML4E; i++)	/* KVA */
2546		pmap->pm_pml4[KPML4BASE + i] = 0;
2547	for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
2548		pmap->pm_pml4[DMPML4I + i] = 0;
2549	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
2550
2551	m->wire_count--;
2552	atomic_subtract_int(&cnt.v_wire_count, 1);
2553	vm_page_free_zero(m);
2554	if (pmap->pm_pcid != -1)
2555		free_unr(&pcid_unr, pmap->pm_pcid);
2556}
2557
2558static int
2559kvm_size(SYSCTL_HANDLER_ARGS)
2560{
2561	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2562
2563	return sysctl_handle_long(oidp, &ksize, 0, req);
2564}
2565SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2566    0, 0, kvm_size, "LU", "Size of KVM");
2567
2568static int
2569kvm_free(SYSCTL_HANDLER_ARGS)
2570{
2571	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2572
2573	return sysctl_handle_long(oidp, &kfree, 0, req);
2574}
2575SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2576    0, 0, kvm_free, "LU", "Amount of KVM free");
2577
2578/*
2579 * grow the number of kernel page table entries, if needed
2580 */
2581void
2582pmap_growkernel(vm_offset_t addr)
2583{
2584	vm_paddr_t paddr;
2585	vm_page_t nkpg;
2586	pd_entry_t *pde, newpdir;
2587	pdp_entry_t *pdpe;
2588
2589	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2590
2591	/*
2592	 * Return if "addr" is within the range of kernel page table pages
2593	 * that were preallocated during pmap bootstrap.  Moreover, leave
2594	 * "kernel_vm_end" and the kernel page table as they were.
2595	 *
2596	 * The correctness of this action is based on the following
2597	 * argument: vm_map_findspace() allocates contiguous ranges of the
2598	 * kernel virtual address space.  It calls this function if a range
2599	 * ends after "kernel_vm_end".  If the kernel is mapped between
2600	 * "kernel_vm_end" and "addr", then the range cannot begin at
2601	 * "kernel_vm_end".  In fact, its beginning address cannot be less
2602	 * than the kernel.  Thus, there is no immediate need to allocate
2603	 * any new kernel page table pages between "kernel_vm_end" and
2604	 * "KERNBASE".
2605	 */
2606	if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
2607		return;
2608
2609	addr = roundup2(addr, NBPDR);
2610	if (addr - 1 >= kernel_map->max_offset)
2611		addr = kernel_map->max_offset;
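	/*
	 * Extend the kernel page table one 2MB page directory entry at a
	 * time, allocating a new page table page for each step and a new
	 * page directory page whenever the current PDP entry is invalid.
	 */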
2612	while (kernel_vm_end < addr) {
2613		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
2614		if ((*pdpe & X86_PG_V) == 0) {
2615			/* We need a new PDP entry */
2616			nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
2617			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
2618			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2619			if (nkpg == NULL)
2620				panic("pmap_growkernel: no memory to grow kernel");
2621			if ((nkpg->flags & PG_ZERO) == 0)
2622				pmap_zero_page(nkpg);
2623			paddr = VM_PAGE_TO_PHYS(nkpg);
2624			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
2625			    X86_PG_A | X86_PG_M);
2626			continue; /* try again */
2627		}
2628		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
2629		if ((*pde & X86_PG_V) != 0) {
2630			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2631			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2632				kernel_vm_end = kernel_map->max_offset;
2633				break;
2634			}
2635			continue;
2636		}
2637
2638		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
2639		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2640		    VM_ALLOC_ZERO);
2641		if (nkpg == NULL)
2642			panic("pmap_growkernel: no memory to grow kernel");
2643		if ((nkpg->flags & PG_ZERO) == 0)
2644			pmap_zero_page(nkpg);
2645		paddr = VM_PAGE_TO_PHYS(nkpg);
2646		newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2647		pde_store(pde, newpdir);
2648
2649		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2650		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2651			kernel_vm_end = kernel_map->max_offset;
2652			break;
2653		}
2654	}
2655}
2656
2657
2658/***************************************************
2659 * page management routines.
2660 ***************************************************/
2661
2662CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2663CTASSERT(_NPCM == 3);
2664CTASSERT(_NPCPV == 168);
2665
2666static __inline struct pv_chunk *
2667pv_to_chunk(pv_entry_t pv)
2668{
2669
2670	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2671}
2672
2673#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2674
2675#define	PC_FREE0	0xfffffffffffffffful
2676#define	PC_FREE1	0xfffffffffffffffful
2677#define	PC_FREE2	0x000000fffffffffful
2678
2679static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
2680
2681#ifdef PV_STATS
2682static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2683
2684SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2685	"Current number of pv entry chunks");
2686SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2687	"Current number of pv entry chunks allocated");
2688SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2689	"Current number of pv entry chunks frees");
2690	"Current number of pv entry chunk frees");
2691	"Number of times tried to get a chunk page but failed.");
2692
2693static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2694static int pv_entry_spare;
2695
2696SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2697	"Current number of pv entry frees");
2698SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2699	"Current number of pv entry allocs");
2700SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2701	"Current number of pv entries");
2702SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2703	"Current number of spare pv entries");
2704#endif
2705
2706/*
2707 * We are in a serious low memory condition.  Resort to
2708 * drastic measures to free some pages so we can allocate
2709 * another pv entry chunk.
2710 *
2711 * Returns NULL if PV entries were reclaimed from the specified pmap.
2712 *
2713 * We do not, however, unmap 2mpages because subsequent accesses will
2714 * allocate per-page pv entries until repromotion occurs, thereby
2715 * exacerbating the shortage of free pv entries.
2716 */
2717static vm_page_t
2718reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2719{
2720	struct pch new_tail;
2721	struct pv_chunk *pc;
2722	struct md_page *pvh;
2723	pd_entry_t *pde;
2724	pmap_t pmap;
2725	pt_entry_t *pte, tpte;
2726	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
2727	pv_entry_t pv;
2728	vm_offset_t va;
2729	vm_page_t m, m_pc;
2730	struct spglist free;
2731	uint64_t inuse;
2732	int bit, field, freed;
2733
2734	rw_assert(&pvh_global_lock, RA_LOCKED);
2735	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2736	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
2737	pmap = NULL;
2738	m_pc = NULL;
2739	PG_G = PG_A = PG_M = PG_RW = 0;
2740	SLIST_INIT(&free);
2741	TAILQ_INIT(&new_tail);
2742	mtx_lock(&pv_chunks_mutex);
2743	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
2744		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2745		mtx_unlock(&pv_chunks_mutex);
2746		if (pmap != pc->pc_pmap) {
2747			if (pmap != NULL) {
2748				pmap_invalidate_all(pmap);
2749				if (pmap != locked_pmap)
2750					PMAP_UNLOCK(pmap);
2751			}
2752			pmap = pc->pc_pmap;
2753			/* Avoid deadlock and lock recursion. */
2754			if (pmap > locked_pmap) {
2755				RELEASE_PV_LIST_LOCK(lockp);
2756				PMAP_LOCK(pmap);
2757			} else if (pmap != locked_pmap &&
2758			    !PMAP_TRYLOCK(pmap)) {
2759				pmap = NULL;
2760				TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2761				mtx_lock(&pv_chunks_mutex);
2762				continue;
2763			}
2764			PG_G = pmap_global_bit(pmap);
2765			PG_A = pmap_accessed_bit(pmap);
2766			PG_M = pmap_modified_bit(pmap);
2767			PG_RW = pmap_rw_bit(pmap);
2768		}
2769
2770		/*
2771		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2772		 */
2773		freed = 0;
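		/*
		 * "inuse" holds one bit for every allocated pv entry in the
		 * current map word; bsfq() selects the lowest set bit, which
		 * is then cleared from "inuse" before the next iteration.
		 */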
2774		for (field = 0; field < _NPCM; field++) {
2775			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2776			    inuse != 0; inuse &= ~(1UL << bit)) {
2777				bit = bsfq(inuse);
2778				pv = &pc->pc_pventry[field * 64 + bit];
2779				va = pv->pv_va;
2780				pde = pmap_pde(pmap, va);
2781				if ((*pde & PG_PS) != 0)
2782					continue;
2783				pte = pmap_pde_to_pte(pde, va);
2784				if ((*pte & PG_W) != 0)
2785					continue;
2786				tpte = pte_load_clear(pte);
2787				if ((tpte & PG_G) != 0)
2788					pmap_invalidate_page(pmap, va);
2789				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2790				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2791					vm_page_dirty(m);
2792				if ((tpte & PG_A) != 0)
2793					vm_page_aflag_set(m, PGA_REFERENCED);
2794				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2795				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2796				m->md.pv_gen++;
2797				if (TAILQ_EMPTY(&m->md.pv_list) &&
2798				    (m->flags & PG_FICTITIOUS) == 0) {
2799					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2800					if (TAILQ_EMPTY(&pvh->pv_list)) {
2801						vm_page_aflag_clear(m,
2802						    PGA_WRITEABLE);
2803					}
2804				}
2805				pc->pc_map[field] |= 1UL << bit;
2806				pmap_unuse_pt(pmap, va, *pde, &free);
2807				freed++;
2808			}
2809		}
2810		if (freed == 0) {
2811			TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2812			mtx_lock(&pv_chunks_mutex);
2813			continue;
2814		}
2815		/* Every freed mapping is for a 4 KB page. */
2816		pmap_resident_count_dec(pmap, freed);
2817		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
2818		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
2819		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
2820		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2821		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
2822		    pc->pc_map[2] == PC_FREE2) {
2823			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2824			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2825			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2826			/* Entire chunk is free; return it. */
2827			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2828			dump_drop_page(m_pc->phys_addr);
2829			mtx_lock(&pv_chunks_mutex);
2830			break;
2831		}
2832		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2833		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2834		mtx_lock(&pv_chunks_mutex);
2835		/* One freed pv entry in locked_pmap is sufficient. */
2836		if (pmap == locked_pmap)
2837			break;
2838	}
2839	TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2840	mtx_unlock(&pv_chunks_mutex);
2841	if (pmap != NULL) {
2842		pmap_invalidate_all(pmap);
2843		if (pmap != locked_pmap)
2844			PMAP_UNLOCK(pmap);
2845	}
2846	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
2847		m_pc = SLIST_FIRST(&free);
2848		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2849		/* Recycle a freed page table page. */
2850		m_pc->wire_count = 1;
2851		atomic_add_int(&cnt.v_wire_count, 1);
2852	}
2853	pmap_free_zero_pages(&free);
2854	return (m_pc);
2855}
2856
2857/*
2858 * free the pv_entry back to the free list
2859 */
2860static void
2861free_pv_entry(pmap_t pmap, pv_entry_t pv)
2862{
2863	struct pv_chunk *pc;
2864	int idx, field, bit;
2865
2866	rw_assert(&pvh_global_lock, RA_LOCKED);
2867	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2868	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2869	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2870	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2871	pc = pv_to_chunk(pv);
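	/*
	 * Compute the entry's index within its chunk and mark the
	 * corresponding bit free in the chunk's allocation bitmap.
	 */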
2872	idx = pv - &pc->pc_pventry[0];
2873	field = idx / 64;
2874	bit = idx % 64;
2875	pc->pc_map[field] |= 1ul << bit;
2876	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
2877	    pc->pc_map[2] != PC_FREE2) {
2878		/* 98% of the time, pc is already at the head of the list. */
2879		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2880			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2881			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2882		}
2883		return;
2884	}
2885	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2886	free_pv_chunk(pc);
2887}
2888
2889static void
2890free_pv_chunk(struct pv_chunk *pc)
2891{
2892	vm_page_t m;
2893
2894	mtx_lock(&pv_chunks_mutex);
2895 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2896	mtx_unlock(&pv_chunks_mutex);
2897	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2898	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2899	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2900	/* entire chunk is free, return it */
2901	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2902	dump_drop_page(m->phys_addr);
2903	vm_page_unwire(m, 0);
2904	vm_page_free(m);
2905}
2906
2907/*
2908 * Returns a new PV entry, allocating a new PV chunk from the system when
2909 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
2910 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
2911 * returned.
2912 *
2913 * The given PV list lock may be released.
2914 */
2915static pv_entry_t
2916get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2917{
2918	int bit, field;
2919	pv_entry_t pv;
2920	struct pv_chunk *pc;
2921	vm_page_t m;
2922
2923	rw_assert(&pvh_global_lock, RA_LOCKED);
2924	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2925	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2926retry:
2927	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2928	if (pc != NULL) {
2929		for (field = 0; field < _NPCM; field++) {
2930			if (pc->pc_map[field]) {
2931				bit = bsfq(pc->pc_map[field]);
2932				break;
2933			}
2934		}
2935		if (field < _NPCM) {
2936			pv = &pc->pc_pventry[field * 64 + bit];
2937			pc->pc_map[field] &= ~(1ul << bit);
2938			/* If this was the last item, move it to tail */
2939			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
2940			    pc->pc_map[2] == 0) {
2941				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2942				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2943				    pc_list);
2944			}
2945			PV_STAT(atomic_add_long(&pv_entry_count, 1));
2946			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2947			return (pv);
2948		}
2949	}
2950	/* No free items, allocate another chunk */
2951	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2952	    VM_ALLOC_WIRED);
2953	if (m == NULL) {
2954		if (lockp == NULL) {
2955			PV_STAT(pc_chunk_tryfail++);
2956			return (NULL);
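		/*
		 * reclaim_pv_chunk() returns NULL when it freed pv entries
		 * belonging to this pmap without recycling a page; in that
		 * case the retry below can allocate from an existing chunk.
		 */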
2957		}
2958		m = reclaim_pv_chunk(pmap, lockp);
2959		if (m == NULL)
2960			goto retry;
2961	}
2962	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2963	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2964	dump_add_page(m->phys_addr);
2965	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2966	pc->pc_pmap = pmap;
2967	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
2968	pc->pc_map[1] = PC_FREE1;
2969	pc->pc_map[2] = PC_FREE2;
2970	mtx_lock(&pv_chunks_mutex);
2971	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2972	mtx_unlock(&pv_chunks_mutex);
2973	pv = &pc->pc_pventry[0];
2974	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2975	PV_STAT(atomic_add_long(&pv_entry_count, 1));
2976	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2977	return (pv);
2978}
2979
2980/*
2981 * Returns the number of one bits within the given PV chunk map element.
2982 */
2983static int
2984popcnt_pc_map_elem(uint64_t elem)
2985{
2986	int count;
2987
2988	/*
2989	 * This simple method of counting the one bits performs well because
2990	 * the given element typically contains more zero bits than one bits.
2991	 */
2992	count = 0;
2993	for (; elem != 0; elem &= elem - 1)
2994		count++;
2995	return (count);
2996}
2997
2998/*
2999 * Ensure that the number of spare PV entries in the specified pmap meets or
3000 * exceeds the given count, "needed".
3001 *
3002 * The given PV list lock may be released.
3003 */
3004static void
3005reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3006{
3007	struct pch new_tail;
3008	struct pv_chunk *pc;
3009	int avail, free;
3010	vm_page_t m;
3011
3012	rw_assert(&pvh_global_lock, RA_LOCKED);
3013	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3014	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3015
3016	/*
3017	 * Newly allocated PV chunks must be stored in a private list until
3018	 * the required number of PV chunks have been allocated.  Otherwise,
3019	 * reclaim_pv_chunk() could recycle one of these chunks.  In
3020	 * contrast, these chunks must be added to the pmap upon allocation.
3021	 */
3022	TAILQ_INIT(&new_tail);
3023retry:
3024	avail = 0;
3025	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3026		if ((cpu_feature2 & CPUID2_POPCNT) == 0) {
3027			free = popcnt_pc_map_elem(pc->pc_map[0]);
3028			free += popcnt_pc_map_elem(pc->pc_map[1]);
3029			free += popcnt_pc_map_elem(pc->pc_map[2]);
3030		} else {
3031			free = popcntq(pc->pc_map[0]);
3032			free += popcntq(pc->pc_map[1]);
3033			free += popcntq(pc->pc_map[2]);
3034		}
3035		if (free == 0)
3036			break;
3037		avail += free;
3038		if (avail >= needed)
3039			break;
3040	}
3041	for (; avail < needed; avail += _NPCPV) {
3042		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3043		    VM_ALLOC_WIRED);
3044		if (m == NULL) {
3045			m = reclaim_pv_chunk(pmap, lockp);
3046			if (m == NULL)
3047				goto retry;
3048		}
3049		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3050		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3051		dump_add_page(m->phys_addr);
3052		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3053		pc->pc_pmap = pmap;
3054		pc->pc_map[0] = PC_FREE0;
3055		pc->pc_map[1] = PC_FREE1;
3056		pc->pc_map[2] = PC_FREE2;
3057		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3058		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
3059		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3060	}
3061	if (!TAILQ_EMPTY(&new_tail)) {
3062		mtx_lock(&pv_chunks_mutex);
3063		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
3064		mtx_unlock(&pv_chunks_mutex);
3065	}
3066}
3067
3068/*
3069 * First find and then remove the pv entry for the specified pmap and virtual
3070 * address from the specified pv list.  Returns the pv entry if found and NULL
3071 * otherwise.  This operation can be performed on pv lists for either 4KB or
3072 * 2MB page mappings.
3073 */
3074static __inline pv_entry_t
3075pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3076{
3077	pv_entry_t pv;
3078
3079	rw_assert(&pvh_global_lock, RA_LOCKED);
3080	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3081		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3082			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3083			pvh->pv_gen++;
3084			break;
3085		}
3086	}
3087	return (pv);
3088}
3089
3090/*
3091 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3092 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3093 * entries for each of the 4KB page mappings.
3094 */
3095static void
3096pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3097    struct rwlock **lockp)
3098{
3099	struct md_page *pvh;
3100	struct pv_chunk *pc;
3101	pv_entry_t pv;
3102	vm_offset_t va_last;
3103	vm_page_t m;
3104	int bit, field;
3105
3106	rw_assert(&pvh_global_lock, RA_LOCKED);
3107	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3108	KASSERT((pa & PDRMASK) == 0,
3109	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
3110	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3111
3112	/*
3113	 * Transfer the 2mpage's pv entry for this mapping to the first
3114	 * page's pv list.  Once this transfer begins, the pv list lock
3115	 * must not be released until the last pv entry is reinstantiated.
3116	 */
3117	pvh = pa_to_pvh(pa);
3118	va = trunc_2mpage(va);
3119	pv = pmap_pvh_remove(pvh, pmap, va);
3120	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
3121	m = PHYS_TO_VM_PAGE(pa);
3122	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3123	m->md.pv_gen++;
3124	/* Instantiate the remaining NPTEPG - 1 pv entries. */
3125	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
3126	va_last = va + NBPDR - PAGE_SIZE;
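	/*
	 * Consume the spare pv entries that were reserved by
	 * reserve_pv_entries() directly from the pmap's pv chunks;
	 * get_pv_entry() is deliberately not called here, so no
	 * reclamation can occur while the pv lists are inconsistent.
	 */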
3127	for (;;) {
3128		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3129		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
3130		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
3131		for (field = 0; field < _NPCM; field++) {
3132			while (pc->pc_map[field]) {
3133				bit = bsfq(pc->pc_map[field]);
3134				pc->pc_map[field] &= ~(1ul << bit);
3135				pv = &pc->pc_pventry[field * 64 + bit];
3136				va += PAGE_SIZE;
3137				pv->pv_va = va;
3138				m++;
3139				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3140			    ("pmap_pv_demote_pde: page %p is not managed", m));
3141				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3142				m->md.pv_gen++;
3143				if (va == va_last)
3144					goto out;
3145			}
3146		}
3147		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3148		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3149	}
3150out:
3151	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
3152		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3153		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3154	}
3155	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
3156	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
3157}
3158
3159/*
3160 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
3161 * replace the many pv entries for the 4KB page mappings by a single pv entry
3162 * for the 2MB page mapping.
3163 */
3164static void
3165pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3166    struct rwlock **lockp)
3167{
3168	struct md_page *pvh;
3169	pv_entry_t pv;
3170	vm_offset_t va_last;
3171	vm_page_t m;
3172
3173	rw_assert(&pvh_global_lock, RA_LOCKED);
3174	KASSERT((pa & PDRMASK) == 0,
3175	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
3176	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3177
3178	/*
3179	 * Transfer the first page's pv entry for this mapping to the 2mpage's
3180	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
3181	 * a transfer avoids the possibility that get_pv_entry() calls
3182	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
3183	 * mappings that is being promoted.
3184	 */
3185	m = PHYS_TO_VM_PAGE(pa);
3186	va = trunc_2mpage(va);
3187	pv = pmap_pvh_remove(&m->md, pmap, va);
3188	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
3189	pvh = pa_to_pvh(pa);
3190	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3191	pvh->pv_gen++;
3192	/* Free the remaining NPTEPG - 1 pv entries. */
3193	va_last = va + NBPDR - PAGE_SIZE;
3194	do {
3195		m++;
3196		va += PAGE_SIZE;
3197		pmap_pvh_free(&m->md, pmap, va);
3198	} while (va < va_last);
3199}
3200
3201/*
3202 * First find and then destroy the pv entry for the specified pmap and virtual
3203 * address.  This operation can be performed on pv lists for either 4KB or 2MB
3204 * page mappings.
3205 */
3206static void
3207pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3208{
3209	pv_entry_t pv;
3210
3211	pv = pmap_pvh_remove(pvh, pmap, va);
3212	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3213	free_pv_entry(pmap, pv);
3214}
3215
3216/*
3217 * Conditionally create the PV entry for a 4KB page mapping if the required
3218 * memory can be allocated without resorting to reclamation.
3219 */
3220static boolean_t
3221pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3222    struct rwlock **lockp)
3223{
3224	pv_entry_t pv;
3225
3226	rw_assert(&pvh_global_lock, RA_LOCKED);
3227	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3228	/* Pass NULL instead of the lock pointer to disable reclamation. */
3229	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3230		pv->pv_va = va;
3231		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3232		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3233		m->md.pv_gen++;
3234		return (TRUE);
3235	} else
3236		return (FALSE);
3237}
3238
3239/*
3240 * Conditionally create the PV entry for a 2MB page mapping if the required
3241 * memory can be allocated without resorting to reclamation.
3242 */
3243static boolean_t
3244pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3245    struct rwlock **lockp)
3246{
3247	struct md_page *pvh;
3248	pv_entry_t pv;
3249
3250	rw_assert(&pvh_global_lock, RA_LOCKED);
3251	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3252	/* Pass NULL instead of the lock pointer to disable reclamation. */
3253	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3254		pv->pv_va = va;
3255		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3256		pvh = pa_to_pvh(pa);
3257		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3258		pvh->pv_gen++;
3259		return (TRUE);
3260	} else
3261		return (FALSE);
3262}
3263
3264/*
3265 * Fills a page table page with mappings to consecutive physical pages.
3266 */
3267static void
3268pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
3269{
3270	pt_entry_t *pte;
3271
3272	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
3273		*pte = newpte;
3274		newpte += PAGE_SIZE;
3275	}
3276}
3277
3278/*
3279 * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
3280 * mapping is invalidated.
3281 */
3282static boolean_t
3283pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3284{
3285	struct rwlock *lock;
3286	boolean_t rv;
3287
3288	lock = NULL;
3289	rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
3290	if (lock != NULL)
3291		rw_wunlock(lock);
3292	return (rv);
3293}
3294
3295static boolean_t
3296pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3297    struct rwlock **lockp)
3298{
3299	pd_entry_t newpde, oldpde;
3300	pt_entry_t *firstpte, newpte;
3301	pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
3302	vm_paddr_t mptepa;
3303	vm_page_t mpte;
3304	struct spglist free;
3305	int PG_PTE_CACHE;
3306
3307	PG_G = pmap_global_bit(pmap);
3308	PG_A = pmap_accessed_bit(pmap);
3309	PG_M = pmap_modified_bit(pmap);
3310	PG_RW = pmap_rw_bit(pmap);
3311	PG_V = pmap_valid_bit(pmap);
3312	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
3313
3314	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3315	oldpde = *pde;
3316	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
3317	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
3318	if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
3319	    NULL)
3320		pmap_remove_pt_page(pmap, mpte);
3321	else {
3322		KASSERT((oldpde & PG_W) == 0,
3323		    ("pmap_demote_pde: page table page for a wired mapping"
3324		    " is missing"));
3325
3326		/*
3327		 * Invalidate the 2MB page mapping and return "failure" if the
3328		 * mapping was never accessed or the allocation of the new
3329		 * page table page fails.  If the 2MB page mapping belongs to
3330		 * the direct map region of the kernel's address space, then
3331		 * the page allocation request specifies the highest possible
3332		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
3333		 * normal.  Page table pages are preallocated for every other
3334		 * part of the kernel address space, so the direct map region
3335		 * is the only part of the kernel address space that must be
3336		 * handled here.
3337		 */
3338		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
3339		    pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
3340		    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
3341		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
3342			SLIST_INIT(&free);
3343			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
3344			    lockp);
3345			pmap_invalidate_page(pmap, trunc_2mpage(va));
3346			pmap_free_zero_pages(&free);
3347			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
3348			    " in pmap %p", va, pmap);
3349			return (FALSE);
3350		}
3351		if (va < VM_MAXUSER_ADDRESS)
3352			pmap_resident_count_inc(pmap, 1);
3353	}
3354	mptepa = VM_PAGE_TO_PHYS(mpte);
3355	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
3356	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
3357	KASSERT((oldpde & PG_A) != 0,
3358	    ("pmap_demote_pde: oldpde is missing PG_A"));
3359	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
3360	    ("pmap_demote_pde: oldpde is missing PG_M"));
3361	newpte = oldpde & ~PG_PS;
3362	newpte = pmap_swap_pat(pmap, newpte);
3363
3364	/*
3365	 * If the page table page is new, initialize it.
3366	 */
3367	if (mpte->wire_count == 1) {
3368		mpte->wire_count = NPTEPG;
3369		pmap_fill_ptp(firstpte, newpte);
3370	}
3371	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
3372	    ("pmap_demote_pde: firstpte and newpte map different physical"
3373	    " addresses"));
3374
3375	/*
3376	 * If the mapping has changed attributes, update the page table
3377	 * entries.
3378	 */
3379	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
3380		pmap_fill_ptp(firstpte, newpte);
3381
3382	/*
3383	 * The spare PV entries must be reserved prior to demoting the
3384	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
3385	 * of the PDE and the PV lists will be inconsistent, which can result
3386	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
3387	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
3388	 * PV entry for the 2MB page mapping that is being demoted.
3389	 */
3390	if ((oldpde & PG_MANAGED) != 0)
3391		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
3392
3393	/*
3394	 * Demote the mapping.  This pmap is locked.  The old PDE has
3395	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
3396	 * set.  Thus, there is no danger of a race with another
3397	 * processor changing the setting of PG_A and/or PG_M between
3398	 * the read above and the store below.
3399	 */
3400	if (workaround_erratum383)
3401		pmap_update_pde(pmap, va, pde, newpde);
3402	else
3403		pde_store(pde, newpde);
3404
3405	/*
3406	 * Invalidate a stale recursive mapping of the page table page.
3407	 */
3408	if (va >= VM_MAXUSER_ADDRESS)
3409		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3410
3411	/*
3412	 * Demote the PV entry.
3413	 */
3414	if ((oldpde & PG_MANAGED) != 0)
3415		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
3416
3417	atomic_add_long(&pmap_pde_demotions, 1);
3418	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
3419	    " in pmap %p", va, pmap);
3420	return (TRUE);
3421}
3422
3423/*
3424 * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
3425 */
3426static void
3427pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3428{
3429	pd_entry_t newpde;
3430	vm_paddr_t mptepa;
3431	vm_page_t mpte;
3432
3433	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3434	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3435	mpte = pmap_lookup_pt_page(pmap, va);
3436	if (mpte == NULL)
3437		panic("pmap_remove_kernel_pde: Missing pt page.");
3438
3439	pmap_remove_pt_page(pmap, mpte);
3440	mptepa = VM_PAGE_TO_PHYS(mpte);
3441	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
3442
3443	/*
3444	 * Initialize the page table page.
3445	 */
3446	pagezero((void *)PHYS_TO_DMAP(mptepa));
3447
3448	/*
3449	 * Demote the mapping.
3450	 */
3451	if (workaround_erratum383)
3452		pmap_update_pde(pmap, va, pde, newpde);
3453	else
3454		pde_store(pde, newpde);
3455
3456	/*
3457	 * Invalidate a stale recursive mapping of the page table page.
3458	 */
3459	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3460}
3461
3462/*
3463 * pmap_remove_pde: unmap a 2MB superpage from a process address space.
3464 */
3465static int
3466pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
3467    struct spglist *free, struct rwlock **lockp)
3468{
3469	struct md_page *pvh;
3470	pd_entry_t oldpde;
3471	vm_offset_t eva, va;
3472	vm_page_t m, mpte;
3473	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
3474
3475	PG_G = pmap_global_bit(pmap);
3476	PG_A = pmap_accessed_bit(pmap);
3477	PG_M = pmap_modified_bit(pmap);
3478	PG_RW = pmap_rw_bit(pmap);
3479
3480	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3481	KASSERT((sva & PDRMASK) == 0,
3482	    ("pmap_remove_pde: sva is not 2mpage aligned"));
3483	oldpde = pte_load_clear(pdq);
3484	if (oldpde & PG_W)
3485		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
3486
3487	/*
3488	 * Machines that don't support invlpg also don't support
3489	 * PG_G.
3490	 */
3491	if (oldpde & PG_G)
3492		pmap_invalidate_page(kernel_pmap, sva);
3493	pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
3494	if (oldpde & PG_MANAGED) {
3495		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
3496		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
3497		pmap_pvh_free(pvh, pmap, sva);
3498		eva = sva + NBPDR;
3499		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3500		    va < eva; va += PAGE_SIZE, m++) {
3501			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3502				vm_page_dirty(m);
3503			if (oldpde & PG_A)
3504				vm_page_aflag_set(m, PGA_REFERENCED);
3505			if (TAILQ_EMPTY(&m->md.pv_list) &&
3506			    TAILQ_EMPTY(&pvh->pv_list))
3507				vm_page_aflag_clear(m, PGA_WRITEABLE);
3508		}
3509	}
3510	if (pmap == kernel_pmap) {
3511		pmap_remove_kernel_pde(pmap, pdq, sva);
3512	} else {
3513		mpte = pmap_lookup_pt_page(pmap, sva);
3514		if (mpte != NULL) {
3515			pmap_remove_pt_page(pmap, mpte);
3516			pmap_resident_count_dec(pmap, 1);
3517			KASSERT(mpte->wire_count == NPTEPG,
3518			    ("pmap_remove_pde: pte page wire count error"));
3519			mpte->wire_count = 0;
3520			pmap_add_delayed_free_list(mpte, free, FALSE);
3521			atomic_subtract_int(&cnt.v_wire_count, 1);
3522		}
3523	}
3524	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
3525}
3526
3527/*
3528 * pmap_remove_pte: unmap a single 4KB page from a process address space.
3529 */
3530static int
3531pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
3532    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
3533{
3534	struct md_page *pvh;
3535	pt_entry_t oldpte, PG_A, PG_M, PG_RW;
3536	vm_page_t m;
3537
3538	PG_A = pmap_accessed_bit(pmap);
3539	PG_M = pmap_modified_bit(pmap);
3540	PG_RW = pmap_rw_bit(pmap);
3541
3542	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3543	oldpte = pte_load_clear(ptq);
3544	if (oldpte & PG_W)
3545		pmap->pm_stats.wired_count -= 1;
3546	pmap_resident_count_dec(pmap, 1);
3547	if (oldpte & PG_MANAGED) {
3548		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
3549		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3550			vm_page_dirty(m);
3551		if (oldpte & PG_A)
3552			vm_page_aflag_set(m, PGA_REFERENCED);
3553		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3554		pmap_pvh_free(&m->md, pmap, va);
3555		if (TAILQ_EMPTY(&m->md.pv_list) &&
3556		    (m->flags & PG_FICTITIOUS) == 0) {
3557			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3558			if (TAILQ_EMPTY(&pvh->pv_list))
3559				vm_page_aflag_clear(m, PGA_WRITEABLE);
3560		}
3561	}
3562	return (pmap_unuse_pt(pmap, va, ptepde, free));
3563}
3564
3565/*
3566 * Remove a single page from a process address space
3567 */
3568static void
3569pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
3570    struct spglist *free)
3571{
3572	struct rwlock *lock;
3573	pt_entry_t *pte, PG_V;
3574
3575	PG_V = pmap_valid_bit(pmap);
3576	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3577	if ((*pde & PG_V) == 0)
3578		return;
3579	pte = pmap_pde_to_pte(pde, va);
3580	if ((*pte & PG_V) == 0)
3581		return;
3582	lock = NULL;
3583	pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
3584	if (lock != NULL)
3585		rw_wunlock(lock);
3586	pmap_invalidate_page(pmap, va);
3587}
3588
3589/*
3590 *	Remove the given range of addresses from the specified map.
3591 *
3592 *	It is assumed that the start and end are properly
3593 *	rounded to the page size.
3594 */
3595void
3596pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3597{
3598	struct rwlock *lock;
3599	vm_offset_t va, va_next;
3600	pml4_entry_t *pml4e;
3601	pdp_entry_t *pdpe;
3602	pd_entry_t ptpaddr, *pde;
3603	pt_entry_t *pte, PG_G, PG_V;
3604	struct spglist free;
3605	int anyvalid;
3606
3607	PG_G = pmap_global_bit(pmap);
3608	PG_V = pmap_valid_bit(pmap);
3609
3610	/*
3611	 * Perform an unsynchronized read.  This is, however, safe.
3612	 */
3613	if (pmap->pm_stats.resident_count == 0)
3614		return;
3615
3616	anyvalid = 0;
3617	SLIST_INIT(&free);
3618
3619	rw_rlock(&pvh_global_lock);
3620	PMAP_LOCK(pmap);
3621
3622	/*
3623	 * Special handling for removing a single page: this is a very
3624	 * common operation, so it is worth short-circuiting the general
3625	 * removal code below.
3626	 */
3627	if (sva + PAGE_SIZE == eva) {
3628		pde = pmap_pde(pmap, sva);
3629		if (pde && (*pde & PG_PS) == 0) {
3630			pmap_remove_page(pmap, sva, pde, &free);
3631			goto out;
3632		}
3633	}
3634
3635	lock = NULL;
3636	for (; sva < eva; sva = va_next) {
3637
3638		if (pmap->pm_stats.resident_count == 0)
3639			break;
3640
3641		pml4e = pmap_pml4e(pmap, sva);
3642		if ((*pml4e & PG_V) == 0) {
3643			va_next = (sva + NBPML4) & ~PML4MASK;
3644			if (va_next < sva)
3645				va_next = eva;
3646			continue;
3647		}
3648
3649		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3650		if ((*pdpe & PG_V) == 0) {
3651			va_next = (sva + NBPDP) & ~PDPMASK;
3652			if (va_next < sva)
3653				va_next = eva;
3654			continue;
3655		}
3656
3657		/*
3658		 * Calculate index for next page table.
3659		 */
3660		va_next = (sva + NBPDR) & ~PDRMASK;
3661		if (va_next < sva)
3662			va_next = eva;
3663
3664		pde = pmap_pdpe_to_pde(pdpe, sva);
3665		ptpaddr = *pde;
3666
3667		/*
3668		 * Weed out invalid mappings.
3669		 */
3670		if (ptpaddr == 0)
3671			continue;
3672
3673		/*
3674		 * Check for large page.
3675		 */
3676		if ((ptpaddr & PG_PS) != 0) {
3677			/*
3678			 * Are we removing the entire large page?  If not,
3679			 * demote the mapping and fall through.
3680			 */
3681			if (sva + NBPDR == va_next && eva >= va_next) {
3682				/*
3683				 * The TLB entry for a PG_G mapping is
3684				 * invalidated by pmap_remove_pde().
3685				 */
3686				if ((ptpaddr & PG_G) == 0)
3687					anyvalid = 1;
3688				pmap_remove_pde(pmap, pde, sva, &free, &lock);
3689				continue;
3690			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
3691			    &lock)) {
3692				/* The large page mapping was destroyed. */
3693				continue;
3694			} else
3695				ptpaddr = *pde;
3696		}
3697
3698		/*
3699		 * Limit our scan to either the end of the va represented
3700		 * by the current page table page, or to the end of the
3701		 * range being removed.
3702		 */
3703		if (va_next > eva)
3704			va_next = eva;
3705
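		/*
		 * "va" tracks the start of a run of global (PG_G) mappings
		 * being removed.  Such mappings do not set "anyvalid", so
		 * the final pmap_invalidate_all() may never be performed;
		 * instead, their TLB entries are flushed explicitly with
		 * pmap_invalidate_range() whenever the run ends.
		 */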
3706		va = va_next;
3707		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3708		    sva += PAGE_SIZE) {
3709			if (*pte == 0) {
3710				if (va != va_next) {
3711					pmap_invalidate_range(pmap, va, sva);
3712					va = va_next;
3713				}
3714				continue;
3715			}
3716			if ((*pte & PG_G) == 0)
3717				anyvalid = 1;
3718			else if (va == va_next)
3719				va = sva;
3720			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
3721			    &lock)) {
3722				sva += PAGE_SIZE;
3723				break;
3724			}
3725		}
3726		if (va != va_next)
3727			pmap_invalidate_range(pmap, va, sva);
3728	}
3729	if (lock != NULL)
3730		rw_wunlock(lock);
3731out:
3732	if (anyvalid)
3733		pmap_invalidate_all(pmap);
3734	rw_runlock(&pvh_global_lock);
3735	PMAP_UNLOCK(pmap);
3736	pmap_free_zero_pages(&free);
3737}
3738
3739/*
3740 *	Routine:	pmap_remove_all
3741 *	Function:
3742 *		Removes this physical page from
3743 *		all physical maps in which it resides.
3744 *		Reflects back modify bits to the pager.
3745 *
3746 *	Notes:
3747 *		Original versions of this routine were very
3748 *		inefficient because they iteratively called
3749 *		pmap_remove (slow...)
3750 */
3751
3752void
3753pmap_remove_all(vm_page_t m)
3754{
3755	struct md_page *pvh;
3756	pv_entry_t pv;
3757	pmap_t pmap;
3758	pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
3759	pd_entry_t *pde;
3760	vm_offset_t va;
3761	struct spglist free;
3762
3763	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3764	    ("pmap_remove_all: page %p is not managed", m));
3765	SLIST_INIT(&free);
3766	rw_wlock(&pvh_global_lock);
3767	if ((m->flags & PG_FICTITIOUS) != 0)
3768		goto small_mappings;
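	/*
	 * First, demote any 2MB mappings of the page so that the loop
	 * below only has to deal with 4KB mappings.
	 */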
3769	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3770	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3771		pmap = PV_PMAP(pv);
3772		PMAP_LOCK(pmap);
3773		va = pv->pv_va;
3774		pde = pmap_pde(pmap, va);
3775		(void)pmap_demote_pde(pmap, pde, va);
3776		PMAP_UNLOCK(pmap);
3777	}
3778small_mappings:
3779	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3780		pmap = PV_PMAP(pv);
3781		PMAP_LOCK(pmap);
3782		PG_A = pmap_accessed_bit(pmap);
3783		PG_M = pmap_modified_bit(pmap);
3784		PG_RW = pmap_rw_bit(pmap);
3785		pmap_resident_count_dec(pmap, 1);
3786		pde = pmap_pde(pmap, pv->pv_va);
3787		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3788		    " a 2mpage in page %p's pv list", m));
3789		pte = pmap_pde_to_pte(pde, pv->pv_va);
3790		tpte = pte_load_clear(pte);
3791		if (tpte & PG_W)
3792			pmap->pm_stats.wired_count--;
3793		if (tpte & PG_A)
3794			vm_page_aflag_set(m, PGA_REFERENCED);
3795
3796		/*
3797		 * Update the vm_page_t clean and reference bits.
3798		 */
3799		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3800			vm_page_dirty(m);
3801		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
3802		pmap_invalidate_page(pmap, pv->pv_va);
3803		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3804		m->md.pv_gen++;
3805		free_pv_entry(pmap, pv);
3806		PMAP_UNLOCK(pmap);
3807	}
3808	vm_page_aflag_clear(m, PGA_WRITEABLE);
3809	rw_wunlock(&pvh_global_lock);
3810	pmap_free_zero_pages(&free);
3811}
3812
3813/*
3814 * pmap_protect_pde: do the things to protect a 2mpage in a process
3815 */
3816static boolean_t
3817pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3818{
3819	pd_entry_t newpde, oldpde;
3820	vm_offset_t eva, va;
3821	vm_page_t m;
3822	boolean_t anychanged;
3823	pt_entry_t PG_G, PG_M, PG_RW;
3824
3825	PG_G = pmap_global_bit(pmap);
3826	PG_M = pmap_modified_bit(pmap);
3827	PG_RW = pmap_rw_bit(pmap);
3828
3829	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3830	KASSERT((sva & PDRMASK) == 0,
3831	    ("pmap_protect_pde: sva is not 2mpage aligned"));
3832	anychanged = FALSE;
3833retry:
3834	oldpde = newpde = *pde;
3835	if (oldpde & PG_MANAGED) {
3836		eva = sva + NBPDR;
3837		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3838		    va < eva; va += PAGE_SIZE, m++)
3839			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3840				vm_page_dirty(m);
3841	}
3842	if ((prot & VM_PROT_WRITE) == 0)
3843		newpde &= ~(PG_RW | PG_M);
3844	if ((prot & VM_PROT_EXECUTE) == 0)
3845		newpde |= pg_nx;
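	/*
	 * Update the PDE with a compare-and-swap: another processor may
	 * concurrently set PG_A or PG_M in it, in which case the update
	 * is retried.
	 */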
3846	if (newpde != oldpde) {
3847		if (!atomic_cmpset_long(pde, oldpde, newpde))
3848			goto retry;
3849		if (oldpde & PG_G)
3850			pmap_invalidate_page(pmap, sva);
3851		else
3852			anychanged = TRUE;
3853	}
3854	return (anychanged);
3855}
3856
3857/*
3858 *	Set the physical protection on the
3859 *	specified range of this map as requested.
3860 */
3861void
3862pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3863{
3864	vm_offset_t va_next;
3865	pml4_entry_t *pml4e;
3866	pdp_entry_t *pdpe;
3867	pd_entry_t ptpaddr, *pde;
3868	pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
3869	boolean_t anychanged, pv_lists_locked;
3870
3871	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
3872	if (prot == VM_PROT_NONE) {
3873		pmap_remove(pmap, sva, eva);
3874		return;
3875	}
3876
3877	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3878	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3879		return;
3880
3881	PG_G = pmap_global_bit(pmap);
3882	PG_M = pmap_modified_bit(pmap);
3883	PG_V = pmap_valid_bit(pmap);
3884	PG_RW = pmap_rw_bit(pmap);
3885	pv_lists_locked = FALSE;
3886resume:
3887	anychanged = FALSE;
3888
3889	PMAP_LOCK(pmap);
3890	for (; sva < eva; sva = va_next) {
3891
3892		pml4e = pmap_pml4e(pmap, sva);
3893		if ((*pml4e & PG_V) == 0) {
3894			va_next = (sva + NBPML4) & ~PML4MASK;
3895			if (va_next < sva)
3896				va_next = eva;
3897			continue;
3898		}
3899
3900		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3901		if ((*pdpe & PG_V) == 0) {
3902			va_next = (sva + NBPDP) & ~PDPMASK;
3903			if (va_next < sva)
3904				va_next = eva;
3905			continue;
3906		}
3907
3908		va_next = (sva + NBPDR) & ~PDRMASK;
3909		if (va_next < sva)
3910			va_next = eva;
3911
3912		pde = pmap_pdpe_to_pde(pdpe, sva);
3913		ptpaddr = *pde;
3914
3915		/*
3916		 * Weed out invalid mappings.
3917		 */
3918		if (ptpaddr == 0)
3919			continue;
3920
3921		/*
3922		 * Check for large page.
3923		 */
3924		if ((ptpaddr & PG_PS) != 0) {
3925			/*
3926			 * Are we protecting the entire large page?  If not,
3927			 * demote the mapping and fall through.
3928			 */
3929			if (sva + NBPDR == va_next && eva >= va_next) {
3930				/*
3931				 * The TLB entry for a PG_G mapping is
3932				 * invalidated by pmap_protect_pde().
3933				 */
3934				if (pmap_protect_pde(pmap, pde, sva, prot))
3935					anychanged = TRUE;
3936				continue;
3937			} else {
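				/*
				 * Demotion needs the pv lists, which are
				 * protected by pvh_global_lock.  Try-lock it
				 * to preserve the lock order; if that fails,
				 * drop the pmap lock, acquire the global
				 * lock, and restart the scan at the current
				 * sva.
				 */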
3938				if (!pv_lists_locked) {
3939					pv_lists_locked = TRUE;
3940					if (!rw_try_rlock(&pvh_global_lock)) {
3941						if (anychanged)
3942							pmap_invalidate_all(
3943							    pmap);
3944						PMAP_UNLOCK(pmap);
3945						rw_rlock(&pvh_global_lock);
3946						goto resume;
3947					}
3948				}
3949				if (!pmap_demote_pde(pmap, pde, sva)) {
3950					/*
3951					 * The large page mapping was
3952					 * destroyed.
3953					 */
3954					continue;
3955				}
3956			}
3957		}
3958
3959		if (va_next > eva)
3960			va_next = eva;
3961
3962		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3963		    sva += PAGE_SIZE) {
3964			pt_entry_t obits, pbits;
3965			vm_page_t m;
3966
3967retry:
3968			obits = pbits = *pte;
3969			if ((pbits & PG_V) == 0)
3970				continue;
3971
3972			if ((prot & VM_PROT_WRITE) == 0) {
3973				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3974				    (PG_MANAGED | PG_M | PG_RW)) {
3975					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3976					vm_page_dirty(m);
3977				}
3978				pbits &= ~(PG_RW | PG_M);
3979			}
3980			if ((prot & VM_PROT_EXECUTE) == 0)
3981				pbits |= pg_nx;
3982
3983			if (pbits != obits) {
3984				if (!atomic_cmpset_long(pte, obits, pbits))
3985					goto retry;
3986				if (obits & PG_G)
3987					pmap_invalidate_page(pmap, sva);
3988				else
3989					anychanged = TRUE;
3990			}
3991		}
3992	}
3993	if (anychanged)
3994		pmap_invalidate_all(pmap);
3995	if (pv_lists_locked)
3996		rw_runlock(&pvh_global_lock);
3997	PMAP_UNLOCK(pmap);
3998}
3999
4000/*
4001 * Tries to promote the 512, contiguous 4KB page mappings that are within a
4002 * single page table page (PTP) to a single 2MB page mapping.  For promotion
4003 * to occur, two conditions must be met: (1) the 4KB page mappings must map
4004 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4005 * identical characteristics.
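 * (Numerically: NPTEPG * PAGE_SIZE == 512 * 4KB == 2MB == NBPDR.)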
4006 */
4007static void
4008pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
4009    struct rwlock **lockp)
4010{
4011	pd_entry_t newpde;
4012	pt_entry_t *firstpte, oldpte, pa, *pte;
4013	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
4014	vm_offset_t oldpteva;
4015	vm_page_t mpte;
4016	int PG_PTE_CACHE;
4017
4018	PG_A = pmap_accessed_bit(pmap);
4019	PG_G = pmap_global_bit(pmap);
4020	PG_M = pmap_modified_bit(pmap);
4021	PG_V = pmap_valid_bit(pmap);
4022	PG_RW = pmap_rw_bit(pmap);
4023	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
4024
4025	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4026
4027	/*
4028	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
4029	 * either invalid, unused, or does not map the first 4KB physical page
4030	 * within a 2MB page.
4031	 */
4032	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
4033setpde:
4034	newpde = *firstpte;
4035	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
4036		atomic_add_long(&pmap_pde_p_failures, 1);
4037		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4038		    " in pmap %p", va, pmap);
4039		return;
4040	}
4041	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
4042		/*
4043		 * When PG_M is already clear, PG_RW can be cleared without
4044		 * a TLB invalidation.
4045		 */
4046		if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
4047			goto setpde;
4048		newpde &= ~PG_RW;
4049	}
4050
4051	/*
4052	 * Examine each of the other PTEs in the specified PTP.  Abort if this
4053	 * PTE maps an unexpected 4KB physical page or does not have identical
4054	 * characteristics to the first PTE.
4055	 */
4056	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
4057	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
4058setpte:
4059		oldpte = *pte;
4060		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
4061			atomic_add_long(&pmap_pde_p_failures, 1);
4062			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4063			    " in pmap %p", va, pmap);
4064			return;
4065		}
4066		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
4067			/*
4068			 * When PG_M is already clear, PG_RW can be cleared
4069			 * without a TLB invalidation.
4070			 */
4071			if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
4072				goto setpte;
4073			oldpte &= ~PG_RW;
4074			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
4075			    (va & ~PDRMASK);
4076			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
4077			    " in pmap %p", oldpteva, pmap);
4078		}
4079		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
4080			atomic_add_long(&pmap_pde_p_failures, 1);
4081			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4082			    " in pmap %p", va, pmap);
4083			return;
4084		}
4085		pa -= PAGE_SIZE;
4086	}
4087
4088	/*
4089	 * Save the page table page in its current state until the PDE
4090	 * mapping the superpage is demoted by pmap_demote_pde() or
4091	 * destroyed by pmap_remove_pde().
4092	 */
4093	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4094	KASSERT(mpte >= vm_page_array &&
4095	    mpte < &vm_page_array[vm_page_array_size],
4096	    ("pmap_promote_pde: page table page is out of range"));
4097	KASSERT(mpte->pindex == pmap_pde_pindex(va),
4098	    ("pmap_promote_pde: page table page's pindex is wrong"));
4099	if (pmap_insert_pt_page(pmap, mpte)) {
4100		atomic_add_long(&pmap_pde_p_failures, 1);
4101		CTR2(KTR_PMAP,
4102		    "pmap_promote_pde: failure for va %#lx in pmap %p", va,
4103		    pmap);
4104		return;
4105	}
4106
4107	/*
4108	 * Promote the pv entries.
4109	 */
4110	if ((newpde & PG_MANAGED) != 0)
4111		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
4112
4113	/*
4114	 * Propagate the PAT index to its proper position.
4115	 */
4116	newpde = pmap_swap_pat(pmap, newpde);
4117
4118	/*
4119	 * Map the superpage.
4120	 */
4121	if (workaround_erratum383)
4122		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
4123	else
4124		pde_store(pde, PG_PS | newpde);
4125
4126	atomic_add_long(&pmap_pde_promotions, 1);
4127	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
4128	    " in pmap %p", va, pmap);
4129}
4130
4131/*
4132 *	Insert the given physical page (m) at
4133 *	the specified virtual address (va) in the
4134 *	target physical map with the protection requested.
4135 *
4136 *	If specified, the page will be wired down, meaning
4137 *	that the related pte cannot be reclaimed.
4138 *
4139 *	NB:  This is the only routine which MAY NOT lazy-evaluate
4140 *	or lose information.  That is, this routine must actually
4141 *	insert this page into the given map NOW.
4142 */
4143int
4144pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4145    u_int flags, int8_t psind __unused)
4146{
4147	struct rwlock *lock;
4148	pd_entry_t *pde;
4149	pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
4150	pt_entry_t newpte, origpte;
4151	pv_entry_t pv;
4152	vm_paddr_t opa, pa;
4153	vm_page_t mpte, om;
4154	boolean_t nosleep;
4155
4156	PG_A = pmap_accessed_bit(pmap);
4157	PG_G = pmap_global_bit(pmap);
4158	PG_M = pmap_modified_bit(pmap);
4159	PG_V = pmap_valid_bit(pmap);
4160	PG_RW = pmap_rw_bit(pmap);
4161
4162	va = trunc_page(va);
4163	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
4164	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
4165	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
4166	    va));
4167	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
4168	    va >= kmi.clean_eva,
4169	    ("pmap_enter: managed mapping within the clean submap"));
4170	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
4171		VM_OBJECT_ASSERT_LOCKED(m->object);
4172	pa = VM_PAGE_TO_PHYS(m);
4173	newpte = (pt_entry_t)(pa | PG_A | PG_V);
4174	if ((flags & VM_PROT_WRITE) != 0)
4175		newpte |= PG_M;
4176	if ((prot & VM_PROT_WRITE) != 0)
4177		newpte |= PG_RW;
4178	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
4179	    ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
4180	if ((prot & VM_PROT_EXECUTE) == 0)
4181		newpte |= pg_nx;
4182	if ((flags & PMAP_ENTER_WIRED) != 0)
4183		newpte |= PG_W;
4184	if (va < VM_MAXUSER_ADDRESS)
4185		newpte |= PG_U;
4186	if (pmap == kernel_pmap)
4187		newpte |= PG_G;
4188	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
4189
4190	/*
4191	 * Set modified bit gratuitously for writeable mappings if
4192	 * the page is unmanaged. We do not want to take a fault
4193	 * to do the dirty bit accounting for these mappings.
4194	 */
4195	if ((m->oflags & VPO_UNMANAGED) != 0) {
4196		if ((newpte & PG_RW) != 0)
4197			newpte |= PG_M;
4198	}
4199
4200	mpte = NULL;
4201
4202	lock = NULL;
4203	rw_rlock(&pvh_global_lock);
4204	PMAP_LOCK(pmap);
4205
4206	/*
4207	 * In the case that a page table page is not
4208	 * resident, we are creating it here.
4209	 */
4210retry:
4211	pde = pmap_pde(pmap, va);
4212	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
4213	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
4214		pte = pmap_pde_to_pte(pde, va);
4215		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
4216			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4217			mpte->wire_count++;
4218		}
4219	} else if (va < VM_MAXUSER_ADDRESS) {
4220		/*
4221		 * Here if the pte page isn't mapped, or if it has been
4222		 * deallocated.
4223		 */
4224		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
4225		mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
4226		    nosleep ? NULL : &lock);
4227		if (mpte == NULL && nosleep) {
4228			if (lock != NULL)
4229				rw_wunlock(lock);
4230			rw_runlock(&pvh_global_lock);
4231			PMAP_UNLOCK(pmap);
4232			return (KERN_RESOURCE_SHORTAGE);
4233		}
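		/*
		 * The page table page was allocated (possibly after
		 * sleeping and dropping the pmap lock), so retry the
		 * page directory lookup from the top.
		 */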
4234		goto retry;
4235	} else
4236		panic("pmap_enter: invalid page directory va=%#lx", va);
4237
4238	origpte = *pte;
4239
4240	/*
4241	 * Is the specified virtual address already mapped?
4242	 */
4243	if ((origpte & PG_V) != 0) {
4244		/*
4245		 * Wiring change, just update stats. We don't worry about
4246		 * wiring PT pages as they remain resident as long as there
4247		 * are valid mappings in them. Hence, if a user page is wired,
4248		 * the PT page will be also.
4249		 */
4250		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
4251			pmap->pm_stats.wired_count++;
4252		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
4253			pmap->pm_stats.wired_count--;
4254
4255		/*
4256		 * Remove the extra PT page reference.
4257		 */
4258		if (mpte != NULL) {
4259			mpte->wire_count--;
4260			KASSERT(mpte->wire_count > 0,
4261			    ("pmap_enter: missing reference to page table page,"
4262			     " va: 0x%lx", va));
4263		}
4264
4265		/*
4266		 * Has the physical page changed?
4267		 */
4268		opa = origpte & PG_FRAME;
4269		if (opa == pa) {
4270			/*
4271			 * No, might be a protection or wiring change.
4272			 */
4273			if ((origpte & PG_MANAGED) != 0) {
4274				newpte |= PG_MANAGED;
4275				if ((newpte & PG_RW) != 0)
4276					vm_page_aflag_set(m, PGA_WRITEABLE);
4277			}
4278			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
4279				goto unchanged;
4280			goto validate;
4281		}
4282	} else {
4283		/*
4284		 * Increment the counters.
4285		 */
4286		if ((newpte & PG_W) != 0)
4287			pmap->pm_stats.wired_count++;
4288		pmap_resident_count_inc(pmap, 1);
4289	}
4290
4291	/*
4292	 * Enter on the PV list if part of our managed memory.
4293	 */
4294	if ((m->oflags & VPO_UNMANAGED) == 0) {
4295		newpte |= PG_MANAGED;
4296		pv = get_pv_entry(pmap, &lock);
4297		pv->pv_va = va;
4298		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
4299		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4300		m->md.pv_gen++;
4301		if ((newpte & PG_RW) != 0)
4302			vm_page_aflag_set(m, PGA_WRITEABLE);
4303	}
4304
4305	/*
4306	 * Update the PTE.
4307	 */
4308	if ((origpte & PG_V) != 0) {
4309validate:
4310		origpte = pte_load_store(pte, newpte);
4311		opa = origpte & PG_FRAME;
4312		if (opa != pa) {
4313			if ((origpte & PG_MANAGED) != 0) {
4314				om = PHYS_TO_VM_PAGE(opa);
4315				if ((origpte & (PG_M | PG_RW)) == (PG_M |
4316				    PG_RW))
4317					vm_page_dirty(om);
4318				if ((origpte & PG_A) != 0)
4319					vm_page_aflag_set(om, PGA_REFERENCED);
4320				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
4321				pmap_pvh_free(&om->md, pmap, va);
4322				if ((om->aflags & PGA_WRITEABLE) != 0 &&
4323				    TAILQ_EMPTY(&om->md.pv_list) &&
4324				    ((om->flags & PG_FICTITIOUS) != 0 ||
4325				    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
4326					vm_page_aflag_clear(om, PGA_WRITEABLE);
4327			}
4328		} else if ((newpte & PG_M) == 0 && (origpte & (PG_M |
4329		    PG_RW)) == (PG_M | PG_RW)) {
4330			if ((origpte & PG_MANAGED) != 0)
4331				vm_page_dirty(m);
4332
4333			/*
4334			 * Although the PTE may still have PG_RW set, TLB
4335			 * invalidation may nonetheless be required because
4336			 * the PTE no longer has PG_M set.
4337			 */
4338		} else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
4339			/*
4340			 * This PTE change does not require TLB invalidation.
4341			 */
4342			goto unchanged;
4343		}
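		/*
		 * A TLB entry for the old mapping can exist only if the
		 * hardware has set PG_A in it; if PG_A is clear, no
		 * invalidation is needed.
		 */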
4344		if ((origpte & PG_A) != 0)
4345			pmap_invalidate_page(pmap, va);
4346	} else
4347		pte_store(pte, newpte);
4348
4349unchanged:
4350
4351	/*
4352	 * If both the page table page and the reservation are fully
4353	 * populated, then attempt promotion.
4354	 */
4355	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
4356	    pmap_ps_enabled(pmap) &&
4357	    (m->flags & PG_FICTITIOUS) == 0 &&
4358	    vm_reserv_level_iffullpop(m) == 0)
4359		pmap_promote_pde(pmap, pde, va, &lock);
4360
4361	if (lock != NULL)
4362		rw_wunlock(lock);
4363	rw_runlock(&pvh_global_lock);
4364	PMAP_UNLOCK(pmap);
4365	return (KERN_SUCCESS);
4366}
4367
4368/*
4369 * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
4370 * otherwise.  Fails if (1) a page table page cannot be allocated without
4371 * blocking, (2) a mapping already exists at the specified virtual address, or
4372 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
4373 */
4374static boolean_t
4375pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4376    struct rwlock **lockp)
4377{
4378	pd_entry_t *pde, newpde;
4379	pt_entry_t PG_V;
4380	vm_page_t mpde;
4381	struct spglist free;
4382
4383	PG_V = pmap_valid_bit(pmap);
4384	rw_assert(&pvh_global_lock, RA_LOCKED);
4385	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4386
4387	if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
4388		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4389		    " in pmap %p", va, pmap);
4390		return (FALSE);
4391	}
4392	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
4393	pde = &pde[pmap_pde_index(va)];
4394	if ((*pde & PG_V) != 0) {
4395		KASSERT(mpde->wire_count > 1,
4396		    ("pmap_enter_pde: mpde's wire count is too low"));
4397		mpde->wire_count--;
4398		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4399		    " in pmap %p", va, pmap);
4400		return (FALSE);
4401	}
4402	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
4403	    PG_PS | PG_V;
4404	if ((m->oflags & VPO_UNMANAGED) == 0) {
4405		newpde |= PG_MANAGED;
4406
4407		/*
4408		 * Abort this mapping if its PV entry could not be created.
4409		 */
4410		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
4411		    lockp)) {
4412			SLIST_INIT(&free);
4413			if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
4414				pmap_invalidate_page(pmap, va);
4415				pmap_free_zero_pages(&free);
4416			}
4417			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4418			    " in pmap %p", va, pmap);
4419			return (FALSE);
4420		}
4421	}
4422	if ((prot & VM_PROT_EXECUTE) == 0)
4423		newpde |= pg_nx;
4424	if (va < VM_MAXUSER_ADDRESS)
4425		newpde |= PG_U;
4426
4427	/*
4428	 * Increment counters.
4429	 */
4430	pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4431
4432	/*
4433	 * Map the superpage.
4434	 */
4435	pde_store(pde, newpde);
4436
4437	atomic_add_long(&pmap_pde_mappings, 1);
4438	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
4439	    " in pmap %p", va, pmap);
4440	return (TRUE);
4441}
4442
4443/*
4444 * Maps a sequence of resident pages belonging to the same object.
4445 * The sequence begins with the given page m_start.  This page is
4446 * mapped at the given virtual address start.  Each subsequent page is
4447 * mapped at a virtual address that is offset from start by the same
4448 * amount as the page is offset from m_start within the object.  The
4449 * last page in the sequence is the page with the largest offset from
4450 * m_start that can be mapped at a virtual address less than the given
4451 * virtual address end.  Not every virtual page between start and end
4452 * is mapped; only those for which a resident page exists with the
4453 * corresponding offset from m_start are mapped.
4454 */
4455void
4456pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4457    vm_page_t m_start, vm_prot_t prot)
4458{
4459	struct rwlock *lock;
4460	vm_offset_t va;
4461	vm_page_t m, mpte;
4462	vm_pindex_t diff, psize;
4463
4464	VM_OBJECT_ASSERT_LOCKED(m_start->object);
4465
4466	psize = atop(end - start);
4467	mpte = NULL;
4468	m = m_start;
4469	lock = NULL;
4470	rw_rlock(&pvh_global_lock);
4471	PMAP_LOCK(pmap);
4472	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4473		va = start + ptoa(diff);
4474		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
4475		    m->psind == 1 && pmap_ps_enabled(pmap) &&
4476		    pmap_enter_pde(pmap, va, m, prot, &lock))
4477			m = &m[NBPDR / PAGE_SIZE - 1];
4478		else
4479			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
4480			    mpte, &lock);
4481		m = TAILQ_NEXT(m, listq);
4482	}
4483	if (lock != NULL)
4484		rw_wunlock(lock);
4485	rw_runlock(&pvh_global_lock);
4486	PMAP_UNLOCK(pmap);
4487}
4488
4489/*
4490 * This code makes some *MAJOR* assumptions:
4491 * 1. The current pmap and the given pmap exist.
4492 * 2. Not wired.
4493 * 3. Read access.
4494 * 4. No page table pages.
4495 * but is *MUCH* faster than pmap_enter...
4496 */
4497
4498void
4499pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4500{
4501	struct rwlock *lock;
4502
4503	lock = NULL;
4504	rw_rlock(&pvh_global_lock);
4505	PMAP_LOCK(pmap);
4506	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
4507	if (lock != NULL)
4508		rw_wunlock(lock);
4509	rw_runlock(&pvh_global_lock);
4510	PMAP_UNLOCK(pmap);
4511}
4512
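/*
 * Internal helper for pmap_enter_quick() and pmap_enter_object(): create a
 * read-only mapping at "va" without sleeping and without replacing an
 * existing mapping.  Returns the page table page holding the new mapping,
 * or NULL, so that the caller may pass it back in for the next page.
 */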
4513static vm_page_t
4514pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
4515    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
4516{
4517	struct spglist free;
4518	pt_entry_t *pte, PG_V;
4519	vm_paddr_t pa;
4520
4521	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
4522	    (m->oflags & VPO_UNMANAGED) != 0,
4523	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
4524	PG_V = pmap_valid_bit(pmap);
4525	rw_assert(&pvh_global_lock, RA_LOCKED);
4526	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4527
4528	/*
4529	 * In the case that a page table page is not
4530	 * resident, we are creating it here.
4531	 */
4532	if (va < VM_MAXUSER_ADDRESS) {
4533		vm_pindex_t ptepindex;
4534		pd_entry_t *ptepa;
4535
4536		/*
4537		 * Calculate pagetable page index
4538		 */
4539		ptepindex = pmap_pde_pindex(va);
4540		if (mpte && (mpte->pindex == ptepindex)) {
4541			mpte->wire_count++;
4542		} else {
4543			/*
4544			 * Get the page directory entry
4545			 */
4546			ptepa = pmap_pde(pmap, va);
4547
4548			/*
4549			 * If the page table page is mapped, we just increment
4550			 * the hold count, and activate it.  Otherwise, we
4551			 * attempt to allocate a page table page.  If this
4552			 * attempt fails, we don't retry.  Instead, we give up.
4553			 */
4554			if (ptepa && (*ptepa & PG_V) != 0) {
4555				if (*ptepa & PG_PS)
4556					return (NULL);
4557				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
4558				mpte->wire_count++;
4559			} else {
4560				/*
4561				 * Pass NULL instead of the PV list lock
4562				 * pointer, because we don't intend to sleep.
4563				 */
4564				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
4565				if (mpte == NULL)
4566					return (mpte);
4567			}
4568		}
4569		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
4570		pte = &pte[pmap_pte_index(va)];
4571	} else {
4572		mpte = NULL;
4573		pte = vtopte(va);
4574	}
4575	if (*pte) {
4576		if (mpte != NULL) {
4577			mpte->wire_count--;
4578			mpte = NULL;
4579		}
4580		return (mpte);
4581	}
4582
4583	/*
4584	 * Enter on the PV list if part of our managed memory.
4585	 */
4586	if ((m->oflags & VPO_UNMANAGED) == 0 &&
4587	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
4588		if (mpte != NULL) {
4589			SLIST_INIT(&free);
4590			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
4591				pmap_invalidate_page(pmap, va);
4592				pmap_free_zero_pages(&free);
4593			}
4594			mpte = NULL;
4595		}
4596		return (mpte);
4597	}
4598
4599	/*
4600	 * Increment counters
4601	 */
4602	pmap_resident_count_inc(pmap, 1);
4603
4604	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
4605	if ((prot & VM_PROT_EXECUTE) == 0)
4606		pa |= pg_nx;
4607
4608	/*
4609	 * Now validate mapping with RO protection
4610	 */
4611	if ((m->oflags & VPO_UNMANAGED) != 0)
4612		pte_store(pte, pa | PG_V | PG_U);
4613	else
4614		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
4615	return (mpte);
4616}
4617
4618/*
4619 * Make a temporary mapping for a physical address.  This is only intended
4620 * to be used for panic dumps.
4621 */
4622void *
4623pmap_kenter_temporary(vm_paddr_t pa, int i)
4624{
4625	vm_offset_t va;
4626
4627	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
4628	pmap_kenter(va, pa);
4629	invlpg(va);
4630	return ((void *)crashdumpmap);
4631}
4632
4633/*
4634 * This code maps large physical mmap regions (e.g., device memory) into
4635 * the processor address space using 2MB page mappings.  Note that some
4636 * shortcuts are taken, but the code works.
4637 */
4638void
4639pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
4640    vm_pindex_t pindex, vm_size_t size)
4641{
4642	pd_entry_t *pde;
4643	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
4644	vm_paddr_t pa, ptepa;
4645	vm_page_t p, pdpg;
4646	int pat_mode;
4647
4648	PG_A = pmap_accessed_bit(pmap);
4649	PG_M = pmap_modified_bit(pmap);
4650	PG_V = pmap_valid_bit(pmap);
4651	PG_RW = pmap_rw_bit(pmap);
4652
4653	VM_OBJECT_ASSERT_WLOCKED(object);
4654	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4655	    ("pmap_object_init_pt: non-device object"));
4656	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
4657		if (!pmap_ps_enabled(pmap))
4658			return;
4659		if (!vm_object_populate(object, pindex, pindex + atop(size)))
4660			return;
4661		p = vm_page_lookup(object, pindex);
4662		KASSERT(p->valid == VM_PAGE_BITS_ALL,
4663		    ("pmap_object_init_pt: invalid page %p", p));
4664		pat_mode = p->md.pat_mode;
4665
4666		/*
4667		 * Abort the mapping if the first page is not physically
4668		 * aligned to a 2MB page boundary.
4669		 */
4670		ptepa = VM_PAGE_TO_PHYS(p);
4671		if (ptepa & (NBPDR - 1))
4672			return;
4673
4674		/*
4675		 * Skip the first page.  Abort the mapping if the rest of
4676		 * the pages are not physically contiguous or have differing
4677		 * memory attributes.
4678		 */
4679		p = TAILQ_NEXT(p, listq);
4680		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
4681		    pa += PAGE_SIZE) {
4682			KASSERT(p->valid == VM_PAGE_BITS_ALL,
4683			    ("pmap_object_init_pt: invalid page %p", p));
4684			if (pa != VM_PAGE_TO_PHYS(p) ||
4685			    pat_mode != p->md.pat_mode)
4686				return;
4687			p = TAILQ_NEXT(p, listq);
4688		}
4689
4690		/*
4691		 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
4692		 * "size" is a multiple of 2M, adding the PAT setting to "pa"
4693		 * will not affect the termination of this loop.
4694		 */
4695		PMAP_LOCK(pmap);
4696		for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
4697		    pa < ptepa + size; pa += NBPDR) {
4698			pdpg = pmap_allocpde(pmap, addr, NULL);
4699			if (pdpg == NULL) {
4700				/*
4701				 * The creation of mappings below is only an
4702				 * optimization.  If a page directory page
4703				 * cannot be allocated without blocking,
4704				 * continue on to the next mapping rather than
4705				 * blocking.
4706				 */
4707				addr += NBPDR;
4708				continue;
4709			}
4710			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
4711			pde = &pde[pmap_pde_index(addr)];
4712			if ((*pde & PG_V) == 0) {
4713				pde_store(pde, pa | PG_PS | PG_M | PG_A |
4714				    PG_U | PG_RW | PG_V);
4715				pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4716				atomic_add_long(&pmap_pde_mappings, 1);
4717			} else {
4718				/* Continue on if the PDE is already valid. */
4719				pdpg->wire_count--;
4720				KASSERT(pdpg->wire_count > 0,
4721				    ("pmap_object_init_pt: missing reference "
4722				    "to page directory page, va: 0x%lx", addr));
4723			}
4724			addr += NBPDR;
4725		}
4726		PMAP_UNLOCK(pmap);
4727	}
4728}
4729
4730/*
4731 *	Clear the wired attribute from the mappings for the specified range of
4732 *	addresses in the given pmap.  Every valid mapping within that range
4733 *	must have the wired attribute set.  In contrast, invalid mappings
4734 *	cannot have the wired attribute set, so they are ignored.
4735 *
4736 *	The wired attribute of the page table entry is not a hardware feature,
4737 *	so there is no need to invalidate any TLB entries.
4738 */
4739void
4740pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4741{
4742	vm_offset_t va_next;
4743	pml4_entry_t *pml4e;
4744	pdp_entry_t *pdpe;
4745	pd_entry_t *pde;
4746	pt_entry_t *pte, PG_V;
4747	boolean_t pv_lists_locked;
4748
4749	PG_V = pmap_valid_bit(pmap);
4750	pv_lists_locked = FALSE;
4751resume:
4752	PMAP_LOCK(pmap);
4753	for (; sva < eva; sva = va_next) {
4754		pml4e = pmap_pml4e(pmap, sva);
4755		if ((*pml4e & PG_V) == 0) {
4756			va_next = (sva + NBPML4) & ~PML4MASK;
4757			if (va_next < sva)
4758				va_next = eva;
4759			continue;
4760		}
4761		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
4762		if ((*pdpe & PG_V) == 0) {
4763			va_next = (sva + NBPDP) & ~PDPMASK;
4764			if (va_next < sva)
4765				va_next = eva;
4766			continue;
4767		}
4768		va_next = (sva + NBPDR) & ~PDRMASK;
4769		if (va_next < sva)
4770			va_next = eva;
4771		pde = pmap_pdpe_to_pde(pdpe, sva);
4772		if ((*pde & PG_V) == 0)
4773			continue;
4774		if ((*pde & PG_PS) != 0) {
4775			if ((*pde & PG_W) == 0)
4776				panic("pmap_unwire: pde %#jx is missing PG_W",
4777				    (uintmax_t)*pde);
4778
4779			/*
4780			 * Are we unwiring the entire large page?  If not,
4781			 * demote the mapping and fall through.
4782			 */
4783			if (sva + NBPDR == va_next && eva >= va_next) {
4784				atomic_clear_long(pde, PG_W);
4785				pmap->pm_stats.wired_count -= NBPDR /
4786				    PAGE_SIZE;
4787				continue;
4788			} else {
4789				if (!pv_lists_locked) {
4790					pv_lists_locked = TRUE;
4791					if (!rw_try_rlock(&pvh_global_lock)) {
4792						PMAP_UNLOCK(pmap);
4793						rw_rlock(&pvh_global_lock);
4794						/* Repeat sva. */
4795						goto resume;
4796					}
4797				}
4798				if (!pmap_demote_pde(pmap, pde, sva))
4799					panic("pmap_unwire: demotion failed");
4800			}
4801		}
4802		if (va_next > eva)
4803			va_next = eva;
4804		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
4805		    sva += PAGE_SIZE) {
4806			if ((*pte & PG_V) == 0)
4807				continue;
4808			if ((*pte & PG_W) == 0)
4809				panic("pmap_unwire: pte %#jx is missing PG_W",
4810				    (uintmax_t)*pte);
4811
4812			/*
4813			 * PG_W must be cleared atomically.  Although the pmap
4814			 * lock synchronizes access to PG_W, another processor
4815			 * could be setting PG_M and/or PG_A concurrently.
4816			 */
4817			atomic_clear_long(pte, PG_W);
4818			pmap->pm_stats.wired_count--;
4819		}
4820	}
4821	if (pv_lists_locked)
4822		rw_runlock(&pvh_global_lock);
4823	PMAP_UNLOCK(pmap);
4824}
4825
4826/*
4827 *	Copy the range specified by src_addr/len
4828 *	from the source map to the range dst_addr/len
4829 *	in the destination map.
4830 *
4831 *	This routine is only advisory and need not do anything.
4832 */
4833
4834void
4835pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4836    vm_offset_t src_addr)
4837{
4838	struct rwlock *lock;
4839	struct spglist free;
4840	vm_offset_t addr;
4841	vm_offset_t end_addr = src_addr + len;
4842	vm_offset_t va_next;
4843	pt_entry_t PG_A, PG_M, PG_V;
4844
4845	if (dst_addr != src_addr)
4846		return;
4847
4848	if (dst_pmap->pm_type != src_pmap->pm_type)
4849		return;
4850
4851	/*
4852	 * EPT page table entries that require emulation of A/D bits are
4853	 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
4854	 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
4855	 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
4856	 * implementations flag an EPT misconfiguration for exec-only
4857	 * mappings we skip this function entirely for emulated pmaps.
4858	 */
4859	if (pmap_emulate_ad_bits(dst_pmap))
4860		return;
4861
4862	lock = NULL;
4863	rw_rlock(&pvh_global_lock);
4864	if (dst_pmap < src_pmap) {
4865		PMAP_LOCK(dst_pmap);
4866		PMAP_LOCK(src_pmap);
4867	} else {
4868		PMAP_LOCK(src_pmap);
4869		PMAP_LOCK(dst_pmap);
4870	}
4871
4872	PG_A = pmap_accessed_bit(dst_pmap);
4873	PG_M = pmap_modified_bit(dst_pmap);
4874	PG_V = pmap_valid_bit(dst_pmap);
4875
4876	for (addr = src_addr; addr < end_addr; addr = va_next) {
4877		pt_entry_t *src_pte, *dst_pte;
4878		vm_page_t dstmpde, dstmpte, srcmpte;
4879		pml4_entry_t *pml4e;
4880		pdp_entry_t *pdpe;
4881		pd_entry_t srcptepaddr, *pde;
4882
4883		KASSERT(addr < UPT_MIN_ADDRESS,
4884		    ("pmap_copy: invalid to pmap_copy page tables"));
4885
4886		pml4e = pmap_pml4e(src_pmap, addr);
4887		if ((*pml4e & PG_V) == 0) {
4888			va_next = (addr + NBPML4) & ~PML4MASK;
4889			if (va_next < addr)
4890				va_next = end_addr;
4891			continue;
4892		}
4893
4894		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
4895		if ((*pdpe & PG_V) == 0) {
4896			va_next = (addr + NBPDP) & ~PDPMASK;
4897			if (va_next < addr)
4898				va_next = end_addr;
4899			continue;
4900		}
4901
4902		va_next = (addr + NBPDR) & ~PDRMASK;
4903		if (va_next < addr)
4904			va_next = end_addr;
4905
4906		pde = pmap_pdpe_to_pde(pdpe, addr);
4907		srcptepaddr = *pde;
4908		if (srcptepaddr == 0)
4909			continue;
4910
4911		if (srcptepaddr & PG_PS) {
4912			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
4913				continue;
4914			dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
4915			if (dstmpde == NULL)
4916				break;
4917			pde = (pd_entry_t *)
4918			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
4919			pde = &pde[pmap_pde_index(addr)];
4920			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
4921			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4922			    PG_PS_FRAME, &lock))) {
4923				*pde = srcptepaddr & ~PG_W;
4924				pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
4925			} else
4926				dstmpde->wire_count--;
4927			continue;
4928		}
4929
4930		srcptepaddr &= PG_FRAME;
4931		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
4932		KASSERT(srcmpte->wire_count > 0,
4933		    ("pmap_copy: source page table page is unused"));
4934
4935		if (va_next > end_addr)
4936			va_next = end_addr;
4937
4938		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
4939		src_pte = &src_pte[pmap_pte_index(addr)];
4940		dstmpte = NULL;
4941		while (addr < va_next) {
4942			pt_entry_t ptetemp;
4943			ptetemp = *src_pte;
4944			/*
4945			 * Only managed mappings are copied.
4946			 */
4947			if ((ptetemp & PG_MANAGED) != 0) {
4948				if (dstmpte != NULL &&
4949				    dstmpte->pindex == pmap_pde_pindex(addr))
4950					dstmpte->wire_count++;
4951				else if ((dstmpte = pmap_allocpte(dst_pmap,
4952				    addr, NULL)) == NULL)
4953					goto out;
4954				dst_pte = (pt_entry_t *)
4955				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
4956				dst_pte = &dst_pte[pmap_pte_index(addr)];
4957				if (*dst_pte == 0 &&
4958				    pmap_try_insert_pv_entry(dst_pmap, addr,
4959				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
4960				    &lock)) {
4961					/*
4962					 * Clear the wired, modified, and
4963					 * accessed (referenced) bits
4964					 * during the copy.
4965					 */
4966					*dst_pte = ptetemp & ~(PG_W | PG_M |
4967					    PG_A);
4968					pmap_resident_count_inc(dst_pmap, 1);
4969				} else {
4970					SLIST_INIT(&free);
4971					if (pmap_unwire_ptp(dst_pmap, addr,
4972					    dstmpte, &free)) {
4973						pmap_invalidate_page(dst_pmap,
4974						    addr);
4975						pmap_free_zero_pages(&free);
4976					}
4977					goto out;
4978				}
4979				if (dstmpte->wire_count >= srcmpte->wire_count)
4980					break;
4981			}
4982			addr += PAGE_SIZE;
4983			src_pte++;
4984		}
4985	}
4986out:
4987	if (lock != NULL)
4988		rw_wunlock(lock);
4989	rw_runlock(&pvh_global_lock);
4990	PMAP_UNLOCK(src_pmap);
4991	PMAP_UNLOCK(dst_pmap);
4992}
4993
4994/*
4995 *	pmap_zero_page zeros the specified hardware page through its
4996 *	direct map address using pagezero().
4997 */
4998void
4999pmap_zero_page(vm_page_t m)
5000{
5001	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5002
5003	pagezero((void *)va);
5004}
5005
5006/*
5007 *	pmap_zero_page_area zeros the specified portion of a hardware
5008 *	page through the page's direct map address.
5009 *
5010 *	off and size may not cover an area beyond a single hardware page.
5011 */
5012void
5013pmap_zero_page_area(vm_page_t m, int off, int size)
5014{
5015	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5016
5017	if (off == 0 && size == PAGE_SIZE)
5018		pagezero((void *)va);
5019	else
5020		bzero((char *)va + off, size);
5021}
5022
5023/*
5024 *	pmap_zero_page_idle zeros the specified hardware page through its
5025 *	direct map address using pagezero().  This
5026 *	is intended to be called from the vm_pagezero process only and
5027 *	outside of Giant.
5028 */
5029void
5030pmap_zero_page_idle(vm_page_t m)
5031{
5032	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5033
5034	pagezero((void *)va);
5035}
5036
5037/*
5038 *	pmap_copy_page copies the specified (machine independent)
5039 *	page by copying its contents through the direct map using
5040 *	pagecopy(), one machine dependent page at a
5041 *	time.
5042 */
5043void
5044pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
5045{
5046	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
5047	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
5048
5049	pagecopy((void *)src, (void *)dst);
5050}
5051
5052int unmapped_buf_allowed = 1;
5053
5054void
5055pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
5056    vm_offset_t b_offset, int xfersize)
5057{
5058	void *a_cp, *b_cp;
5059	vm_page_t m_a, m_b;
5060	vm_paddr_t p_a, p_b;
5061	pt_entry_t *pte;
5062	vm_offset_t a_pg_offset, b_pg_offset;
5063	int cnt;
5064	boolean_t pinned;
5065
5066	/*
5067	 * NB:  The sequence of updating a page table followed by accesses
5068	 * to the corresponding pages used in the !DMAP case is subject to
5069	 * the situation described in the "AMD64 Architecture Programmer's
5070	 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
5071	 * Coherency Considerations".  Therefore, issuing the INVLPG right
5072	 * after modifying the PTE bits is crucial.
5073	 */
5074	pinned = FALSE;
5075	while (xfersize > 0) {
5076		a_pg_offset = a_offset & PAGE_MASK;
5077		m_a = ma[a_offset >> PAGE_SHIFT];
5078		p_a = m_a->phys_addr;
5079		b_pg_offset = b_offset & PAGE_MASK;
5080		m_b = mb[b_offset >> PAGE_SHIFT];
5081		p_b = m_b->phys_addr;
5082		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
5083		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
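		/*
		 * Pages that lie outside the direct map are temporarily
		 * mapped at cpage_a/cpage_b.  cpage_lock serializes use of
		 * these two windows, and sched_pin() keeps the thread on
		 * one CPU so that the local INVLPG suffices.
		 */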
5084		if (__predict_false(p_a < DMAP_MIN_ADDRESS ||
5085		    p_a > DMAP_MIN_ADDRESS + dmaplimit)) {
5086			mtx_lock(&cpage_lock);
5087			sched_pin();
5088			pinned = TRUE;
5089			pte = vtopte(cpage_a);
5090			*pte = p_a | X86_PG_A | X86_PG_V |
5091			    pmap_cache_bits(kernel_pmap, m_a->md.pat_mode, 0);
5092			invlpg(cpage_a);
5093			a_cp = (char *)cpage_a + a_pg_offset;
5094		} else {
5095			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
5096		}
5097		if (__predict_false(p_b < DMAP_MIN_ADDRESS ||
5098		    p_b > DMAP_MIN_ADDRESS + dmaplimit)) {
5099			if (!pinned) {
5100				mtx_lock(&cpage_lock);
5101				sched_pin();
5102				pinned = TRUE;
5103			}
5104			pte = vtopte(cpage_b);
5105			*pte = p_b | X86_PG_A | X86_PG_M | X86_PG_RW |
5106			    X86_PG_V | pmap_cache_bits(kernel_pmap,
5107			    m_b->md.pat_mode, 0);
5108			invlpg(cpage_b);
5109			b_cp = (char *)cpage_b + b_pg_offset;
5110		} else {
5111			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
5112		}
5113		bcopy(a_cp, b_cp, cnt);
5114		if (__predict_false(pinned)) {
5115			sched_unpin();
5116			mtx_unlock(&cpage_lock);
5117			pinned = FALSE;
5118		}
5119		a_offset += cnt;
5120		b_offset += cnt;
5121		xfersize -= cnt;
5122	}
5123}
5124
5125/*
5126 * Returns true if the pmap's pv is one of the first
5127 * 16 pvs linked to from this page.  This count may
5128 * be changed upwards or downwards in the future; it
5129 * is only necessary that true be returned for a small
5130 * subset of pmaps for proper page aging.
5131 */
5132boolean_t
5133pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
5134{
5135	struct md_page *pvh;
5136	struct rwlock *lock;
5137	pv_entry_t pv;
5138	int loops = 0;
5139	boolean_t rv;
5140
5141	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5142	    ("pmap_page_exists_quick: page %p is not managed", m));
5143	rv = FALSE;
5144	rw_rlock(&pvh_global_lock);
5145	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5146	rw_rlock(lock);
5147	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5148		if (PV_PMAP(pv) == pmap) {
5149			rv = TRUE;
5150			break;
5151		}
5152		loops++;
5153		if (loops >= 16)
5154			break;
5155	}
5156	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5157		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5158		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5159			if (PV_PMAP(pv) == pmap) {
5160				rv = TRUE;
5161				break;
5162			}
5163			loops++;
5164			if (loops >= 16)
5165				break;
5166		}
5167	}
5168	rw_runlock(lock);
5169	rw_runlock(&pvh_global_lock);
5170	return (rv);
5171}
5172
5173/*
5174 *	pmap_page_wired_mappings:
5175 *
5176 *	Return the number of managed mappings to the given physical page
5177 *	that are wired.
5178 */
5179int
5180pmap_page_wired_mappings(vm_page_t m)
5181{
5182	struct rwlock *lock;
5183	struct md_page *pvh;
5184	pmap_t pmap;
5185	pt_entry_t *pte;
5186	pv_entry_t pv;
5187	int count, md_gen, pvh_gen;
5188
5189	if ((m->oflags & VPO_UNMANAGED) != 0)
5190		return (0);
5191	rw_rlock(&pvh_global_lock);
5192	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5193	rw_rlock(lock);
5194restart:
5195	count = 0;
5196	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5197		pmap = PV_PMAP(pv);
5198		if (!PMAP_TRYLOCK(pmap)) {
5199			md_gen = m->md.pv_gen;
5200			rw_runlock(lock);
5201			PMAP_LOCK(pmap);
5202			rw_rlock(lock);
5203			if (md_gen != m->md.pv_gen) {
5204				PMAP_UNLOCK(pmap);
5205				goto restart;
5206			}
5207		}
5208		pte = pmap_pte(pmap, pv->pv_va);
5209		if ((*pte & PG_W) != 0)
5210			count++;
5211		PMAP_UNLOCK(pmap);
5212	}
5213	if ((m->flags & PG_FICTITIOUS) == 0) {
5214		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5215		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5216			pmap = PV_PMAP(pv);
5217			if (!PMAP_TRYLOCK(pmap)) {
5218				md_gen = m->md.pv_gen;
5219				pvh_gen = pvh->pv_gen;
5220				rw_runlock(lock);
5221				PMAP_LOCK(pmap);
5222				rw_rlock(lock);
5223				if (md_gen != m->md.pv_gen ||
5224				    pvh_gen != pvh->pv_gen) {
5225					PMAP_UNLOCK(pmap);
5226					goto restart;
5227				}
5228			}
5229			pte = pmap_pde(pmap, pv->pv_va);
5230			if ((*pte & PG_W) != 0)
5231				count++;
5232			PMAP_UNLOCK(pmap);
5233		}
5234	}
5235	rw_runlock(lock);
5236	rw_runlock(&pvh_global_lock);
5237	return (count);
5238}
5239
5240/*
5241 * Returns TRUE if the given page is mapped individually or as part of
5242 * a 2mpage.  Otherwise, returns FALSE.
5243 */
5244boolean_t
5245pmap_page_is_mapped(vm_page_t m)
5246{
5247	struct rwlock *lock;
5248	boolean_t rv;
5249
5250	if ((m->oflags & VPO_UNMANAGED) != 0)
5251		return (FALSE);
5252	rw_rlock(&pvh_global_lock);
5253	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5254	rw_rlock(lock);
5255	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5256	    ((m->flags & PG_FICTITIOUS) == 0 &&
5257	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
5258	rw_runlock(lock);
5259	rw_runlock(&pvh_global_lock);
5260	return (rv);
5261}
5262
5263/*
5264 * Destroy all managed, non-wired mappings in the given user-space
5265 * pmap.  This pmap cannot be active on any processor besides the
5266 * caller.
5267 *
5268 * This function cannot be applied to the kernel pmap.  Moreover, it
5269 * is not intended for general use.  It is only to be used during
5270 * process termination.  Consequently, it can be implemented in ways
5271 * that make it faster than pmap_remove().  First, it can more quickly
5272 * destroy mappings by iterating over the pmap's collection of PV
5273 * entries, rather than searching the page table.  Second, it doesn't
5274 * have to test and clear the page table entries atomically, because
5275 * no processor is currently accessing the user address space.  In
5276 * particular, a page table entry's dirty bit won't change state once
5277 * this function starts.
5278 */
5279void
5280pmap_remove_pages(pmap_t pmap)
5281{
5282	pd_entry_t ptepde;
5283	pt_entry_t *pte, tpte;
5284	pt_entry_t PG_M, PG_RW, PG_V;
5285	struct spglist free;
5286	vm_page_t m, mpte, mt;
5287	pv_entry_t pv;
5288	struct md_page *pvh;
5289	struct pv_chunk *pc, *npc;
5290	struct rwlock *lock;
5291	int64_t bit;
5292	uint64_t inuse, bitmask;
5293	int allfree, field, freed, idx;
5294	boolean_t superpage;
5295	vm_paddr_t pa;
5296
5297	/*
5298	 * Assert that the given pmap is only active on the current
5299	 * CPU.  Unfortunately, we cannot block another CPU from
5300	 * activating the pmap while this function is executing.
5301	 */
5302	KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
5303#ifdef INVARIANTS
5304	{
5305		cpuset_t other_cpus;
5306
5307		other_cpus = all_cpus;
5308		critical_enter();
5309		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
5310		CPU_AND(&other_cpus, &pmap->pm_active);
5311		critical_exit();
5312		KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
5313	}
5314#endif
5315
5316	lock = NULL;
5317	PG_M = pmap_modified_bit(pmap);
5318	PG_V = pmap_valid_bit(pmap);
5319	PG_RW = pmap_rw_bit(pmap);
5320
5321	SLIST_INIT(&free);
5322	rw_rlock(&pvh_global_lock);
5323	PMAP_LOCK(pmap);
5324	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5325		allfree = 1;
5326		freed = 0;
5327		for (field = 0; field < _NPCM; field++) {
5328			inuse = ~pc->pc_map[field] & pc_freemask[field];
5329			while (inuse != 0) {
5330				bit = bsfq(inuse);
5331				bitmask = 1UL << bit;
5332				idx = field * 64 + bit;
5333				pv = &pc->pc_pventry[idx];
5334				inuse &= ~bitmask;
5335
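				/*
				 * Walk down from the PDPE to the PDE.  If
				 * the PDE references a page table, descend
				 * one more level to the PTE; otherwise the
				 * PDE itself is the (2MB superpage) mapping.
				 */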
5336				pte = pmap_pdpe(pmap, pv->pv_va);
5337				ptepde = *pte;
5338				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
5339				tpte = *pte;
5340				if ((tpte & (PG_PS | PG_V)) == PG_V) {
5341					superpage = FALSE;
5342					ptepde = tpte;
5343					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
5344					    PG_FRAME);
5345					pte = &pte[pmap_pte_index(pv->pv_va)];
5346					tpte = *pte;
5347				} else {
5348					/*
5349					 * Keep track whether 'tpte' is a
5350					 * superpage explicitly instead of
5351					 * relying on PG_PS being set.
5352					 *
5353					 * This is because PG_PS is numerically
5354					 * identical to PG_PTE_PAT and thus a
5355					 * regular page could be mistaken for
5356					 * a superpage.
5357					 */
5358					superpage = TRUE;
5359				}
5360
5361				if ((tpte & PG_V) == 0) {
5362					panic("bad pte va %lx pte %lx",
5363					    pv->pv_va, tpte);
5364				}
5365
5366/*
5367 * We cannot remove wired pages from a process' mapping at this time
5368 */
5369				if (tpte & PG_W) {
5370					allfree = 0;
5371					continue;
5372				}
5373
5374				if (superpage)
5375					pa = tpte & PG_PS_FRAME;
5376				else
5377					pa = tpte & PG_FRAME;
5378
5379				m = PHYS_TO_VM_PAGE(pa);
5380				KASSERT(m->phys_addr == pa,
5381				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5382				    m, (uintmax_t)m->phys_addr,
5383				    (uintmax_t)tpte));
5384
5385				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5386				    m < &vm_page_array[vm_page_array_size],
5387				    ("pmap_remove_pages: bad tpte %#jx",
5388				    (uintmax_t)tpte));
5389
5390				pte_clear(pte);
5391
5392				/*
5393				 * Update the vm_page_t clean/reference bits.
5394				 */
5395				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5396					if (superpage) {
5397						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5398							vm_page_dirty(mt);
5399					} else
5400						vm_page_dirty(m);
5401				}
5402
5403				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5404
5405				/* Mark free */
5406				pc->pc_map[field] |= bitmask;
5407				if (superpage) {
5408					pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
5409					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
5410					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5411					pvh->pv_gen++;
5412					if (TAILQ_EMPTY(&pvh->pv_list)) {
5413						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5414							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
5415							    TAILQ_EMPTY(&mt->md.pv_list))
5416								vm_page_aflag_clear(mt, PGA_WRITEABLE);
5417					}
5418					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
5419					if (mpte != NULL) {
5420						pmap_remove_pt_page(pmap, mpte);
5421						pmap_resident_count_dec(pmap, 1);
5422						KASSERT(mpte->wire_count == NPTEPG,
5423						    ("pmap_remove_pages: pte page wire count error"));
5424						mpte->wire_count = 0;
5425						pmap_add_delayed_free_list(mpte, &free, FALSE);
5426						atomic_subtract_int(&cnt.v_wire_count, 1);
5427					}
5428				} else {
5429					pmap_resident_count_dec(pmap, 1);
5430					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5431					m->md.pv_gen++;
5432					if ((m->aflags & PGA_WRITEABLE) != 0 &&
5433					    TAILQ_EMPTY(&m->md.pv_list) &&
5434					    (m->flags & PG_FICTITIOUS) == 0) {
5435						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5436						if (TAILQ_EMPTY(&pvh->pv_list))
5437							vm_page_aflag_clear(m, PGA_WRITEABLE);
5438					}
5439				}
5440				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
5441				freed++;
5442			}
5443		}
5444		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5445		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5446		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5447		if (allfree) {
5448			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5449			free_pv_chunk(pc);
5450		}
5451	}
5452	if (lock != NULL)
5453		rw_wunlock(lock);
5454	pmap_invalidate_all(pmap);
5455	rw_runlock(&pvh_global_lock);
5456	PMAP_UNLOCK(pmap);
5457	pmap_free_zero_pages(&free);
5458}
5459
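/*
 * Returns TRUE if some mapping of the given page has all of the requested
 * bits set: PG_V and PG_A when "accessed" is requested, and PG_RW and PG_M
 * when "modified" is requested.  Both 4KB and 2MB mappings are examined.
 */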
5460static boolean_t
5461pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
5462{
5463	struct rwlock *lock;
5464	pv_entry_t pv;
5465	struct md_page *pvh;
5466	pt_entry_t *pte, mask;
5467	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
5468	pmap_t pmap;
5469	int md_gen, pvh_gen;
5470	boolean_t rv;
5471
5472	rv = FALSE;
5473	rw_rlock(&pvh_global_lock);
5474	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5475	rw_rlock(lock);
5476restart:
5477	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5478		pmap = PV_PMAP(pv);
5479		if (!PMAP_TRYLOCK(pmap)) {
5480			md_gen = m->md.pv_gen;
5481			rw_runlock(lock);
5482			PMAP_LOCK(pmap);
5483			rw_rlock(lock);
5484			if (md_gen != m->md.pv_gen) {
5485				PMAP_UNLOCK(pmap);
5486				goto restart;
5487			}
5488		}
5489		pte = pmap_pte(pmap, pv->pv_va);
5490		mask = 0;
5491		if (modified) {
5492			PG_M = pmap_modified_bit(pmap);
5493			PG_RW = pmap_rw_bit(pmap);
5494			mask |= PG_RW | PG_M;
5495		}
5496		if (accessed) {
5497			PG_A = pmap_accessed_bit(pmap);
5498			PG_V = pmap_valid_bit(pmap);
5499			mask |= PG_V | PG_A;
5500		}
5501		rv = (*pte & mask) == mask;
5502		PMAP_UNLOCK(pmap);
5503		if (rv)
5504			goto out;
5505	}
5506	if ((m->flags & PG_FICTITIOUS) == 0) {
5507		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5508		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5509			pmap = PV_PMAP(pv);
5510			if (!PMAP_TRYLOCK(pmap)) {
5511				md_gen = m->md.pv_gen;
5512				pvh_gen = pvh->pv_gen;
5513				rw_runlock(lock);
5514				PMAP_LOCK(pmap);
5515				rw_rlock(lock);
5516				if (md_gen != m->md.pv_gen ||
5517				    pvh_gen != pvh->pv_gen) {
5518					PMAP_UNLOCK(pmap);
5519					goto restart;
5520				}
5521			}
5522			pte = pmap_pde(pmap, pv->pv_va);
5523			mask = 0;
5524			if (modified) {
5525				PG_M = pmap_modified_bit(pmap);
5526				PG_RW = pmap_rw_bit(pmap);
5527				mask |= PG_RW | PG_M;
5528			}
5529			if (accessed) {
5530				PG_A = pmap_accessed_bit(pmap);
5531				PG_V = pmap_valid_bit(pmap);
5532				mask |= PG_V | PG_A;
5533			}
5534			rv = (*pte & mask) == mask;
5535			PMAP_UNLOCK(pmap);
5536			if (rv)
5537				goto out;
5538		}
5539	}
5540out:
5541	rw_runlock(lock);
5542	rw_runlock(&pvh_global_lock);
5543	return (rv);
5544}
5545
5546/*
5547 *	pmap_is_modified:
5548 *
5549 *	Return whether or not the specified physical page was modified
5550 *	in any physical maps.
5551 */
5552boolean_t
5553pmap_is_modified(vm_page_t m)
5554{
5555
5556	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5557	    ("pmap_is_modified: page %p is not managed", m));
5558
5559	/*
5560	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5561	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
5562	 * is clear, no PTEs can have PG_M set.
5563	 */
5564	VM_OBJECT_ASSERT_WLOCKED(m->object);
5565	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5566		return (FALSE);
5567	return (pmap_page_test_mappings(m, FALSE, TRUE));
5568}
5569
5570/*
5571 *	pmap_is_prefaultable:
5572 *
5573 *	Return whether or not the specified virtual address is eligible
5574 *	for prefault.
5575 */
5576boolean_t
5577pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
5578{
5579	pd_entry_t *pde;
5580	pt_entry_t *pte, PG_V;
5581	boolean_t rv;
5582
5583	PG_V = pmap_valid_bit(pmap);
5584	rv = FALSE;
5585	PMAP_LOCK(pmap);
5586	pde = pmap_pde(pmap, addr);
5587	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
5588		pte = pmap_pde_to_pte(pde, addr);
5589		rv = (*pte & PG_V) == 0;
5590	}
5591	PMAP_UNLOCK(pmap);
5592	return (rv);
5593}
5594
5595/*
5596 *	pmap_is_referenced:
5597 *
5598 *	Return whether or not the specified physical page was referenced
5599 *	in any physical maps.
5600 */
5601boolean_t
5602pmap_is_referenced(vm_page_t m)
5603{
5604
5605	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5606	    ("pmap_is_referenced: page %p is not managed", m));
5607	return (pmap_page_test_mappings(m, TRUE, FALSE));
5608}
5609
5610/*
5611 * Clear the write and modified bits in each of the given page's mappings.
5612 */
5613void
5614pmap_remove_write(vm_page_t m)
5615{
5616	struct md_page *pvh;
5617	pmap_t pmap;
5618	struct rwlock *lock;
5619	pv_entry_t next_pv, pv;
5620	pd_entry_t *pde;
5621	pt_entry_t oldpte, *pte, PG_M, PG_RW;
5622	vm_offset_t va;
5623	int pvh_gen, md_gen;
5624
5625	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5626	    ("pmap_remove_write: page %p is not managed", m));
5627
5628	/*
5629	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5630	 * set by another thread while the object is locked.  Thus,
5631	 * if PGA_WRITEABLE is clear, no page table entries need updating.
5632	 */
5633	VM_OBJECT_ASSERT_WLOCKED(m->object);
5634	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5635		return;
5636	rw_rlock(&pvh_global_lock);
5637	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5638	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5639retry_pv_loop:
5640	rw_wlock(lock);
5641	if ((m->flags & PG_FICTITIOUS) != 0)
5642		goto small_mappings;
5643	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5644		pmap = PV_PMAP(pv);
5645		if (!PMAP_TRYLOCK(pmap)) {
5646			pvh_gen = pvh->pv_gen;
5647			rw_wunlock(lock);
5648			PMAP_LOCK(pmap);
5649			rw_wlock(lock);
5650			if (pvh_gen != pvh->pv_gen) {
5651				PMAP_UNLOCK(pmap);
5652				rw_wunlock(lock);
5653				goto retry_pv_loop;
5654			}
5655		}
5656		PG_RW = pmap_rw_bit(pmap);
5657		va = pv->pv_va;
5658		pde = pmap_pde(pmap, va);
5659		if ((*pde & PG_RW) != 0)
5660			(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
5661		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5662		    ("inconsistent pv lock %p %p for page %p",
5663		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5664		PMAP_UNLOCK(pmap);
5665	}
5666small_mappings:
5667	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5668		pmap = PV_PMAP(pv);
5669		if (!PMAP_TRYLOCK(pmap)) {
5670			pvh_gen = pvh->pv_gen;
5671			md_gen = m->md.pv_gen;
5672			rw_wunlock(lock);
5673			PMAP_LOCK(pmap);
5674			rw_wlock(lock);
5675			if (pvh_gen != pvh->pv_gen ||
5676			    md_gen != m->md.pv_gen) {
5677				PMAP_UNLOCK(pmap);
5678				rw_wunlock(lock);
5679				goto retry_pv_loop;
5680			}
5681		}
5682		PG_M = pmap_modified_bit(pmap);
5683		PG_RW = pmap_rw_bit(pmap);
5684		pde = pmap_pde(pmap, pv->pv_va);
5685		KASSERT((*pde & PG_PS) == 0,
5686		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
5687		    m));
5688		pte = pmap_pde_to_pte(pde, pv->pv_va);
5689retry:
5690		oldpte = *pte;
5691		if (oldpte & PG_RW) {
5692			if (!atomic_cmpset_long(pte, oldpte, oldpte &
5693			    ~(PG_RW | PG_M)))
5694				goto retry;
5695			if ((oldpte & PG_M) != 0)
5696				vm_page_dirty(m);
5697			pmap_invalidate_page(pmap, pv->pv_va);
5698		}
5699		PMAP_UNLOCK(pmap);
5700	}
5701	rw_wunlock(lock);
5702	vm_page_aflag_clear(m, PGA_WRITEABLE);
5703	rw_runlock(&pvh_global_lock);
5704}
5705
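/*
 * Return TRUE if the accessed bit in the given PTE may be cleared safely.
 * For pmaps that emulate A/D bits (EPT), the accessed bit is EPT_PG_READ;
 * clearing it would leave an RWX combination that triggers an EPT
 * misconfiguration unless the remaining bits describe an execute-only
 * mapping on hardware that supports execute-only mappings.
 */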
5706static __inline boolean_t
5707safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
5708{
5709
5710	if (!pmap_emulate_ad_bits(pmap))
5711		return (TRUE);
5712
5713	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
5714
5715	/*
5716	 * RWX = 010 or 110 will cause an unconditional EPT misconfiguration,
5717	 * so we don't allow the referenced (aka EPT_PG_READ) bit to be cleared
5718	 * if the EPT_PG_WRITE bit is set.
5719	 */
5720	if ((pte & EPT_PG_WRITE) != 0)
5721		return (FALSE);
5722
5723	/*
5724	 * RWX = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY flag is set.
5725	 */
5726	if ((pte & EPT_PG_EXECUTE) == 0 ||
5727	    ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
5728		return (TRUE);
5729	else
5730		return (FALSE);
5731}
5732
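/*
 * Upper bound on the number of set reference bits that pmap_ts_referenced()
 * will process, whether it clears them or deliberately leaves them set,
 * before returning.
 */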
5733#define	PMAP_TS_REFERENCED_MAX	5
5734
5735/*
5736 *	pmap_ts_referenced:
5737 *
5738 *	Return a count of reference bits for a page, clearing those bits.
5739 *	It is not necessary for every reference bit to be cleared, but it
5740 *	is necessary that 0 only be returned when there are truly no
5741 *	reference bits set.
5742 *
5743 *	XXX: The exact number of bits to check and clear is a matter that
5744 *	should be tested and standardized at some point in the future for
5745 *	optimal aging of shared pages.
5746 */
5747int
5748pmap_ts_referenced(vm_page_t m)
5749{
5750	struct md_page *pvh;
5751	pv_entry_t pv, pvf;
5752	pmap_t pmap;
5753	struct rwlock *lock;
5754	pd_entry_t oldpde, *pde;
5755	pt_entry_t *pte, PG_A;
5756	vm_offset_t va;
5757	vm_paddr_t pa;
5758	int cleared, md_gen, not_cleared, pvh_gen;
5759	struct spglist free;
5760	boolean_t demoted;
5761
5762	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5763	    ("pmap_ts_referenced: page %p is not managed", m));
5764	SLIST_INIT(&free);
5765	cleared = 0;
5766	pa = VM_PAGE_TO_PHYS(m);
5767	lock = PHYS_TO_PV_LIST_LOCK(pa);
5768	pvh = pa_to_pvh(pa);
5769	rw_rlock(&pvh_global_lock);
5770	rw_wlock(lock);
5771retry:
5772	not_cleared = 0;
5773	if ((m->flags & PG_FICTITIOUS) != 0 ||
5774	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
5775		goto small_mappings;
5776	pv = pvf;
5777	do {
5778		if (pvf == NULL)
5779			pvf = pv;
5780		pmap = PV_PMAP(pv);
5781		if (!PMAP_TRYLOCK(pmap)) {
5782			pvh_gen = pvh->pv_gen;
5783			rw_wunlock(lock);
5784			PMAP_LOCK(pmap);
5785			rw_wlock(lock);
5786			if (pvh_gen != pvh->pv_gen) {
5787				PMAP_UNLOCK(pmap);
5788				goto retry;
5789			}
5790		}
5791		PG_A = pmap_accessed_bit(pmap);
5792		va = pv->pv_va;
5793		pde = pmap_pde(pmap, pv->pv_va);
5794		oldpde = *pde;
5795		if ((*pde & PG_A) != 0) {
5796			/*
5797			 * Since this reference bit is shared by 512 4KB
5798			 * pages, it should not be cleared every time it is
5799			 * tested.  Apply a simple "hash" function on the
5800			 * physical page number, the virtual superpage number,
5801			 * and the pmap address to select one 4KB page out of
5802			 * the 512 on which testing the reference bit will
5803			 * result in clearing that reference bit.  This
5804			 * function is designed to avoid the selection of the
5805			 * same 4KB page for every 2MB page mapping.
5806			 *
5807			 * On demotion, a mapping that hasn't been referenced
5808			 * is simply destroyed.  To avoid the possibility of a
5809			 * subsequent page fault on a demoted wired mapping,
5810			 * always leave its reference bit set.  Moreover,
5811			 * since the superpage is wired, the current state of
5812			 * its reference bit won't affect page replacement.
5813			 */
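			/*
			 * Concretely: (pv->pv_va >> PDRSHIFT) and the pmap
			 * address are fixed for this 2MB mapping, while
			 * (pa >> PAGE_SHIFT) takes 512 consecutive values
			 * across the physically contiguous superpage, so the
			 * low 9 bits (NPTEPG - 1) of the XOR below are zero
			 * for exactly one of the 512 constituent 4KB pages.
			 */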
5814			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
5815			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
5816			    (*pde & PG_W) == 0) {
5817				if (safe_to_clear_referenced(pmap, oldpde)) {
5818					atomic_clear_long(pde, PG_A);
5819					pmap_invalidate_page(pmap, pv->pv_va);
5820					demoted = FALSE;
5821				} else if (pmap_demote_pde_locked(pmap, pde,
5822				    pv->pv_va, &lock)) {
5823					/*
5824					 * Remove the mapping to a single page
5825					 * so that a subsequent access may
5826					 * repromote.  Since the underlying
5827					 * page table page is fully populated,
5828					 * this removal never frees a page
5829					 * table page.
5830					 */
5831					demoted = TRUE;
5832					va += VM_PAGE_TO_PHYS(m) - (oldpde &
5833					    PG_PS_FRAME);
5834					pte = pmap_pde_to_pte(pde, va);
5835					pmap_remove_pte(pmap, pte, va, *pde,
5836					    NULL, &lock);
5837					pmap_invalidate_page(pmap, va);
5838				} else
5839					demoted = TRUE;
5840
5841				if (demoted) {
5842					/*
5843					 * The superpage mapping was removed
5844					 * entirely and therefore 'pv' is no
5845					 * longer valid.
5846					 */
5847					if (pvf == pv)
5848						pvf = NULL;
5849					pv = NULL;
5850				}
5851				cleared++;
5852				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5853				    ("inconsistent pv lock %p %p for page %p",
5854				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5855			} else
5856				not_cleared++;
5857		}
5858		PMAP_UNLOCK(pmap);
5859		/* Rotate the PV list if it has more than one entry. */
5860		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
5861			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5862			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
5863			pvh->pv_gen++;
5864		}
5865		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
5866			goto out;
5867	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
5868small_mappings:
5869	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
5870		goto out;
5871	pv = pvf;
5872	do {
5873		if (pvf == NULL)
5874			pvf = pv;
5875		pmap = PV_PMAP(pv);
5876		if (!PMAP_TRYLOCK(pmap)) {
5877			pvh_gen = pvh->pv_gen;
5878			md_gen = m->md.pv_gen;
5879			rw_wunlock(lock);
5880			PMAP_LOCK(pmap);
5881			rw_wlock(lock);
5882			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5883				PMAP_UNLOCK(pmap);
5884				goto retry;
5885			}
5886		}
5887		PG_A = pmap_accessed_bit(pmap);
5888		pde = pmap_pde(pmap, pv->pv_va);
5889		KASSERT((*pde & PG_PS) == 0,
5890		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
5891		    m));
5892		pte = pmap_pde_to_pte(pde, pv->pv_va);
5893		if ((*pte & PG_A) != 0) {
5894			if (safe_to_clear_referenced(pmap, *pte)) {
5895				atomic_clear_long(pte, PG_A);
5896				pmap_invalidate_page(pmap, pv->pv_va);
5897				cleared++;
5898			} else if ((*pte & PG_W) == 0) {
5899				/*
5900				 * Wired pages cannot be paged out, so
5901				 * doing accessed bit emulation for
5902				 * them is wasted effort. We do the
5903				 * hard work for unwired pages only.
5904				 */
5905				pmap_remove_pte(pmap, pte, pv->pv_va,
5906				    *pde, &free, &lock);
5907				pmap_invalidate_page(pmap, pv->pv_va);
5908				cleared++;
5909				if (pvf == pv)
5910					pvf = NULL;
5911				pv = NULL;
5912				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5913				    ("inconsistent pv lock %p %p for page %p",
5914				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5915			} else
5916				not_cleared++;
5917		}
5918		PMAP_UNLOCK(pmap);
5919		/* Rotate the PV list if it has more than one entry. */
5920		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
5921			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5922			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5923			m->md.pv_gen++;
5924		}
5925	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
5926	    not_cleared < PMAP_TS_REFERENCED_MAX);
5927out:
5928	rw_wunlock(lock);
5929	rw_runlock(&pvh_global_lock);
5930	pmap_free_zero_pages(&free);
5931	return (cleared + not_cleared);
5932}
5933
5934/*
5935 *	Apply the given advice to the specified range of addresses within the
5936 *	given pmap.  Depending on the advice, clear the referenced and/or
5937 *	modified flags in each mapping and set the mapped page's dirty field.
5938 */
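/*
 * For MADV_DONTNEED, a dirty mapping's modified state is transferred to the
 * vm_page (via vm_page_dirty()) before PG_M is cleared; for MADV_FREE, the
 * bits are simply cleared.
 */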
5939void
5940pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
5941{
5942	struct rwlock *lock;
5943	pml4_entry_t *pml4e;
5944	pdp_entry_t *pdpe;
5945	pd_entry_t oldpde, *pde;
5946	pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
5947	vm_offset_t va_next;
5948	vm_page_t m;
5949	boolean_t anychanged, pv_lists_locked;
5950
5951	if (advice != MADV_DONTNEED && advice != MADV_FREE)
5952		return;
5953
5954	/*
5955	 * A/D bit emulation requires an alternate code path when clearing
5956	 * the modified and accessed bits below.  Since this function is
5957	 * advisory in nature, we skip it entirely for pmaps that require
5958	 * A/D bit emulation.
5959	 */
5960	if (pmap_emulate_ad_bits(pmap))
5961		return;
5962
5963	PG_A = pmap_accessed_bit(pmap);
5964	PG_G = pmap_global_bit(pmap);
5965	PG_M = pmap_modified_bit(pmap);
5966	PG_V = pmap_valid_bit(pmap);
5967	PG_RW = pmap_rw_bit(pmap);
5968
5969	pv_lists_locked = FALSE;
5970resume:
5971	anychanged = FALSE;
5972	PMAP_LOCK(pmap);
5973	for (; sva < eva; sva = va_next) {
5974		pml4e = pmap_pml4e(pmap, sva);
5975		if ((*pml4e & PG_V) == 0) {
5976			va_next = (sva + NBPML4) & ~PML4MASK;
5977			if (va_next < sva)
5978				va_next = eva;
5979			continue;
5980		}
5981		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
5982		if ((*pdpe & PG_V) == 0) {
5983			va_next = (sva + NBPDP) & ~PDPMASK;
5984			if (va_next < sva)
5985				va_next = eva;
5986			continue;
5987		}
5988		va_next = (sva + NBPDR) & ~PDRMASK;
5989		if (va_next < sva)
5990			va_next = eva;
5991		pde = pmap_pdpe_to_pde(pdpe, sva);
5992		oldpde = *pde;
5993		if ((oldpde & PG_V) == 0)
5994			continue;
5995		else if ((oldpde & PG_PS) != 0) {
5996			if ((oldpde & PG_MANAGED) == 0)
5997				continue;
5998			if (!pv_lists_locked) {
5999				pv_lists_locked = TRUE;
6000				if (!rw_try_rlock(&pvh_global_lock)) {
6001					if (anychanged)
6002						pmap_invalidate_all(pmap);
6003					PMAP_UNLOCK(pmap);
6004					rw_rlock(&pvh_global_lock);
6005					goto resume;
6006				}
6007			}
6008			lock = NULL;
6009			if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
6010				if (lock != NULL)
6011					rw_wunlock(lock);
6012
6013				/*
6014				 * The large page mapping was destroyed.
6015				 */
6016				continue;
6017			}
6018
6019			/*
6020			 * Unless the page mappings are wired, remove the
6021			 * mapping to a single page so that a subsequent
6022			 * access may repromote.  Since the underlying page
6023			 * table page is fully populated, this removal never
6024			 * frees a page table page.
6025			 */
6026			if ((oldpde & PG_W) == 0) {
6027				pte = pmap_pde_to_pte(pde, sva);
6028				KASSERT((*pte & PG_V) != 0,
6029				    ("pmap_advise: invalid PTE"));
6030				pmap_remove_pte(pmap, pte, sva, *pde, NULL,
6031				    &lock);
6032				anychanged = TRUE;
6033			}
6034			if (lock != NULL)
6035				rw_wunlock(lock);
6036		}
6037		if (va_next > eva)
6038			va_next = eva;
6039		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
6040		    sva += PAGE_SIZE) {
6041			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
6042			    PG_V))
6043				continue;
6044			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6045				if (advice == MADV_DONTNEED) {
6046					/*
6047					 * Future calls to pmap_is_modified()
6048					 * can be avoided by making the page
6049					 * dirty now.
6050					 */
6051					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
6052					vm_page_dirty(m);
6053				}
6054				atomic_clear_long(pte, PG_M | PG_A);
6055			} else if ((*pte & PG_A) != 0)
6056				atomic_clear_long(pte, PG_A);
6057			else
6058				continue;
6059			if ((*pte & PG_G) != 0)
6060				pmap_invalidate_page(pmap, sva);
6061			else
6062				anychanged = TRUE;
6063		}
6064	}
6065	if (anychanged)
6066		pmap_invalidate_all(pmap);
6067	if (pv_lists_locked)
6068		rw_runlock(&pvh_global_lock);
6069	PMAP_UNLOCK(pmap);
6070}
6071
6072/*
6073 *	Clear the modify bits on the specified physical page.
6074 */
6075void
6076pmap_clear_modify(vm_page_t m)
6077{
6078	struct md_page *pvh;
6079	pmap_t pmap;
6080	pv_entry_t next_pv, pv;
6081	pd_entry_t oldpde, *pde;
6082	pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
6083	struct rwlock *lock;
6084	vm_offset_t va;
6085	int md_gen, pvh_gen;
6086
6087	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6088	    ("pmap_clear_modify: page %p is not managed", m));
6089	VM_OBJECT_ASSERT_WLOCKED(m->object);
6090	KASSERT(!vm_page_xbusied(m),
6091	    ("pmap_clear_modify: page %p is exclusive busied", m));
6092
6093	/*
6094	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
6095	 * If the object containing the page is locked and the page is not
6096	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
6097	 */
6098	if ((m->aflags & PGA_WRITEABLE) == 0)
6099		return;
6100	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6101	rw_rlock(&pvh_global_lock);
6102	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6103	rw_wlock(lock);
6104restart:
6105	if ((m->flags & PG_FICTITIOUS) != 0)
6106		goto small_mappings;
6107	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
6108		pmap = PV_PMAP(pv);
6109		if (!PMAP_TRYLOCK(pmap)) {
6110			pvh_gen = pvh->pv_gen;
6111			rw_wunlock(lock);
6112			PMAP_LOCK(pmap);
6113			rw_wlock(lock);
6114			if (pvh_gen != pvh->pv_gen) {
6115				PMAP_UNLOCK(pmap);
6116				goto restart;
6117			}
6118		}
6119		PG_M = pmap_modified_bit(pmap);
6120		PG_V = pmap_valid_bit(pmap);
6121		PG_RW = pmap_rw_bit(pmap);
6122		va = pv->pv_va;
6123		pde = pmap_pde(pmap, va);
6124		oldpde = *pde;
6125		if ((oldpde & PG_RW) != 0) {
6126			if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
6127				if ((oldpde & PG_W) == 0) {
6128					/*
6129					 * Write protect the mapping to a
6130					 * single page so that a subsequent
6131					 * write access may repromote.
6132					 */
6133					va += VM_PAGE_TO_PHYS(m) - (oldpde &
6134					    PG_PS_FRAME);
6135					pte = pmap_pde_to_pte(pde, va);
6136					oldpte = *pte;
6137					if ((oldpte & PG_V) != 0) {
6138						while (!atomic_cmpset_long(pte,
6139						    oldpte,
6140						    oldpte & ~(PG_M | PG_RW)))
6141							oldpte = *pte;
6142						vm_page_dirty(m);
6143						pmap_invalidate_page(pmap, va);
6144					}
6145				}
6146			}
6147		}
6148		PMAP_UNLOCK(pmap);
6149	}
6150small_mappings:
6151	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6152		pmap = PV_PMAP(pv);
6153		if (!PMAP_TRYLOCK(pmap)) {
6154			md_gen = m->md.pv_gen;
6155			pvh_gen = pvh->pv_gen;
6156			rw_wunlock(lock);
6157			PMAP_LOCK(pmap);
6158			rw_wlock(lock);
6159			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6160				PMAP_UNLOCK(pmap);
6161				goto restart;
6162			}
6163		}
6164		PG_M = pmap_modified_bit(pmap);
6165		PG_RW = pmap_rw_bit(pmap);
6166		pde = pmap_pde(pmap, pv->pv_va);
6167		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
6168		    " a 2mpage in page %p's pv list", m));
6169		pte = pmap_pde_to_pte(pde, pv->pv_va);
6170		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6171			atomic_clear_long(pte, PG_M);
6172			pmap_invalidate_page(pmap, pv->pv_va);
6173		}
6174		PMAP_UNLOCK(pmap);
6175	}
6176	rw_wunlock(lock);
6177	rw_runlock(&pvh_global_lock);
6178}
6179
6180/*
6181 * Miscellaneous support routines follow
6182 */
6183
6184/* Adjust the cache mode for a 4KB page mapped via a PTE. */
6185static __inline void
6186pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
6187{
6188	u_int opte, npte;
6189
6190	/*
6191	 * The cache mode bits are all in the low 32 bits of the
6192	 * PTE, so we can just spin on updating the low 32 bits.
6193	 */
6194	do {
6195		opte = *(u_int *)pte;
6196		npte = opte & ~mask;
6197		npte |= cache_bits;
6198	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
6199}
6200
6201/* Adjust the cache mode for a 2MB page mapped via a PDE. */
6202static __inline void
6203pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
6204{
6205	u_int opde, npde;
6206
6207	/*
6208	 * The cache mode bits are all in the low 32 bits of the
6209	 * PDE, so we can just spin on updating the low 32 bits.
6210	 */
6211	do {
6212		opde = *(u_int *)pde;
6213		npde = opde & ~mask;
6214		npde |= cache_bits;
6215	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
6216}
6217
6218/*
6219 * Map a set of physical memory pages into the kernel virtual
6220 * address space. Return a pointer to where it is mapped. This
6221 * routine is intended to be used for mapping device memory,
6222 * NOT real memory.
6223 */
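/*
 * Illustrative usage (identifiers hypothetical): a driver could map a
 * device BAR at physical address bar_pa of size bar_len write-combining
 * with
 *
 *	regs = pmap_mapdev_attr(bar_pa, bar_len, PAT_WRITE_COMBINING);
 *
 * and later release the mapping with
 * pmap_unmapdev((vm_offset_t)regs, bar_len).
 */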
6224void *
6225pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
6226{
6227	vm_offset_t va, offset;
6228	vm_size_t tmpsize;
6229
6230	/*
6231	 * If the specified range of physical addresses fits within the direct
6232	 * map window, use the direct map.
6233	 */
6234	if (pa < dmaplimit && pa + size < dmaplimit) {
6235		va = PHYS_TO_DMAP(pa);
6236		if (!pmap_change_attr(va, size, mode))
6237			return ((void *)va);
6238	}
6239	offset = pa & PAGE_MASK;
6240	size = round_page(offset + size);
6241	va = kva_alloc(size);
6242	if (!va)
6243		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
6244	pa = trunc_page(pa);
6245	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
6246		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
6247	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
6248	pmap_invalidate_cache_range(va, va + tmpsize, FALSE);
6249	return ((void *)(va + offset));
6250}
6251
6252void *
6253pmap_mapdev(vm_paddr_t pa, vm_size_t size)
6254{
6255
6256	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
6257}
6258
6259void *
6260pmap_mapbios(vm_paddr_t pa, vm_size_t size)
6261{
6262
6263	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
6264}
6265
6266void
6267pmap_unmapdev(vm_offset_t va, vm_size_t size)
6268{
6269	vm_offset_t base, offset;
6270
6271	/* If we handed out a direct map address in pmap_mapdev, do nothing. */
6272	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
6273		return;
6274	base = trunc_page(va);
6275	offset = va & PAGE_MASK;
6276	size = round_page(offset + size);
6277	kva_free(base, size);
6278}
6279
6280/*
6281 * Tries to demote a 1GB page mapping.
6282 */
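/*
 * Demotion allocates a new page directory page and fills its 512 PDEs with
 * 2MB mappings that together cover the same 1GB region with the original
 * protection and attribute bits; it fails only if that page cannot be
 * allocated.
 */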
6283static boolean_t
6284pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
6285{
6286	pdp_entry_t newpdpe, oldpdpe;
6287	pd_entry_t *firstpde, newpde, *pde;
6288	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
6289	vm_paddr_t mpdepa;
6290	vm_page_t mpde;
6291
6292	PG_A = pmap_accessed_bit(pmap);
6293	PG_M = pmap_modified_bit(pmap);
6294	PG_V = pmap_valid_bit(pmap);
6295	PG_RW = pmap_rw_bit(pmap);
6296
6297	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6298	oldpdpe = *pdpe;
6299	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
6300	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
6301	if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
6302	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
6303		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
6304		    " in pmap %p", va, pmap);
6305		return (FALSE);
6306	}
6307	mpdepa = VM_PAGE_TO_PHYS(mpde);
6308	firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
6309	newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
6310	KASSERT((oldpdpe & PG_A) != 0,
6311	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
6312	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
6313	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
6314	newpde = oldpdpe;
6315
6316	/*
6317	 * Initialize the page directory page.
6318	 */
6319	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
6320		*pde = newpde;
6321		newpde += NBPDR;
6322	}
6323
6324	/*
6325	 * Demote the mapping.
6326	 */
6327	*pdpe = newpdpe;
6328
6329	/*
6330	 * Invalidate a stale recursive mapping of the page directory page.
6331	 */
6332	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
6333
6334	pmap_pdpe_demotions++;
6335	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
6336	    " in pmap %p", va, pmap);
6337	return (TRUE);
6338}
6339
6340/*
6341 * Sets the memory attribute for the specified page.
6342 */
6343void
6344pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
6345{
6346
6347	m->md.pat_mode = ma;
6348
6349	/*
6350	 * If "m" is a normal page, update its direct mapping.  This update
6351	 * can be relied upon to perform any cache operations that are
6352	 * required for data coherence.
6353	 */
6354	if ((m->flags & PG_FICTITIOUS) == 0 &&
6355	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
6356	    m->md.pat_mode))
6357		panic("memory attribute change on the direct map failed");
6358}
6359
6360/*
6361 * Changes the specified virtual address range's memory type to that given by
6362 * the parameter "mode".  The specified virtual address range must be
6363 * completely contained within either the direct map or the kernel map.  If
6364 * the virtual address range is contained within the kernel map, then the
6365 * memory type for each of the corresponding ranges of the direct map is also
6366 * changed.  (The corresponding ranges of the direct map are those ranges that
6367 * map the same physical pages as the specified virtual address range.)  These
6368 * changes to the direct map are necessary because Intel describes the
6369 * behavior of their processors as "undefined" if two or more mappings to the
6370 * same physical page have different memory types.
6371 *
6372 * Returns zero if the change completed successfully, and either EINVAL or
6373 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
6374 * of the virtual address range was not mapped, and ENOMEM is returned if
6375 * there was insufficient memory available to complete the change.  In the
6376 * latter case, the memory type may have been changed on some part of the
6377 * virtual address range or the direct map.
6378 */
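/*
 * Illustrative example (fb_pa and fb_size are hypothetical): to make the
 * direct map view of a framebuffer write-combining, a caller might use
 *
 *	error = pmap_change_attr(PHYS_TO_DMAP(fb_pa), fb_size,
 *	    PAT_WRITE_COMBINING);
 *
 * The range must lie entirely within the direct map or the kernel map, as
 * described above.
 */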
6379int
6380pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
6381{
6382	int error;
6383
6384	PMAP_LOCK(kernel_pmap);
6385	error = pmap_change_attr_locked(va, size, mode);
6386	PMAP_UNLOCK(kernel_pmap);
6387	return (error);
6388}
6389
6390static int
6391pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
6392{
6393	vm_offset_t base, offset, tmpva;
6394	vm_paddr_t pa_start, pa_end;
6395	pdp_entry_t *pdpe;
6396	pd_entry_t *pde;
6397	pt_entry_t *pte;
6398	int cache_bits_pte, cache_bits_pde, error;
6399	boolean_t changed;
6400
6401	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6402	base = trunc_page(va);
6403	offset = va & PAGE_MASK;
6404	size = round_page(offset + size);
6405
6406	/*
6407	 * Only supported on kernel virtual addresses, including the direct
6408	 * map but excluding the recursive map.
6409	 */
6410	if (base < DMAP_MIN_ADDRESS)
6411		return (EINVAL);
6412
6413	cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
6414	cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
6415	changed = FALSE;
6416
6417	/*
6418	 * Pages that aren't mapped aren't supported.  Also break down 1GB and
6419	 * 2MB pages into smaller pages if required.
6420	 */
6421	for (tmpva = base; tmpva < base + size; ) {
6422		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6423		if (*pdpe == 0)
6424			return (EINVAL);
6425		if (*pdpe & PG_PS) {
6426			/*
6427			 * If the current 1GB page already has the required
6428			 * memory type, then we need not demote this page. Just
6429			 * increment tmpva to the next 1GB page frame.
6430			 */
6431			if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
6432				tmpva = trunc_1gpage(tmpva) + NBPDP;
6433				continue;
6434			}
6435
6436			/*
6437			 * If the current offset aligns with a 1GB page frame
6438			 * and there is at least 1GB left within the range, then
6439			 * we need not break down this page into 2MB pages.
6440			 */
6441			if ((tmpva & PDPMASK) == 0 &&
6442			    tmpva + PDPMASK < base + size) {
6443				tmpva += NBPDP;
6444				continue;
6445			}
6446			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
6447				return (ENOMEM);
6448		}
6449		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6450		if (*pde == 0)
6451			return (EINVAL);
6452		if (*pde & PG_PS) {
6453			/*
6454			 * If the current 2MB page already has the required
6455			 * memory type, then we need not demote this page. Just
6456			 * increment tmpva to the next 2MB page frame.
6457			 */
6458			if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
6459				tmpva = trunc_2mpage(tmpva) + NBPDR;
6460				continue;
6461			}
6462
6463			/*
6464			 * If the current offset aligns with a 2MB page frame
6465			 * and there is at least 2MB left within the range, then
6466			 * we need not break down this page into 4KB pages.
6467			 */
6468			if ((tmpva & PDRMASK) == 0 &&
6469			    tmpva + PDRMASK < base + size) {
6470				tmpva += NBPDR;
6471				continue;
6472			}
6473			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
6474				return (ENOMEM);
6475		}
6476		pte = pmap_pde_to_pte(pde, tmpva);
6477		if (*pte == 0)
6478			return (EINVAL);
6479		tmpva += PAGE_SIZE;
6480	}
6481	error = 0;
6482
6483	/*
6484	 * Ok, all the pages exist, so run through them updating their
6485	 * cache mode if required.
6486	 */
6487	pa_start = pa_end = 0;
6488	for (tmpva = base; tmpva < base + size; ) {
6489		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6490		if (*pdpe & PG_PS) {
6491			if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
6492				pmap_pde_attr(pdpe, cache_bits_pde,
6493				    X86_PG_PDE_CACHE);
6494				changed = TRUE;
6495			}
6496			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6497				if (pa_start == pa_end) {
6498					/* Start physical address run. */
6499					pa_start = *pdpe & PG_PS_FRAME;
6500					pa_end = pa_start + NBPDP;
6501				} else if (pa_end == (*pdpe & PG_PS_FRAME))
6502					pa_end += NBPDP;
6503				else {
6504					/* Run ended, update direct map. */
6505					error = pmap_change_attr_locked(
6506					    PHYS_TO_DMAP(pa_start),
6507					    pa_end - pa_start, mode);
6508					if (error != 0)
6509						break;
6510					/* Start physical address run. */
6511					pa_start = *pdpe & PG_PS_FRAME;
6512					pa_end = pa_start + NBPDP;
6513				}
6514			}
6515			tmpva = trunc_1gpage(tmpva) + NBPDP;
6516			continue;
6517		}
6518		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6519		if (*pde & PG_PS) {
6520			if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
6521				pmap_pde_attr(pde, cache_bits_pde,
6522				    X86_PG_PDE_CACHE);
6523				changed = TRUE;
6524			}
6525			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6526				if (pa_start == pa_end) {
6527					/* Start physical address run. */
6528					pa_start = *pde & PG_PS_FRAME;
6529					pa_end = pa_start + NBPDR;
6530				} else if (pa_end == (*pde & PG_PS_FRAME))
6531					pa_end += NBPDR;
6532				else {
6533					/* Run ended, update direct map. */
6534					error = pmap_change_attr_locked(
6535					    PHYS_TO_DMAP(pa_start),
6536					    pa_end - pa_start, mode);
6537					if (error != 0)
6538						break;
6539					/* Start physical address run. */
6540					pa_start = *pde & PG_PS_FRAME;
6541					pa_end = pa_start + NBPDR;
6542				}
6543			}
6544			tmpva = trunc_2mpage(tmpva) + NBPDR;
6545		} else {
6546			pte = pmap_pde_to_pte(pde, tmpva);
6547			if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
6548				pmap_pte_attr(pte, cache_bits_pte,
6549				    X86_PG_PTE_CACHE);
6550				changed = TRUE;
6551			}
6552			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6553				if (pa_start == pa_end) {
6554					/* Start physical address run. */
6555					pa_start = *pte & PG_FRAME;
6556					pa_end = pa_start + PAGE_SIZE;
6557				} else if (pa_end == (*pte & PG_FRAME))
6558					pa_end += PAGE_SIZE;
6559				else {
6560					/* Run ended, update direct map. */
6561					error = pmap_change_attr_locked(
6562					    PHYS_TO_DMAP(pa_start),
6563					    pa_end - pa_start, mode);
6564					if (error != 0)
6565						break;
6566					/* Start physical address run. */
6567					pa_start = *pte & PG_FRAME;
6568					pa_end = pa_start + PAGE_SIZE;
6569				}
6570			}
6571			tmpva += PAGE_SIZE;
6572		}
6573	}
6574	if (error == 0 && pa_start != pa_end)
6575		error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
6576		    pa_end - pa_start, mode);
6577
6578	/*
6579	 * Flush the CPU caches if required, so that no stale data remains
6580	 * cached under the old memory type.
6581	 */
6582	if (changed) {
6583		pmap_invalidate_range(kernel_pmap, base, tmpva);
6584		pmap_invalidate_cache_range(base, tmpva, FALSE);
6585	}
6586	return (error);
6587}
6588
6589/*
6590 * Demotes any mapping within the direct map region that covers more than the
6591 * specified range of physical addresses.  This range's size must be a power
6592 * of two and its starting address must be a multiple of its size.  Since the
6593 * demotion does not change any attributes of the mapping, a TLB invalidation
6594 * is not mandatory.  The caller may, however, request a TLB invalidation.
6595 */
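/*
 * For example (illustrative), ensuring that a single 4KB frame at physical
 * address pa is not covered by a 2MB or 1GB direct mapping could be done
 * with pmap_demote_DMAP(trunc_page(pa), PAGE_SIZE, TRUE), since PAGE_SIZE
 * is a power of two and trunc_page(pa) is a multiple of it.
 */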
6596void
6597pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
6598{
6599	pdp_entry_t *pdpe;
6600	pd_entry_t *pde;
6601	vm_offset_t va;
6602	boolean_t changed;
6603
6604	if (len == 0)
6605		return;
6606	KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
6607	KASSERT((base & (len - 1)) == 0,
6608	    ("pmap_demote_DMAP: base is not a multiple of len"));
6609	if (len < NBPDP && base < dmaplimit) {
6610		va = PHYS_TO_DMAP(base);
6611		changed = FALSE;
6612		PMAP_LOCK(kernel_pmap);
6613		pdpe = pmap_pdpe(kernel_pmap, va);
6614		if ((*pdpe & X86_PG_V) == 0)
6615			panic("pmap_demote_DMAP: invalid PDPE");
6616		if ((*pdpe & PG_PS) != 0) {
6617			if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
6618				panic("pmap_demote_DMAP: PDPE failed");
6619			changed = TRUE;
6620		}
6621		if (len < NBPDR) {
6622			pde = pmap_pdpe_to_pde(pdpe, va);
6623			if ((*pde & X86_PG_V) == 0)
6624				panic("pmap_demote_DMAP: invalid PDE");
6625			if ((*pde & PG_PS) != 0) {
6626				if (!pmap_demote_pde(kernel_pmap, pde, va))
6627					panic("pmap_demote_DMAP: PDE failed");
6628				changed = TRUE;
6629			}
6630		}
6631		if (changed && invalidate)
6632			pmap_invalidate_page(kernel_pmap, va);
6633		PMAP_UNLOCK(kernel_pmap);
6634	}
6635}
6636
6637/*
6638	 * Perform the pmap work for mincore(2).
6639 */
6640int
6641pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
6642{
6643	pd_entry_t *pdep;
6644	pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
6645	vm_paddr_t pa;
6646	int val;
6647
6648	PG_A = pmap_accessed_bit(pmap);
6649	PG_M = pmap_modified_bit(pmap);
6650	PG_V = pmap_valid_bit(pmap);
6651	PG_RW = pmap_rw_bit(pmap);
6652
6653	PMAP_LOCK(pmap);
6654retry:
6655	pdep = pmap_pde(pmap, addr);
6656	if (pdep != NULL && (*pdep & PG_V)) {
6657		if (*pdep & PG_PS) {
6658			pte = *pdep;
6659			/* Compute the physical address of the 4KB page. */
6660			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
6661			    PG_FRAME;
6662			val = MINCORE_SUPER;
6663		} else {
6664			pte = *pmap_pde_to_pte(pdep, addr);
6665			pa = pte & PG_FRAME;
6666			val = 0;
6667		}
6668	} else {
6669		pte = 0;
6670		pa = 0;
6671		val = 0;
6672	}
6673	if ((pte & PG_V) != 0) {
6674		val |= MINCORE_INCORE;
6675		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6676			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
6677		if ((pte & PG_A) != 0)
6678			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
6679	}
6680	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
6681	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
6682	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
6683		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
6684		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
6685			goto retry;
6686	} else
6687		PA_UNLOCK_COND(*locked_pa);
6688	PMAP_UNLOCK(pmap);
6689	return (val);
6690}
6691
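/*
 * Switch the current CPU to the given thread's address space: update the
 * old and new pmaps' pm_active masks (and the new pmap's pm_save mask),
 * record the new page table base in the PCB, load %cr3, and set curpmap.
 * The critical section prevents migration to another CPU mid-switch.
 */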
6692void
6693pmap_activate(struct thread *td)
6694{
6695	pmap_t	pmap, oldpmap;
6696	u_int	cpuid;
6697
6698	critical_enter();
6699	pmap = vmspace_pmap(td->td_proc->p_vmspace);
6700	oldpmap = PCPU_GET(curpmap);
6701	cpuid = PCPU_GET(cpuid);
6702#ifdef SMP
6703	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
6704	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
6705	CPU_SET_ATOMIC(cpuid, &pmap->pm_save);
6706#else
6707	CPU_CLR(cpuid, &oldpmap->pm_active);
6708	CPU_SET(cpuid, &pmap->pm_active);
6709	CPU_SET(cpuid, &pmap->pm_save);
6710#endif
6711	td->td_pcb->pcb_cr3 = pmap->pm_cr3;
6712	load_cr3(pmap->pm_cr3);
6713	PCPU_SET(curpmap, pmap);
6714	critical_exit();
6715}
6716
6717void
6718pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
6719{
6720}
6721
6722/*
6723 *	Increase the starting virtual address of the given mapping if a
6724 *	different alignment might result in more superpage mappings.
6725 */
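/*
 * For example (illustrative values), with 2MB superpages, an object offset
 * congruent to 0x1000 modulo NBPDR and a candidate *addr of 0x800000000
 * would, for a sufficiently large mapping, yield *addr = 0x800001000, so
 * that the virtual address and the object offset are congruent modulo
 * NBPDR and the mapping is eligible for later promotion.
 */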
6726void
6727pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
6728    vm_offset_t *addr, vm_size_t size)
6729{
6730	vm_offset_t superpage_offset;
6731
6732	if (size < NBPDR)
6733		return;
6734	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
6735		offset += ptoa(object->pg_color);
6736	superpage_offset = offset & PDRMASK;
6737	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
6738	    (*addr & PDRMASK) == superpage_offset)
6739		return;
6740	if ((*addr & PDRMASK) < superpage_offset)
6741		*addr = (*addr & ~PDRMASK) + superpage_offset;
6742	else
6743		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
6744}
6745
6746#ifdef INVARIANTS
6747static unsigned long num_dirty_emulations;
6748SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
6749	     &num_dirty_emulations, 0, NULL);
6750
6751static unsigned long num_accessed_emulations;
6752SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
6753	     &num_accessed_emulations, 0, NULL);
6754
6755static unsigned long num_superpage_accessed_emulations;
6756SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
6757	     &num_superpage_accessed_emulations, 0, NULL);
6758
6759static unsigned long ad_emulation_superpage_promotions;
6760SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
6761	     &ad_emulation_superpage_promotions, 0, NULL);
6762#endif	/* INVARIANTS */
6763
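/*
 * Handle a page fault taken because a pmap that emulates accessed/dirty
 * bits (EPT without hardware A/D support) has the accessed or modified bit
 * clear in an otherwise valid mapping.  Set the bit(s) in software,
 * opportunistically attempt superpage promotion, and return 0 on success;
 * return -1 if the fault cannot be handled here and must be resolved by
 * the normal fault path.
 */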
6764int
6765pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
6766{
6767	int rv;
6768	struct rwlock *lock;
6769	vm_page_t m, mpte;
6770	pd_entry_t *pde;
6771	pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
6772	boolean_t pv_lists_locked;
6773
6774	KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
6775	    ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
6776
6777	if (!pmap_emulate_ad_bits(pmap))
6778		return (-1);
6779
6780	PG_A = pmap_accessed_bit(pmap);
6781	PG_M = pmap_modified_bit(pmap);
6782	PG_V = pmap_valid_bit(pmap);
6783	PG_RW = pmap_rw_bit(pmap);
6784
6785	rv = -1;
6786	lock = NULL;
6787	pv_lists_locked = FALSE;
6788retry:
6789	PMAP_LOCK(pmap);
6790
6791	pde = pmap_pde(pmap, va);
6792	if (pde == NULL || (*pde & PG_V) == 0)
6793		goto done;
6794
6795	if ((*pde & PG_PS) != 0) {
6796		if (ftype == VM_PROT_READ) {
6797#ifdef INVARIANTS
6798			atomic_add_long(&num_superpage_accessed_emulations, 1);
6799#endif
6800			*pde |= PG_A;
6801			rv = 0;
6802		}
6803		goto done;
6804	}
6805
6806	pte = pmap_pde_to_pte(pde, va);
6807	if ((*pte & PG_V) == 0)
6808		goto done;
6809
6810	if (ftype == VM_PROT_WRITE) {
6811		if ((*pte & PG_RW) == 0)
6812			goto done;
6813		/*
6814		 * Set the modified and accessed bits simultaneously.
6815		 *
6816		 * Intel EPT PTEs that do software emulation of A/D bits map
6817		 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
6818		 * An EPT misconfiguration is triggered if the PTE is writable
6819		 * but not readable (WR=10). This is avoided by setting PG_A
6820		 * and PG_M simultaneously.
6821		 */
6822		*pte |= PG_M | PG_A;
6823	} else {
6824		*pte |= PG_A;
6825	}
6826
6827	/* try to promote the mapping */
6828	if (va < VM_MAXUSER_ADDRESS)
6829		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
6830	else
6831		mpte = NULL;
6832
6833	m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
6834
6835	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
6836	    pmap_ps_enabled(pmap) &&
6837	    (m->flags & PG_FICTITIOUS) == 0 &&
6838	    vm_reserv_level_iffullpop(m) == 0) {
6839		if (!pv_lists_locked) {
6840			pv_lists_locked = TRUE;
6841			if (!rw_try_rlock(&pvh_global_lock)) {
6842				PMAP_UNLOCK(pmap);
6843				rw_rlock(&pvh_global_lock);
6844				goto retry;
6845			}
6846		}
6847		pmap_promote_pde(pmap, pde, va, &lock);
6848#ifdef INVARIANTS
6849		atomic_add_long(&ad_emulation_superpage_promotions, 1);
6850#endif
6851	}
6852#ifdef INVARIANTS
6853	if (ftype == VM_PROT_WRITE)
6854		atomic_add_long(&num_dirty_emulations, 1);
6855	else
6856		atomic_add_long(&num_accessed_emulations, 1);
6857#endif
6858	rv = 0;		/* success */
6859done:
6860	if (lock != NULL)
6861		rw_wunlock(lock);
6862	if (pv_lists_locked)
6863		rw_runlock(&pvh_global_lock);
6864	PMAP_UNLOCK(pmap);
6865	return (rv);
6866}
6867
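/*
 * Copy the page table entries that translate the given virtual address
 * (PML4E, PDPE, PDE, and PTE, in that order) into ptr[] and report the
 * number of entries copied through *num.  The walk stops early at an
 * invalid entry or at a large-page (PG_PS) entry.
 */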
6868void
6869pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
6870{
6871	pml4_entry_t *pml4;
6872	pdp_entry_t *pdp;
6873	pd_entry_t *pde;
6874	pt_entry_t *pte, PG_V;
6875	int idx;
6876
6877	idx = 0;
6878	PG_V = pmap_valid_bit(pmap);
6879	PMAP_LOCK(pmap);
6880
6881	pml4 = pmap_pml4e(pmap, va);
6882	ptr[idx++] = *pml4;
6883	if ((*pml4 & PG_V) == 0)
6884		goto done;
6885
6886	pdp = pmap_pml4e_to_pdpe(pml4, va);
6887	ptr[idx++] = *pdp;
6888	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
6889		goto done;
6890
6891	pde = pmap_pdpe_to_pde(pdp, va);
6892	ptr[idx++] = *pde;
6893	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
6894		goto done;
6895
6896	pte = pmap_pde_to_pte(pde, va);
6897	ptr[idx++] = *pte;
6898
6899done:
6900	PMAP_UNLOCK(pmap);
6901	*num = idx;
6902}
6903
6904#include "opt_ddb.h"
6905#ifdef DDB
6906#include <ddb/ddb.h>
6907
6908DB_SHOW_COMMAND(pte, pmap_print_pte)
6909{
6910	pmap_t pmap;
6911	pml4_entry_t *pml4;
6912	pdp_entry_t *pdp;
6913	pd_entry_t *pde;
6914	pt_entry_t *pte, PG_V;
6915	vm_offset_t va;
6916
6917	if (have_addr) {
6918		va = (vm_offset_t)addr;
6919		pmap = PCPU_GET(curpmap); /* XXX */
6920	} else {
6921		db_printf("show pte addr\n");
6922		return;
6923	}
6924	PG_V = pmap_valid_bit(pmap);
6925	pml4 = pmap_pml4e(pmap, va);
6926	db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
6927	if ((*pml4 & PG_V) == 0) {
6928		db_printf("\n");
6929		return;
6930	}
6931	pdp = pmap_pml4e_to_pdpe(pml4, va);
6932	db_printf(" pdpe %#016lx", *pdp);
6933	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
6934		db_printf("\n");
6935		return;
6936	}
6937	pde = pmap_pdpe_to_pde(pdp, va);
6938	db_printf(" pde %#016lx", *pde);
6939	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
6940		db_printf("\n");
6941		return;
6942	}
6943	pte = pmap_pde_to_pte(pde, va);
6944	db_printf(" pte %#016lx\n", *pte);
6945}
6946
6947DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
6948{
6949	vm_paddr_t a;
6950
6951	if (have_addr) {
6952		a = (vm_paddr_t)addr;
6953		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
6954	} else {
6955		db_printf("show phys2dmap addr\n");
6956	}
6957}
6958#endif
6959