pmap.c revision 263875
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * the Systems Programming Group of the University of Utah Computer
15 * Science Department and William Jolitz of UUNET Technologies Inc.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 *    must display the following acknowledgement:
27 *	This product includes software developed by the University of
28 *	California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 *    may be used to endorse or promote products derived from this software
31 *    without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * SUCH DAMAGE.
44 *
45 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
46 */
47/*-
48 * Copyright (c) 2003 Networks Associates Technology, Inc.
49 * All rights reserved.
50 *
51 * This software was developed for the FreeBSD Project by Jake Burkholder,
52 * Safeport Network Services, and Network Associates Laboratories, the
53 * Security Research Division of Network Associates, Inc. under
54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55 * CHATS research program.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 *    notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 *    notice, this list of conditions and the following disclaimer in the
64 *    documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 */
78
79#define	AMD64_NPT_AWARE
80
81#include <sys/cdefs.h>
82__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/pmap.c 263875 2014-03-28 15:38:38Z kib $");
83
84/*
85 *	Manages physical address maps.
86 *
87 *	Since the information managed by this module is
88 *	also stored by the logical address mapping module,
89 *	this module may throw away valid virtual-to-physical
90 *	mappings at almost any time.  However, invalidations
91 *	of virtual-to-physical mappings must be done as
92 *	requested.
93 *
94 *	In order to cope with hardware architectures which
95 *	make virtual-to-physical map invalidates expensive,
96 *	this module may delay invalidation or reduced-protection
97 *	operations until such time as they are actually
98 *	necessary.  This module is given full information as
99 *	to which processors are currently using which maps,
100 *	and as to when physical maps must be made correct.
101 */
102
103#include "opt_pmap.h"
104#include "opt_vm.h"
105
106#include <sys/param.h>
107#include <sys/bus.h>
108#include <sys/systm.h>
109#include <sys/kernel.h>
110#include <sys/ktr.h>
111#include <sys/lock.h>
112#include <sys/malloc.h>
113#include <sys/mman.h>
114#include <sys/mutex.h>
115#include <sys/proc.h>
116#include <sys/rwlock.h>
117#include <sys/sx.h>
118#include <sys/vmmeter.h>
119#include <sys/sched.h>
120#include <sys/sysctl.h>
121#include <sys/_unrhdr.h>
122#include <sys/smp.h>
123
124#include <vm/vm.h>
125#include <vm/vm_param.h>
126#include <vm/vm_kern.h>
127#include <vm/vm_page.h>
128#include <vm/vm_map.h>
129#include <vm/vm_object.h>
130#include <vm/vm_extern.h>
131#include <vm/vm_pageout.h>
132#include <vm/vm_pager.h>
133#include <vm/vm_radix.h>
134#include <vm/vm_reserv.h>
135#include <vm/uma.h>
136
137#include <machine/intr_machdep.h>
138#include <machine/apicvar.h>
139#include <machine/cpu.h>
140#include <machine/cputypes.h>
141#include <machine/md_var.h>
142#include <machine/pcb.h>
143#include <machine/specialreg.h>
144#ifdef SMP
145#include <machine/smp.h>
146#endif
147
148static __inline boolean_t
149pmap_emulate_ad_bits(pmap_t pmap)
150{
151
152	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
153}
154
155static __inline pt_entry_t
156pmap_valid_bit(pmap_t pmap)
157{
158	pt_entry_t mask;
159
160	switch (pmap->pm_type) {
161	case PT_X86:
162		mask = X86_PG_V;
163		break;
164	case PT_EPT:
165		if (pmap_emulate_ad_bits(pmap))
166			mask = EPT_PG_EMUL_V;
167		else
168			mask = EPT_PG_READ;
169		break;
170	default:
171		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
172	}
173
174	return (mask);
175}
176
177static __inline pt_entry_t
178pmap_rw_bit(pmap_t pmap)
179{
180	pt_entry_t mask;
181
182	switch (pmap->pm_type) {
183	case PT_X86:
184		mask = X86_PG_RW;
185		break;
186	case PT_EPT:
187		if (pmap_emulate_ad_bits(pmap))
188			mask = EPT_PG_EMUL_RW;
189		else
190			mask = EPT_PG_WRITE;
191		break;
192	default:
193		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
194	}
195
196	return (mask);
197}
198
199static __inline pt_entry_t
200pmap_global_bit(pmap_t pmap)
201{
202	pt_entry_t mask;
203
204	switch (pmap->pm_type) {
205	case PT_X86:
206		mask = X86_PG_G;
207		break;
208	case PT_EPT:
209		mask = 0;
210		break;
211	default:
212		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
213	}
214
215	return (mask);
216}
217
218static __inline pt_entry_t
219pmap_accessed_bit(pmap_t pmap)
220{
221	pt_entry_t mask;
222
223	switch (pmap->pm_type) {
224	case PT_X86:
225		mask = X86_PG_A;
226		break;
227	case PT_EPT:
228		if (pmap_emulate_ad_bits(pmap))
229			mask = EPT_PG_READ;
230		else
231			mask = EPT_PG_A;
232		break;
233	default:
234		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
235	}
236
237	return (mask);
238}
239
240static __inline pt_entry_t
241pmap_modified_bit(pmap_t pmap)
242{
243	pt_entry_t mask;
244
245	switch (pmap->pm_type) {
246	case PT_X86:
247		mask = X86_PG_M;
248		break;
249	case PT_EPT:
250		if (pmap_emulate_ad_bits(pmap))
251			mask = EPT_PG_WRITE;
252		else
253			mask = EPT_PG_M;
254		break;
255	default:
256		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
257	}
258
259	return (mask);
260}
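
/*
 * In summary, the helpers above select the page-table bits for a given
 * pmap type: a PT_X86 pmap uses the architectural X86_PG_* bits; a
 * PT_EPT pmap without A/D-bit emulation uses EPT_PG_READ/EPT_PG_WRITE
 * for valid/writable and EPT_PG_A/EPT_PG_M for accessed/modified; and
 * with A/D-bit emulation the software-defined EPT_PG_EMUL_V and
 * EPT_PG_EMUL_RW bits stand in for valid/writable while the hardware
 * read and write permission bits double as the accessed and modified
 * indicators.  EPT has no counterpart to the global bit, so
 * pmap_global_bit() returns 0 for it.
 */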
261
262#if !defined(DIAGNOSTIC)
263#ifdef __GNUC_GNU_INLINE__
264#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
265#else
266#define PMAP_INLINE	extern inline
267#endif
268#else
269#define PMAP_INLINE
270#endif
271
272#ifdef PV_STATS
273#define PV_STAT(x)	do { x ; } while (0)
274#else
275#define PV_STAT(x)	do { } while (0)
276#endif
277
278#define	pa_index(pa)	((pa) >> PDRSHIFT)
279#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
280
281#define	NPV_LIST_LOCKS	MAXCPU
282
283#define	PHYS_TO_PV_LIST_LOCK(pa)	\
284			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
285
286#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
287	struct rwlock **_lockp = (lockp);		\
288	struct rwlock *_new_lock;			\
289							\
290	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
291	if (_new_lock != *_lockp) {			\
292		if (*_lockp != NULL)			\
293			rw_wunlock(*_lockp);		\
294		*_lockp = _new_lock;			\
295		rw_wlock(*_lockp);			\
296	}						\
297} while (0)
298
299#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
300			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
301
302#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
303	struct rwlock **_lockp = (lockp);		\
304							\
305	if (*_lockp != NULL) {				\
306		rw_wunlock(*_lockp);			\
307		*_lockp = NULL;				\
308	}						\
309} while (0)
310
311#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
312			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
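
/*
 * Illustrative use of the pv list lock macros above (an editorial
 * sketch, not copied from any one caller; "m" and "lock" are
 * placeholders): a function that may touch the pv lists of several
 * pages keeps a single lock cursor, switches it as it moves between
 * pages, and drops it before sleeping or returning:
 *
 *	struct rwlock *lock;
 *
 *	lock = NULL;
 *	...
 *	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 *	... manipulate m's pv list ...
 *	RELEASE_PV_LIST_LOCK(&lock);
 *
 * The modulo hashing in PHYS_TO_PV_LIST_LOCK() means distinct pages may
 * share a lock; that is harmless because the macros only switch locks
 * when the computed lock actually differs from the one already held.
 */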
313
314struct pmap kernel_pmap_store;
315
316vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
317vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
318
319int nkpt;
320SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
321    "Number of kernel page table pages allocated on bootup");
322
323static int ndmpdp;
324vm_paddr_t dmaplimit;
325vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
326pt_entry_t pg_nx;
327
328static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
329
330static int pat_works = 1;
331SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
332    "Is page attribute table fully functional?");
333
334static int pg_ps_enabled = 1;
335SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
336    "Are large page mappings enabled?");
337
338#define	PAT_INDEX_SIZE	8
339static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
340
341static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
342static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
343u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
344u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
345
346static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
347static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
348static int		ndmpdpphys;	/* number of DMPDPphys pages */
349
350static struct rwlock_padalign pvh_global_lock;
351
352/*
353 * Data for the pv entry allocation mechanism
354 */
355static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
356static struct mtx pv_chunks_mutex;
357static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
358static struct md_page *pv_table;
359
360/*
361 * All those kernel PT submaps that BSD is so fond of
362 */
363pt_entry_t *CMAP1 = 0;
364caddr_t CADDR1 = 0;
365
366static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
367
368static struct unrhdr pcid_unr;
369static struct mtx pcid_mtx;
370int pmap_pcid_enabled = 0;
371SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
372    0, "Is TLB Context ID enabled?");
373int invpcid_works = 0;
374SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
375    "Is the invpcid instruction available?");
376
377static int
378pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
379{
380	int i;
381	uint64_t res;
382
383	res = 0;
384	CPU_FOREACH(i) {
385		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
386	}
387	return (sysctl_handle_64(oidp, &res, 0, req));
388}
389SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
390    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
391    "Count of saved TLB context on switch");
392
393/*
394 * Crashdump maps.
395 */
396static caddr_t crashdumpmap;
397
398static void	free_pv_chunk(struct pv_chunk *pc);
399static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
400static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
401static int	popcnt_pc_map_elem(uint64_t elem);
402static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
403static void	reserve_pv_entries(pmap_t pmap, int needed,
404		    struct rwlock **lockp);
405static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
406		    struct rwlock **lockp);
407static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
408		    struct rwlock **lockp);
409static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
410		    struct rwlock **lockp);
411static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
412static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
413		    vm_offset_t va);
414
415static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
416static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
417static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
418    vm_offset_t va, struct rwlock **lockp);
419static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
420    vm_offset_t va);
421static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
422    vm_prot_t prot, struct rwlock **lockp);
423static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
424    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
425static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
426static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
427static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
428static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
429static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
430static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
431    struct rwlock **lockp);
432static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
433    vm_prot_t prot);
434static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
435static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
436    struct spglist *free, struct rwlock **lockp);
437static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
438    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
439static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
440static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
441    struct spglist *free);
442static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
443    vm_page_t m, struct rwlock **lockp);
444static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
445    pd_entry_t newpde);
446static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
447
448static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
449		struct rwlock **lockp);
450static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
451		struct rwlock **lockp);
452static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
453		struct rwlock **lockp);
454
455static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
456    struct spglist *free);
457static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
458static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
459
460/*
461 * Move the kernel virtual free pointer to the next
462 * 2MB boundary.  This helps improve performance
463 * by allowing a large (2MB) page to map much of the kernel
464 * (.text, .data, .bss).
465 */
466static vm_offset_t
467pmap_kmem_choose(vm_offset_t addr)
468{
469	vm_offset_t newaddr = addr;
470
471	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
472	return (newaddr);
473}
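
/*
 * For example (hypothetical addresses), with NBPDR == 2MB an addr of
 * 0xffffffff805e4000 is rounded up to 0xffffffff80600000, while an
 * already 2MB-aligned address is returned unchanged.
 */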
474
475/********************/
476/* Inline functions */
477/********************/
478
479/* Return a non-clipped PD index for a given VA */
480static __inline vm_pindex_t
481pmap_pde_pindex(vm_offset_t va)
482{
483	return (va >> PDRSHIFT);
484}
485
486
487/* Return various clipped indexes for a given VA */
488static __inline vm_pindex_t
489pmap_pte_index(vm_offset_t va)
490{
491
492	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
493}
494
495static __inline vm_pindex_t
496pmap_pde_index(vm_offset_t va)
497{
498
499	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
500}
501
502static __inline vm_pindex_t
503pmap_pdpe_index(vm_offset_t va)
504{
505
506	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
507}
508
509static __inline vm_pindex_t
510pmap_pml4e_index(vm_offset_t va)
511{
512
513	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
514}
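
/*
 * For reference, a 48-bit canonical virtual address decomposes as
 * follows (matching the shift and mask arithmetic above):
 *
 *	bits 47-39	PML4 index	(pmap_pml4e_index())
 *	bits 38-30	PDP index	(pmap_pdpe_index())
 *	bits 29-21	PD index	(pmap_pde_index())
 *	bits 20-12	PT index	(pmap_pte_index())
 *	bits 11-0	page offset
 *
 * Each index selects one of the 512 entries in a page table page.
 */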
515
516/* Return a pointer to the PML4 slot that corresponds to a VA */
517static __inline pml4_entry_t *
518pmap_pml4e(pmap_t pmap, vm_offset_t va)
519{
520
521	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
522}
523
524/* Return a pointer to the PDP slot that corresponds to a VA */
525static __inline pdp_entry_t *
526pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
527{
528	pdp_entry_t *pdpe;
529
530	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
531	return (&pdpe[pmap_pdpe_index(va)]);
532}
533
534/* Return a pointer to the PDP slot that corresponds to a VA */
535static __inline pdp_entry_t *
536pmap_pdpe(pmap_t pmap, vm_offset_t va)
537{
538	pml4_entry_t *pml4e;
539	pt_entry_t PG_V;
540
541	PG_V = pmap_valid_bit(pmap);
542	pml4e = pmap_pml4e(pmap, va);
543	if ((*pml4e & PG_V) == 0)
544		return (NULL);
545	return (pmap_pml4e_to_pdpe(pml4e, va));
546}
547
548/* Return a pointer to the PD slot that corresponds to a VA */
549static __inline pd_entry_t *
550pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
551{
552	pd_entry_t *pde;
553
554	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
555	return (&pde[pmap_pde_index(va)]);
556}
557
558/* Return a pointer to the PD slot that corresponds to a VA */
559static __inline pd_entry_t *
560pmap_pde(pmap_t pmap, vm_offset_t va)
561{
562	pdp_entry_t *pdpe;
563	pt_entry_t PG_V;
564
565	PG_V = pmap_valid_bit(pmap);
566	pdpe = pmap_pdpe(pmap, va);
567	if (pdpe == NULL || (*pdpe & PG_V) == 0)
568		return (NULL);
569	return (pmap_pdpe_to_pde(pdpe, va));
570}
571
572/* Return a pointer to the PT slot that corresponds to a VA */
573static __inline pt_entry_t *
574pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
575{
576	pt_entry_t *pte;
577
578	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
579	return (&pte[pmap_pte_index(va)]);
580}
581
582/* Return a pointer to the PT slot that corresponds to a VA */
583static __inline pt_entry_t *
584pmap_pte(pmap_t pmap, vm_offset_t va)
585{
586	pd_entry_t *pde;
587	pt_entry_t PG_V;
588
589	PG_V = pmap_valid_bit(pmap);
590	pde = pmap_pde(pmap, va);
591	if (pde == NULL || (*pde & PG_V) == 0)
592		return (NULL);
593	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
594		return ((pt_entry_t *)pde);
595	return (pmap_pde_to_pte(pde, va));
596}
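
/*
 * Illustrative use of the walkers above (an editorial sketch;
 * pmap_extract() later in this file performs a similar, more complete
 * walk that also handles 1GB mappings).  Because pmap_pte() returns the
 * PDE itself for a valid 2MB mapping, a caller must test PG_PS before
 * treating the result as a 4KB PTE:
 *
 *	pt_entry_t *pte, PG_V = pmap_valid_bit(pmap);
 *	vm_paddr_t pa = 0;
 *
 *	pte = pmap_pte(pmap, va);
 *	if (pte != NULL && (*pte & PG_V) != 0) {
 *		if ((*pte & PG_PS) != 0)
 *			pa = (*pte & PG_PS_FRAME) | (va & PDRMASK);
 *		else
 *			pa = (*pte & PG_FRAME) | (va & PAGE_MASK);
 *	}
 */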
597
598static __inline void
599pmap_resident_count_inc(pmap_t pmap, int count)
600{
601
602	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
603	pmap->pm_stats.resident_count += count;
604}
605
606static __inline void
607pmap_resident_count_dec(pmap_t pmap, int count)
608{
609
610	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
611	KASSERT(pmap->pm_stats.resident_count >= count,
612	    ("pmap %p resident count underflow %ld %d", pmap,
613	    pmap->pm_stats.resident_count, count));
614	pmap->pm_stats.resident_count -= count;
615}
616
617PMAP_INLINE pt_entry_t *
618vtopte(vm_offset_t va)
619{
620	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
621
622	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
623
624	return (PTmap + ((va >> PAGE_SHIFT) & mask));
625}
626
627static __inline pd_entry_t *
628vtopde(vm_offset_t va)
629{
630	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
631
632	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
633
634	return (PDmap + ((va >> PDRSHIFT) & mask));
635}
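
/*
 * vtopte() and vtopde() rely on the recursive PML4 slot (PML4PML4I)
 * installed by create_pagetables() below: with the page-table pages
 * mapped into their own hierarchy, the PTE (or PDE) for any kernel VA
 * sits at a fixed offset from PTmap (or PDmap).  The masks keep the
 * four 9-bit table indexes (36 bits) for vtopte() and the upper three
 * (27 bits) for vtopde().
 */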
636
637static u_int64_t
638allocpages(vm_paddr_t *firstaddr, int n)
639{
640	u_int64_t ret;
641
642	ret = *firstaddr;
643	bzero((void *)ret, n * PAGE_SIZE);
644	*firstaddr += n * PAGE_SIZE;
645	return (ret);
646}
647
648CTASSERT(powerof2(NDMPML4E));
649
650/* number of kernel PDP slots */
651#define	NKPDPE(ptpgs)		howmany((ptpgs), NPDEPG)
652
653static void
654nkpt_init(vm_paddr_t addr)
655{
656	int pt_pages;
657
658#ifdef NKPT
659	pt_pages = NKPT;
660#else
661	pt_pages = howmany(addr, 1 << PDRSHIFT);
662	pt_pages += NKPDPE(pt_pages);
663
664	/*
665	 * Add some slop beyond the bare minimum required for bootstrapping
666	 * the kernel.
667	 *
668	 * This is quite important when allocating KVA for kernel modules.
669	 * The modules are required to be linked in the negative 2GB of
670	 * the address space.  If we run out of KVA in this region then
671	 * pmap_growkernel() will need to allocate page table pages to map
672	 * the entire 512GB of KVA space which is an unnecessary tax on
673	 * physical memory.
674	 */
675	pt_pages += 8;		/* 16MB additional slop for kernel modules */
676#endif
677	nkpt = pt_pages;
678}
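
/*
 * A worked example for the non-NKPT path, using a hypothetical addr of
 * 64MB for the bootstrap allocations: pt_pages = howmany(64MB, 2MB) =
 * 32, plus NKPDPE(32) = 1 page-directory page, plus the 8 pages of
 * slop, giving nkpt = 41 (enough page table pages to map roughly 82MB
 * of initial KVA).
 */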
679
680static void
681create_pagetables(vm_paddr_t *firstaddr)
682{
683	int i, j, ndm1g, nkpdpe;
684	pt_entry_t *pt_p;
685	pd_entry_t *pd_p;
686	pdp_entry_t *pdp_p;
687	pml4_entry_t *p4_p;
688
689	/* Allocate page table pages for the direct map */
690	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
691	if (ndmpdp < 4)		/* Minimum 4GB of direct map */
692		ndmpdp = 4;
693	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
694	if (ndmpdpphys > NDMPML4E) {
695		/*
696		 * Each NDMPML4E allows 512 GB, so limit to that,
697		 * and then readjust ndmpdp and ndmpdpphys.
698		 */
699		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
700		Maxmem = atop(NDMPML4E * NBPML4);
701		ndmpdpphys = NDMPML4E;
702		ndmpdp = NDMPML4E * NPDEPG;
703	}
704	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
705	ndm1g = 0;
706	if ((amd_feature & AMDID_PAGE1GB) != 0)
707		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
708	if (ndm1g < ndmpdp)
709		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
710	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
711
712	/* Allocate pages */
713	KPML4phys = allocpages(firstaddr, 1);
714	KPDPphys = allocpages(firstaddr, NKPML4E);
715
716	/*
717	 * Allocate the initial number of kernel page table pages required to
718	 * bootstrap.  We defer this until after all memory-size dependent
719	 * allocations are done (e.g. direct map), so that we don't have to
720	 * build in too much slop in our estimate.
721	 *
722	 * Note that when NKPML4E > 1, we have an empty page underneath
723	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
724	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
725	 */
726	nkpt_init(*firstaddr);
727	nkpdpe = NKPDPE(nkpt);
728
729	KPTphys = allocpages(firstaddr, nkpt);
730	KPDphys = allocpages(firstaddr, nkpdpe);
731
732	/* Fill in the underlying page table pages */
733	/* Nominally read-only (but really R/W) from zero to physfree */
734	/* XXX not fully used, underneath 2M pages */
735	pt_p = (pt_entry_t *)KPTphys;
736	for (i = 0; ptoa(i) < *firstaddr; i++)
737		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
738
739	/* Now map the page tables at their location within PTmap */
740	pd_p = (pd_entry_t *)KPDphys;
741	for (i = 0; i < nkpt; i++)
742		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
743
744	/* Map from zero to end of allocations under 2M pages */
745	/* This replaces some of the KPTphys entries above */
746	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
747		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
748		    X86_PG_G;
749
750	/* And connect up the PD to the PDP (leaving room for L4 pages) */
751	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
752	for (i = 0; i < nkpdpe; i++)
753		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
754		    PG_U;
755
756	/*
757	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
758	 * the end of physical memory is not aligned to a 1GB page boundary,
759	 * then the residual physical memory is mapped with 2MB pages.  Later,
760	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
761	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
762	 * that are partially used.
763	 */
764	pd_p = (pd_entry_t *)DMPDphys;
765	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
766		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
767		/* Preset PG_M and PG_A because demotion expects it. */
768		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
769		    X86_PG_M | X86_PG_A;
770	}
771	pdp_p = (pdp_entry_t *)DMPDPphys;
772	for (i = 0; i < ndm1g; i++) {
773		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
774		/* Preset PG_M and PG_A because demotion expects it. */
775		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
776		    X86_PG_M | X86_PG_A;
777	}
778	for (j = 0; i < ndmpdp; i++, j++) {
779		pdp_p[i] = DMPDphys + ptoa(j);
780		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
781	}
782
783	/* And recursively map PML4 to itself in order to get PTmap */
784	p4_p = (pml4_entry_t *)KPML4phys;
785	p4_p[PML4PML4I] = KPML4phys;
786	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;
787
788	/* Connect the Direct Map slot(s) up to the PML4. */
789	for (i = 0; i < ndmpdpphys; i++) {
790		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
791		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
792	}
793
794	/* Connect the KVA slots up to the PML4 */
795	for (i = 0; i < NKPML4E; i++) {
796		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
797		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
798	}
799}
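
/*
 * On return, the bootstrap PML4 at KPML4phys is laid out as follows:
 * slot PML4PML4I maps the page-table hierarchy recursively (providing
 * PTmap and PDmap), slots DMPML4I through DMPML4I + ndmpdpphys - 1 map
 * the direct map, and slots KPML4BASE through KPML4BASE + NKPML4E - 1
 * map the kernel virtual address space, including the region at
 * KERNBASE.
 */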
800
801/*
802 *	Bootstrap the system enough to run with virtual memory.
803 *
804 *	On amd64 this is called after mapping has already been enabled
805 *	and just syncs the pmap module with what has already been done.
806 *	[We can't call it easily with mapping off since the kernel is not
807 *	mapped with PA == VA, hence we would have to relocate every address
808 *	from the linked base (virtual) address "KERNBASE" to the actual
809 *	(physical) address starting relative to 0]
810 */
811void
812pmap_bootstrap(vm_paddr_t *firstaddr)
813{
814	vm_offset_t va;
815	pt_entry_t *pte, *unused;
816
817	/*
818	 * Create an initial set of page tables to run the kernel in.
819	 */
820	create_pagetables(firstaddr);
821
822	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
823	virtual_avail = pmap_kmem_choose(virtual_avail);
824
825	virtual_end = VM_MAX_KERNEL_ADDRESS;
826
827
828	/* XXX do %cr0 as well */
829	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
830	load_cr3(KPML4phys);
831	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
832		load_cr4(rcr4() | CR4_SMEP);
833
834	/*
835	 * Initialize the kernel pmap (which is statically allocated).
836	 */
837	PMAP_LOCK_INIT(kernel_pmap);
838	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
839	kernel_pmap->pm_cr3 = KPML4phys;
840	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
841	CPU_ZERO(&kernel_pmap->pm_save);
842	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
843	kernel_pmap->pm_flags = pmap_flags;
844
845 	/*
846	 * Initialize the global pv list lock.
847	 */
848	rw_init(&pvh_global_lock, "pmap pv global");
849
850	/*
851	 * Reserve some special page table entries/VA space for temporary
852	 * mapping of pages.
853	 */
854#define	SYSMAP(c, p, v, n)	\
855	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
856
857	va = virtual_avail;
858	pte = vtopte(va);
859
860	/*
861	 * CMAP1 is only used for the memory test.
862	 */
863	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
864
865	/*
866	 * Crashdump maps.
867	 */
868	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
869
870	virtual_avail = va;
871
872	/* Initialize the PAT MSR. */
873	pmap_init_pat();
874
875	/* Initialize TLB Context Id. */
876	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
877	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
878		load_cr4(rcr4() | CR4_PCIDE);
879		mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF);
880		init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx);
881		/* Check for INVPCID support */
882		invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
883		    != 0;
884		kernel_pmap->pm_pcid = 0;
885#ifndef SMP
886		pmap_pcid_enabled = 0;
887#endif
888	} else
889		pmap_pcid_enabled = 0;
890}
891
892/*
893 * Setup the PAT MSR.
894 */
895void
896pmap_init_pat(void)
897{
898	int pat_table[PAT_INDEX_SIZE];
899	uint64_t pat_msr;
900	u_long cr0, cr4;
901	int i;
902
903	/* Bail if this CPU doesn't implement PAT. */
904	if ((cpu_feature & CPUID_PAT) == 0)
905		panic("no PAT??");
906
907	/* Set default PAT index table. */
908	for (i = 0; i < PAT_INDEX_SIZE; i++)
909		pat_table[i] = -1;
910	pat_table[PAT_WRITE_BACK] = 0;
911	pat_table[PAT_WRITE_THROUGH] = 1;
912	pat_table[PAT_UNCACHEABLE] = 3;
913	pat_table[PAT_WRITE_COMBINING] = 3;
914	pat_table[PAT_WRITE_PROTECTED] = 3;
915	pat_table[PAT_UNCACHED] = 3;
916
917	/* Initialize default PAT entries. */
918	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
919	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
920	    PAT_VALUE(2, PAT_UNCACHED) |
921	    PAT_VALUE(3, PAT_UNCACHEABLE) |
922	    PAT_VALUE(4, PAT_WRITE_BACK) |
923	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
924	    PAT_VALUE(6, PAT_UNCACHED) |
925	    PAT_VALUE(7, PAT_UNCACHEABLE);
926
927	if (pat_works) {
928		/*
929		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
930		 * Program 5 and 6 as WP and WC.
931		 * Leave 4 and 7 as WB and UC.
932		 */
933		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
934		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
935		    PAT_VALUE(6, PAT_WRITE_COMBINING);
936		pat_table[PAT_UNCACHED] = 2;
937		pat_table[PAT_WRITE_PROTECTED] = 5;
938		pat_table[PAT_WRITE_COMBINING] = 6;
939	} else {
940		/*
941		 * Just replace PAT Index 2 with WC instead of UC-.
942		 */
943		pat_msr &= ~PAT_MASK(2);
944		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
945		pat_table[PAT_WRITE_COMBINING] = 2;
946	}
947
948	/* Disable PGE. */
949	cr4 = rcr4();
950	load_cr4(cr4 & ~CR4_PGE);
951
952	/* Disable caches (CD = 1, NW = 0). */
953	cr0 = rcr0();
954	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
955
956	/* Flushes caches and TLBs. */
957	wbinvd();
958	invltlb();
959
960	/* Update PAT and index table. */
961	wrmsr(MSR_PAT, pat_msr);
962	for (i = 0; i < PAT_INDEX_SIZE; i++)
963		pat_index[i] = pat_table[i];
964
965	/* Flush caches and TLBs again. */
966	wbinvd();
967	invltlb();
968
969	/* Restore caches and PGE. */
970	load_cr0(cr0);
971	load_cr4(cr4);
972}
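
/*
 * With pat_works != 0, the PAT MSR programmed above ends up as:
 *
 *	PAT index:	0   1   2    3   4   5   6   7
 *	memory type:	WB  WT  UC-  UC  WB  WP  WC  UC
 *
 * and pat_index[] maps each PAT_* caching mode to the matching entry
 * (e.g. PAT_WRITE_COMBINING -> 6).  On the pat_works == 0 path only
 * index 2 is repurposed, from UC- to WC.
 */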
973
974/*
975 *	Initialize a vm_page's machine-dependent fields.
976 */
977void
978pmap_page_init(vm_page_t m)
979{
980
981	TAILQ_INIT(&m->md.pv_list);
982	m->md.pat_mode = PAT_WRITE_BACK;
983}
984
985/*
986 *	Initialize the pmap module.
987 *	Called by vm_init, to initialize any structures that the pmap
988 *	system needs to map virtual memory.
989 */
990void
991pmap_init(void)
992{
993	vm_page_t mpte;
994	vm_size_t s;
995	int i, pv_npg;
996
997	/*
998	 * Initialize the vm page array entries for the kernel pmap's
999	 * page table pages.
1000	 */
1001	for (i = 0; i < nkpt; i++) {
1002		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
1003		KASSERT(mpte >= vm_page_array &&
1004		    mpte < &vm_page_array[vm_page_array_size],
1005		    ("pmap_init: page table page is out of range"));
1006		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
1007		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
1008	}
1009
1010	/*
1011	 * If the kernel is running in a virtual machine on an AMD Family 10h
1012	 * processor, then it must assume that MCA is enabled by the virtual
1013	 * machine monitor.
1014	 */
1015	if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
1016	    CPUID_TO_FAMILY(cpu_id) == 0x10)
1017		workaround_erratum383 = 1;
1018
1019	/*
1020	 * Are large page mappings enabled?
1021	 */
1022	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
1023	if (pg_ps_enabled) {
1024		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1025		    ("pmap_init: can't assign to pagesizes[1]"));
1026		pagesizes[1] = NBPDR;
1027	}
1028
1029	/*
1030	 * Initialize the pv chunk list mutex.
1031	 */
1032	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1033
1034	/*
1035	 * Initialize the pool of pv list locks.
1036	 */
1037	for (i = 0; i < NPV_LIST_LOCKS; i++)
1038		rw_init(&pv_list_locks[i], "pmap pv list");
1039
1040	/*
1041	 * Calculate the size of the pv head table for superpages.
1042	 */
1043	for (i = 0; phys_avail[i + 1]; i += 2);
1044	pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
1045
1046	/*
1047	 * Allocate memory for the pv head table for superpages.
1048	 */
1049	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1050	s = round_page(s);
1051	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
1052	    M_WAITOK | M_ZERO);
1053	for (i = 0; i < pv_npg; i++)
1054		TAILQ_INIT(&pv_table[i].pv_list);
1055}
1056
1057static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
1058    "2MB page mapping counters");
1059
1060static u_long pmap_pde_demotions;
1061SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
1062    &pmap_pde_demotions, 0, "2MB page demotions");
1063
1064static u_long pmap_pde_mappings;
1065SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
1066    &pmap_pde_mappings, 0, "2MB page mappings");
1067
1068static u_long pmap_pde_p_failures;
1069SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
1070    &pmap_pde_p_failures, 0, "2MB page promotion failures");
1071
1072static u_long pmap_pde_promotions;
1073SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
1074    &pmap_pde_promotions, 0, "2MB page promotions");
1075
1076static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
1077    "1GB page mapping counters");
1078
1079static u_long pmap_pdpe_demotions;
1080SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
1081    &pmap_pdpe_demotions, 0, "1GB page demotions");
1082
1083/***************************************************
1084 * Low level helper routines.....
1085 ***************************************************/
1086
1087static pt_entry_t
1088pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
1089{
1090	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
1091
1092	switch (pmap->pm_type) {
1093	case PT_X86:
1094		/* Verify that both PAT bits are not set at the same time */
1095		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
1096		    ("Invalid PAT bits in entry %#lx", entry));
1097
1098		/* Swap the PAT bits if one of them is set */
1099		if ((entry & x86_pat_bits) != 0)
1100			entry ^= x86_pat_bits;
1101		break;
1102	case PT_EPT:
1103		/*
1104		 * Nothing to do - the memory attributes are represented
1105		 * the same way for regular pages and superpages.
1106		 */
1107		break;
1108	default:
1109		panic("pmap_swap_pat: bad pm_type %d", pmap->pm_type);
1110	}
1111
1112	return (entry);
1113}
1114
1115/*
1116 * Determine the appropriate bits to set in a PTE or PDE for a specified
1117 * caching mode.
1118 */
1119static int
1120pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
1121{
1122	int cache_bits, pat_flag, pat_idx;
1123
1124	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
1125		panic("Unknown caching mode %d\n", mode);
1126
1127	switch (pmap->pm_type) {
1128	case PT_X86:
1129		/* The PAT bit is different for PTE's and PDE's. */
1130		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
1131
1132		/* Map the caching mode to a PAT index. */
1133		pat_idx = pat_index[mode];
1134
1135		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
1136		cache_bits = 0;
1137		if (pat_idx & 0x4)
1138			cache_bits |= pat_flag;
1139		if (pat_idx & 0x2)
1140			cache_bits |= PG_NC_PCD;
1141		if (pat_idx & 0x1)
1142			cache_bits |= PG_NC_PWT;
1143		break;
1144
1145	case PT_EPT:
1146		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
1147		break;
1148
1149	default:
1150		panic("unsupported pmap type %d", pmap->pm_type);
1151	}
1152
1153	return (cache_bits);
1154}
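
/*
 * A worked example, assuming the pat_works PAT layout set up by
 * pmap_init_pat(): for a PT_X86 4KB mapping with mode ==
 * PAT_WRITE_COMBINING, pat_idx is 6 (binary 110), so cache_bits is
 * X86_PG_PTE_PAT | PG_NC_PCD with PG_NC_PWT left clear.  For PT_EPT the
 * mode is instead encoded directly in the entry via
 * EPT_PG_MEMORY_TYPE() with EPT_PG_IGNORE_PAT set.
 */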
1155
1156static int
1157pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
1158{
1159	int mask;
1160
1161	switch (pmap->pm_type) {
1162	case PT_X86:
1163		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
1164		break;
1165	case PT_EPT:
1166		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
1167		break;
1168	default:
1169		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
1170	}
1171
1172	return (mask);
1173}
1174
1175static __inline boolean_t
1176pmap_ps_enabled(pmap_t pmap)
1177{
1178
1179	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
1180}
1181
1182static void
1183pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
1184{
1185
1186	switch (pmap->pm_type) {
1187	case PT_X86:
1188		break;
1189	case PT_EPT:
1190		/*
1191		 * XXX
1192		 * This is a little bogus since the generation number is
1193		 * supposed to be bumped up when a region of the address
1194		 * space is invalidated in the page tables.
1195		 *
1196		 * In this case the old PDE entry is valid but yet we want
1197		 * to make sure that any mappings using the old entry are
1198		 * invalidated in the TLB.
1199		 *
1200		 * The reason this works as expected is because we rendezvous
1201		 * "all" host cpus and force any vcpu context to exit as a
1202		 * side-effect.
1203		 */
1204		atomic_add_acq_long(&pmap->pm_eptgen, 1);
1205		break;
1206	default:
1207		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
1208	}
1209	pde_store(pde, newpde);
1210}
1211
1212/*
1213 * After changing the page size for the specified virtual address in the page
1214 * table, flush the corresponding entries from the processor's TLB.  Only the
1215 * calling processor's TLB is affected.
1216 *
1217 * The calling thread must be pinned to a processor.
1218 */
1219static void
1220pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
1221{
1222	pt_entry_t PG_G;
1223
1224	if (pmap->pm_type == PT_EPT)
1225		return;
1226
1227	KASSERT(pmap->pm_type == PT_X86,
1228	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
1229
1230	PG_G = pmap_global_bit(pmap);
1231
1232	if ((newpde & PG_PS) == 0)
1233		/* Demotion: flush a specific 2MB page mapping. */
1234		invlpg(va);
1235	else if ((newpde & PG_G) == 0)
1236		/*
1237		 * Promotion: flush every 4KB page mapping from the TLB
1238		 * because there are too many to flush individually.
1239		 */
1240		invltlb();
1241	else {
1242		/*
1243		 * Promotion: flush every 4KB page mapping from the TLB,
1244		 * including any global (PG_G) mappings.
1245		 */
1246		invltlb_globpcid();
1247	}
1248}
1249#ifdef SMP
1250
1251static void
1252pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
1253{
1254	struct invpcid_descr d;
1255	uint64_t cr3;
1256
1257	if (invpcid_works) {
1258		d.pcid = pmap->pm_pcid;
1259		d.pad = 0;
1260		d.addr = va;
1261		invpcid(&d, INVPCID_ADDR);
1262		return;
1263	}
1264
1265	cr3 = rcr3();
1266	critical_enter();
1267	load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
1268	invlpg(va);
1269	load_cr3(cr3 | CR3_PCID_SAVE);
1270	critical_exit();
1271}
1272
1273/*
1274 * For SMP, these functions have to use the IPI mechanism for coherence.
1275 *
1276 * N.B.: Before calling any of the following TLB invalidation functions,
1277 * the calling processor must ensure that all stores updating a non-
1278 * kernel page table are globally performed.  Otherwise, another
1279 * processor could cache an old, pre-update entry without being
1280 * invalidated.  This can happen one of two ways: (1) The pmap becomes
1281 * active on another processor after its pm_active field is checked by
1282 * one of the following functions but before a store updating the page
1283 * table is globally performed. (2) The pmap becomes active on another
1284 * processor before its pm_active field is checked but due to
1285 * speculative loads one of the following functions stills reads the
1286 * pmap as inactive on the other processor.
1287 *
1288 * The kernel page table is exempt because its pm_active field is
1289 * immutable.  The kernel page table is always active on every
1290 * processor.
1291 */
1292
1293/*
1294 * Interrupt the cpus that are executing in the guest context.
1295 * This will force the vcpu to exit and the cached EPT mappings
1296 * will be invalidated by the host before the next vmresume.
1297 */
1298static __inline void
1299pmap_invalidate_ept(pmap_t pmap)
1300{
1301
1302	sched_pin();
1303	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1304	    ("pmap_invalidate_ept: absurd pm_active"));
1305
1306	/*
1307	 * The TLB mappings associated with a vcpu context are not
1308	 * flushed each time a different vcpu is chosen to execute.
1309	 *
1310	 * This is in contrast with a process's vtop mappings that
1311	 * are flushed from the TLB on each context switch.
1312	 *
1313	 * Therefore we need to do more than just a TLB shootdown on
1314	 * the active cpus in 'pmap->pm_active'. To do this we keep
1315	 * track of the number of invalidations performed on this pmap.
1316	 *
1317	 * Each vcpu keeps a cache of this counter and compares it
1318	 * just before a vmresume. If the counter is out-of-date an
1319	 * invept will be done to flush stale mappings from the TLB.
1320	 */
1321	atomic_add_acq_long(&pmap->pm_eptgen, 1);
1322
1323	/*
1324	 * Force the vcpu to exit and trap back into the hypervisor.
1325	 *
1326	 * XXX this is not optimal because IPI_AST builds a trapframe
1327	 * whereas all we need is an 'eoi' followed by 'iret'.
1328	 */
1329	ipi_selected(pmap->pm_active, IPI_AST);
1330	sched_unpin();
1331}
1332
1333void
1334pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1335{
1336	cpuset_t other_cpus;
1337	u_int cpuid;
1338
1339	if (pmap->pm_type == PT_EPT) {
1340		pmap_invalidate_ept(pmap);
1341		return;
1342	}
1343
1344	KASSERT(pmap->pm_type == PT_X86,
1345	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
1346
1347	sched_pin();
1348	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1349		if (!pmap_pcid_enabled) {
1350			invlpg(va);
1351		} else {
1352			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1353				if (pmap == PCPU_GET(curpmap))
1354					invlpg(va);
1355				else
1356					pmap_invalidate_page_pcid(pmap, va);
1357			} else {
1358				invltlb_globpcid();
1359			}
1360		}
1361		smp_invlpg(pmap, va);
1362	} else {
1363		cpuid = PCPU_GET(cpuid);
1364		other_cpus = all_cpus;
1365		CPU_CLR(cpuid, &other_cpus);
1366		if (CPU_ISSET(cpuid, &pmap->pm_active))
1367			invlpg(va);
1368		else if (pmap_pcid_enabled) {
1369			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
1370				pmap_invalidate_page_pcid(pmap, va);
1371			else
1372				invltlb_globpcid();
1373		}
1374		if (pmap_pcid_enabled)
1375			CPU_AND(&other_cpus, &pmap->pm_save);
1376		else
1377			CPU_AND(&other_cpus, &pmap->pm_active);
1378		if (!CPU_EMPTY(&other_cpus))
1379			smp_masked_invlpg(other_cpus, pmap, va);
1380	}
1381	sched_unpin();
1382}
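
/*
 * To summarize pmap_invalidate_page(): if the pmap is the kernel pmap
 * or is active on every CPU, the local TLB is flushed (a single entry
 * via INVLPG or INVPCID, or globally when the pmap has no usable PCID)
 * and a shootdown is broadcast with smp_invlpg(); otherwise this CPU
 * flushes its own entry (or, with PCID, its possibly stale PCID-tagged
 * entries) and the smp_masked_invlpg() shootdown is restricted to the
 * CPUs in pm_save (with PCID) or pm_active (without).
 */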
1383
1384static void
1385pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1386{
1387	struct invpcid_descr d;
1388	uint64_t cr3;
1389	vm_offset_t addr;
1390
1391	if (invpcid_works) {
1392		d.pcid = pmap->pm_pcid;
1393		d.pad = 0;
1394		for (addr = sva; addr < eva; addr += PAGE_SIZE) {
1395			d.addr = addr;
1396			invpcid(&d, INVPCID_ADDR);
1397		}
1398		return;
1399	}
1400
1401	cr3 = rcr3();
1402	critical_enter();
1403	load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
1404	for (addr = sva; addr < eva; addr += PAGE_SIZE)
1405		invlpg(addr);
1406	load_cr3(cr3 | CR3_PCID_SAVE);
1407	critical_exit();
1408}
1409
1410void
1411pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1412{
1413	cpuset_t other_cpus;
1414	vm_offset_t addr;
1415	u_int cpuid;
1416
1417	if (pmap->pm_type == PT_EPT) {
1418		pmap_invalidate_ept(pmap);
1419		return;
1420	}
1421
1422	KASSERT(pmap->pm_type == PT_X86,
1423	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
1424
1425	sched_pin();
1426	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1427		if (!pmap_pcid_enabled) {
1428			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1429				invlpg(addr);
1430		} else {
1431			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1432				if (pmap == PCPU_GET(curpmap)) {
1433					for (addr = sva; addr < eva;
1434					    addr += PAGE_SIZE)
1435						invlpg(addr);
1436				} else {
1437					pmap_invalidate_range_pcid(pmap,
1438					    sva, eva);
1439				}
1440			} else {
1441				invltlb_globpcid();
1442			}
1443		}
1444		smp_invlpg_range(pmap, sva, eva);
1445	} else {
1446		cpuid = PCPU_GET(cpuid);
1447		other_cpus = all_cpus;
1448		CPU_CLR(cpuid, &other_cpus);
1449		if (CPU_ISSET(cpuid, &pmap->pm_active)) {
1450			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1451				invlpg(addr);
1452		} else if (pmap_pcid_enabled) {
1453			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
1454				pmap_invalidate_range_pcid(pmap, sva, eva);
1455			else
1456				invltlb_globpcid();
1457		}
1458		if (pmap_pcid_enabled)
1459			CPU_AND(&other_cpus, &pmap->pm_save);
1460		else
1461			CPU_AND(&other_cpus, &pmap->pm_active);
1462		if (!CPU_EMPTY(&other_cpus))
1463			smp_masked_invlpg_range(other_cpus, pmap, sva, eva);
1464	}
1465	sched_unpin();
1466}
1467
1468void
1469pmap_invalidate_all(pmap_t pmap)
1470{
1471	cpuset_t other_cpus;
1472	struct invpcid_descr d;
1473	uint64_t cr3;
1474	u_int cpuid;
1475
1476	if (pmap->pm_type == PT_EPT) {
1477		pmap_invalidate_ept(pmap);
1478		return;
1479	}
1480
1481	KASSERT(pmap->pm_type == PT_X86,
1482	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
1483
1484	sched_pin();
1485	cpuid = PCPU_GET(cpuid);
1486	if (pmap == kernel_pmap ||
1487	    (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) ||
1488	    !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1489		if (invpcid_works) {
1490			bzero(&d, sizeof(d));
1491			invpcid(&d, INVPCID_CTXGLOB);
1492		} else {
1493			invltlb_globpcid();
1494		}
1495		CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
1496		smp_invltlb(pmap);
1497	} else {
1498		other_cpus = all_cpus;
1499		CPU_CLR(cpuid, &other_cpus);
1500
1501		/*
1502		 * This logic is duplicated in the Xinvltlb shootdown
1503		 * IPI handler.
1504		 */
1505		if (pmap_pcid_enabled) {
1506			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1507				if (invpcid_works) {
1508					d.pcid = pmap->pm_pcid;
1509					d.pad = 0;
1510					d.addr = 0;
1511					invpcid(&d, INVPCID_CTX);
1512				} else {
1513					cr3 = rcr3();
1514					critical_enter();
1515
1516					 * Bit 63 is clear, so pcid TLB
1517					 * entries are invalidated.
1518					 * entries are invalidated.
1519					 */
1520					load_cr3(pmap->pm_cr3);
1521					load_cr3(cr3 | CR3_PCID_SAVE);
1522					critical_exit();
1523				}
1524			} else {
1525				invltlb_globpcid();
1526			}
1527		} else if (CPU_ISSET(cpuid, &pmap->pm_active))
1528			invltlb();
1529		CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
1530		if (pmap_pcid_enabled)
1531			CPU_AND(&other_cpus, &pmap->pm_save);
1532		else
1533			CPU_AND(&other_cpus, &pmap->pm_active);
1534		if (!CPU_EMPTY(&other_cpus))
1535			smp_masked_invltlb(other_cpus, pmap);
1536	}
1537	sched_unpin();
1538}
1539
1540void
1541pmap_invalidate_cache(void)
1542{
1543
1544	sched_pin();
1545	wbinvd();
1546	smp_cache_flush();
1547	sched_unpin();
1548}
1549
1550struct pde_action {
1551	cpuset_t invalidate;	/* processors that invalidate their TLB */
1552	pmap_t pmap;
1553	vm_offset_t va;
1554	pd_entry_t *pde;
1555	pd_entry_t newpde;
1556	u_int store;		/* processor that updates the PDE */
1557};
1558
1559static void
1560pmap_update_pde_action(void *arg)
1561{
1562	struct pde_action *act = arg;
1563
1564	if (act->store == PCPU_GET(cpuid))
1565		pmap_update_pde_store(act->pmap, act->pde, act->newpde);
1566}
1567
1568static void
1569pmap_update_pde_teardown(void *arg)
1570{
1571	struct pde_action *act = arg;
1572
1573	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1574		pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
1575}
1576
1577/*
1578 * Change the page size for the specified virtual address in a way that
1579 * prevents any possibility of the TLB ever having two entries that map the
1580 * same virtual address using different page sizes.  This is the recommended
1581 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1582 * machine check exception for a TLB state that is improperly diagnosed as a
1583 * hardware error.
1584 */
1585static void
1586pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1587{
1588	struct pde_action act;
1589	cpuset_t active, other_cpus;
1590	u_int cpuid;
1591
1592	sched_pin();
1593	cpuid = PCPU_GET(cpuid);
1594	other_cpus = all_cpus;
1595	CPU_CLR(cpuid, &other_cpus);
1596	if (pmap == kernel_pmap || pmap->pm_type == PT_EPT)
1597		active = all_cpus;
1598	else {
1599		active = pmap->pm_active;
1600		CPU_AND_ATOMIC(&pmap->pm_save, &active);
1601	}
1602	if (CPU_OVERLAP(&active, &other_cpus)) {
1603		act.store = cpuid;
1604		act.invalidate = active;
1605		act.va = va;
1606		act.pmap = pmap;
1607		act.pde = pde;
1608		act.newpde = newpde;
1609		CPU_SET(cpuid, &active);
1610		smp_rendezvous_cpus(active,
1611		    smp_no_rendevous_barrier, pmap_update_pde_action,
1612		    pmap_update_pde_teardown, &act);
1613	} else {
1614		pmap_update_pde_store(pmap, pde, newpde);
1615		if (CPU_ISSET(cpuid, &active))
1616			pmap_update_pde_invalidate(pmap, va, newpde);
1617	}
1618	sched_unpin();
1619}
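
/*
 * The rendezvous above keeps every CPU that might hold a stale
 * translation spinning while act.store rewrites the PDE, and each
 * participant then flushes its own TLB in the teardown, so no CPU can
 * cache translations of both page sizes for va at once, which is the
 * Erratum 383 hazard described above.
 */
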
1620#else /* !SMP */
1621/*
1622 * Normal, non-SMP, invalidation functions.
1623 * We inline these within pmap.c for speed.
1624 */
1625PMAP_INLINE void
1626pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1627{
1628
1629	switch (pmap->pm_type) {
1630	case PT_X86:
1631		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1632			invlpg(va);
1633		break;
1634	case PT_EPT:
1635		pmap->pm_eptgen++;
1636		break;
1637	default:
1638		panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type);
1639	}
1640}
1641
1642PMAP_INLINE void
1643pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1644{
1645	vm_offset_t addr;
1646
1647	switch (pmap->pm_type) {
1648	case PT_X86:
1649		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1650			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1651				invlpg(addr);
1652		break;
1653	case PT_EPT:
1654		pmap->pm_eptgen++;
1655		break;
1656	default:
1657		panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
1658	}
1659}
1660
1661PMAP_INLINE void
1662pmap_invalidate_all(pmap_t pmap)
1663{
1664
1665	switch (pmap->pm_type) {
1666	case PT_X86:
1667		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1668			invltlb();
1669		break;
1670	case PT_EPT:
1671		pmap->pm_eptgen++;
1672		break;
1673	default:
1674		panic("pmap_invalidate_all: unknown type %d", pmap->pm_type);
1675	}
1676}
1677
1678PMAP_INLINE void
1679pmap_invalidate_cache(void)
1680{
1681
1682	wbinvd();
1683}
1684
1685static void
1686pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1687{
1688
1689	pmap_update_pde_store(pmap, pde, newpde);
1690	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1691		pmap_update_pde_invalidate(pmap, va, newpde);
1692	else
1693		CPU_ZERO(&pmap->pm_save);
1694}
1695#endif /* !SMP */
1696
1697#define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
1698
1699void
1700pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1701{
1702
1703	KASSERT((sva & PAGE_MASK) == 0,
1704	    ("pmap_invalidate_cache_range: sva not page-aligned"));
1705	KASSERT((eva & PAGE_MASK) == 0,
1706	    ("pmap_invalidate_cache_range: eva not page-aligned"));
1707
1708	if (cpu_feature & CPUID_SS)
1709		; /* If "Self Snoop" is supported, do nothing. */
1710	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1711	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1712
1713		/*
1714		 * XXX: Some CPUs fault, hang, or trash the local APIC
1715		 * registers if we use CLFLUSH on the local APIC
1716		 * range.  The local APIC is always uncached, so we
1717		 * don't need to flush for that range anyway.
1718		 */
1719		if (pmap_kextract(sva) == lapic_paddr)
1720			return;
1721
1722		/*
1723		 * Otherwise, do per-cache line flush.  Use the mfence
1724		 * instruction to ensure that previous stores are
1725		 * included in the write-back.  The processor
1726		 * propagates flush to other processors in the cache
1727		 * coherence domain.
1728		 */
1729		mfence();
1730		for (; sva < eva; sva += cpu_clflush_line_size)
1731			clflush(sva);
1732		mfence();
1733	} else {
1734
1735		/*
1736		 * No targeted cache flush methods are supported by the CPU,
1737		 * or the supplied range is bigger than 2MB.
1738		 * Globally invalidate cache.
1739		 */
1740		pmap_invalidate_cache();
1741	}
1742}
1743
1744/*
1745 * Remove the specified set of pages from the data and instruction caches.
1746 *
1747 * In contrast to pmap_invalidate_cache_range(), this function does not
1748 * rely on the CPU's self-snoop feature, because it is intended for use
1749 * when moving pages into a different cache domain.
1750 */
1751void
1752pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1753{
1754	vm_offset_t daddr, eva;
1755	int i;
1756
1757	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1758	    (cpu_feature & CPUID_CLFSH) == 0)
1759		pmap_invalidate_cache();
1760	else {
1761		mfence();
1762		for (i = 0; i < count; i++) {
1763			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
1764			eva = daddr + PAGE_SIZE;
1765			for (; daddr < eva; daddr += cpu_clflush_line_size)
1766				clflush(daddr);
1767		}
1768		mfence();
1769	}
1770}
1771
1772/*
1773 *	Routine:	pmap_extract
1774 *	Function:
1775 *		Extract the physical page address associated
1776 *		with the given map/virtual_address pair.
1777 */
1778vm_paddr_t
1779pmap_extract(pmap_t pmap, vm_offset_t va)
1780{
1781	pdp_entry_t *pdpe;
1782	pd_entry_t *pde;
1783	pt_entry_t *pte, PG_V;
1784	vm_paddr_t pa;
1785
1786	pa = 0;
1787	PG_V = pmap_valid_bit(pmap);
1788	PMAP_LOCK(pmap);
1789	pdpe = pmap_pdpe(pmap, va);
1790	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1791		if ((*pdpe & PG_PS) != 0)
1792			pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
1793		else {
1794			pde = pmap_pdpe_to_pde(pdpe, va);
1795			if ((*pde & PG_V) != 0) {
1796				if ((*pde & PG_PS) != 0) {
1797					pa = (*pde & PG_PS_FRAME) |
1798					    (va & PDRMASK);
1799				} else {
1800					pte = pmap_pde_to_pte(pde, va);
1801					pa = (*pte & PG_FRAME) |
1802					    (va & PAGE_MASK);
1803				}
1804			}
1805		}
1806	}
1807	PMAP_UNLOCK(pmap);
1808	return (pa);
1809}
1810
1811/*
1812 *	Routine:	pmap_extract_and_hold
1813 *	Function:
1814 *		Atomically extract and hold the physical page
1815 *		with the given pmap and virtual address pair
1816 *		if that mapping permits the given protection.
1817 */
1818vm_page_t
1819pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1820{
1821	pd_entry_t pde, *pdep;
1822	pt_entry_t pte, PG_RW, PG_V;
1823	vm_paddr_t pa;
1824	vm_page_t m;
1825
1826	pa = 0;
1827	m = NULL;
1828	PG_RW = pmap_rw_bit(pmap);
1829	PG_V = pmap_valid_bit(pmap);
1830	PMAP_LOCK(pmap);
1831retry:
1832	pdep = pmap_pde(pmap, va);
1833	if (pdep != NULL && (pde = *pdep)) {
1834		if (pde & PG_PS) {
1835			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1836				if (vm_page_pa_tryrelock(pmap, (pde &
1837				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1838					goto retry;
1839				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1840				    (va & PDRMASK));
1841				vm_page_hold(m);
1842			}
1843		} else {
1844			pte = *pmap_pde_to_pte(pdep, va);
1845			if ((pte & PG_V) &&
1846			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1847				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1848				    &pa))
1849					goto retry;
1850				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1851				vm_page_hold(m);
1852			}
1853		}
1854	}
1855	PA_UNLOCK_COND(pa);
1856	PMAP_UNLOCK(pmap);
1857	return (m);
1858}
1859
1860vm_paddr_t
1861pmap_kextract(vm_offset_t va)
1862{
1863	pd_entry_t pde;
1864	vm_paddr_t pa;
1865
1866	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1867		pa = DMAP_TO_PHYS(va);
1868	} else {
1869		pde = *vtopde(va);
1870		if (pde & PG_PS) {
1871			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
1872		} else {
1873			/*
1874			 * Beware of a concurrent promotion that changes the
1875			 * PDE at this point!  For example, vtopte() must not
1876			 * be used to access the PTE because it would use the
1877			 * new PDE.  It is, however, safe to use the old PDE
1878			 * because the page table page is preserved by the
1879			 * promotion.
1880			 */
1881			pa = *pmap_pde_to_pte(&pde, va);
1882			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
1883		}
1884	}
1885	return (pa);
1886}
1887
1888/***************************************************
1889 * Low level mapping routines.....
1890 ***************************************************/
1891
1892/*
1893 * Add a wired page to the kva.
1894 * Note: not SMP coherent.
1895 */
1896PMAP_INLINE void
1897pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1898{
1899	pt_entry_t *pte;
1900
1901	pte = vtopte(va);
1902	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
1903}
1904
1905static __inline void
1906pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1907{
1908	pt_entry_t *pte;
1909	int cache_bits;
1910
1911	pte = vtopte(va);
1912	cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
1913	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
1914}
1915
1916/*
1917 * Remove a page from the kernel pagetables.
1918 * Note: not SMP coherent.
1919 */
1920PMAP_INLINE void
1921pmap_kremove(vm_offset_t va)
1922{
1923	pt_entry_t *pte;
1924
1925	pte = vtopte(va);
1926	pte_clear(pte);
1927}
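
/*
 * Illustrative usage sketch (not part of the original source): a caller
 * that has reserved a kva "va" could temporarily map and later unmap a
 * physical page "pa" as follows, performing its own TLB invalidation
 * because these routines are not SMP coherent:
 *
 *	pmap_kenter(va, pa);
 *	...
 *	pmap_kremove(va);
 *	pmap_invalidate_page(kernel_pmap, va);
 */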
1928
1929/*
1930 *	Used to map a range of physical addresses into kernel
1931 *	virtual address space.
1932 *
1933 *	The value passed in '*virt' is a suggested virtual address for
1934 *	the mapping. Architectures which can support a direct-mapped
1935 *	physical to virtual region can return the appropriate address
1936 *	within that region, leaving '*virt' unchanged. Other
1937 *	architectures should map the pages starting at '*virt' and
1938 *	update '*virt' with the first usable address after the mapped
1939 *	region.
1940 */
1941vm_offset_t
1942pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1943{
1944	return PHYS_TO_DMAP(start);
1945}
1946
1947
1948/*
1949 * Add a list of wired pages to the kva.  This routine is only
1950 * used for temporary kernel mappings that do not need to have
1951 * page modification or references recorded.  Note that old
1952 * mappings are simply written over.  The pages *must* be
1953 * wired.
1954 *
1955 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1956 */
1957void
1958pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1959{
1960	pt_entry_t *endpte, oldpte, pa, *pte;
1961	vm_page_t m;
1962	int cache_bits;
1963
1964	oldpte = 0;
1965	pte = vtopte(sva);
1966	endpte = pte + count;
1967	while (pte < endpte) {
1968		m = *ma++;
1969		cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
1970		pa = VM_PAGE_TO_PHYS(m) | cache_bits;
1971		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
1972			oldpte |= *pte;
1973			pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
1974		}
1975		pte++;
1976	}
1977	if (__predict_false((oldpte & X86_PG_V) != 0))
1978		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1979		    PAGE_SIZE);
1980}
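
/*
 * Editorial note (not part of the original source): pmap_qenter() only
 * issues the ranged shootdown when at least one of the overwritten PTEs
 * was valid; "oldpte" accumulates the old PTE bits, so the X86_PG_V
 * test above is false when every slot was previously empty and no stale
 * TLB entries can exist.
 */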
1981
1982/*
1983 * This routine tears out page mappings from the
1984 * kernel -- it is meant only for temporary mappings.
1985 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1986 */
1987void
1988pmap_qremove(vm_offset_t sva, int count)
1989{
1990	vm_offset_t va;
1991
1992	va = sva;
1993	while (count-- > 0) {
1994		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
1995		pmap_kremove(va);
1996		va += PAGE_SIZE;
1997	}
1998	pmap_invalidate_range(kernel_pmap, sva, va);
1999}
2000
2001/***************************************************
2002 * Page table page management routines.....
2003 ***************************************************/
2004static __inline void
2005pmap_free_zero_pages(struct spglist *free)
2006{
2007	vm_page_t m;
2008
2009	while ((m = SLIST_FIRST(free)) != NULL) {
2010		SLIST_REMOVE_HEAD(free, plinks.s.ss);
2011		/* Preserve the page's PG_ZERO setting. */
2012		vm_page_free_toq(m);
2013	}
2014}
2015
2016/*
2017 * Schedule the specified unused page table page to be freed.  Specifically,
2018 * add the page to the specified list of pages that will be released to the
2019 * physical memory manager after the TLB has been updated.
2020 */
2021static __inline void
2022pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
2023    boolean_t set_PG_ZERO)
2024{
2025
2026	if (set_PG_ZERO)
2027		m->flags |= PG_ZERO;
2028	else
2029		m->flags &= ~PG_ZERO;
2030	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2031}
2032
2033/*
2034 * Inserts the specified page table page into the specified pmap's collection
2035 * of idle page table pages.  Each of a pmap's page table pages is responsible
2036 * for mapping a distinct range of virtual addresses.  The pmap's collection is
2037 * ordered by this virtual address range.
2038 */
2039static __inline int
2040pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
2041{
2042
2043	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2044	return (vm_radix_insert(&pmap->pm_root, mpte));
2045}
2046
2047/*
2048 * Looks for a page table page mapping the specified virtual address in the
2049 * specified pmap's collection of idle page table pages.  Returns NULL if there
2050 * is no page table page corresponding to the specified virtual address.
2051 */
2052static __inline vm_page_t
2053pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
2054{
2055
2056	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2057	return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
2058}
2059
2060/*
2061 * Removes the specified page table page from the specified pmap's collection
2062 * of idle page table pages.  The specified page table page must be a member of
2063 * the pmap's collection.
2064 */
2065static __inline void
2066pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
2067{
2068
2069	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2070	vm_radix_remove(&pmap->pm_root, mpte->pindex);
2071}
2072
2073/*
2074 * Decrements a page table page's wire count, which is used to record the
2075 * number of valid page table entries within the page.  If the wire count
2076 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
2077 * page table page was unmapped and FALSE otherwise.
2078 */
2079static inline boolean_t
2080pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2081{
2082
2083	--m->wire_count;
2084	if (m->wire_count == 0) {
2085		_pmap_unwire_ptp(pmap, va, m, free);
2086		return (TRUE);
2087	} else
2088		return (FALSE);
2089}
2090
2091static void
2092_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2093{
2094
2095	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2096	/*
2097	 * unmap the page table page
2098	 */
2099	if (m->pindex >= (NUPDE + NUPDPE)) {
2100		/* PDP page */
2101		pml4_entry_t *pml4;
2102		pml4 = pmap_pml4e(pmap, va);
2103		*pml4 = 0;
2104	} else if (m->pindex >= NUPDE) {
2105		/* PD page */
2106		pdp_entry_t *pdp;
2107		pdp = pmap_pdpe(pmap, va);
2108		*pdp = 0;
2109	} else {
2110		/* PTE page */
2111		pd_entry_t *pd;
2112		pd = pmap_pde(pmap, va);
2113		*pd = 0;
2114	}
2115	pmap_resident_count_dec(pmap, 1);
2116	if (m->pindex < NUPDE) {
2117		/* We just released a PT, unhold the matching PD */
2118		vm_page_t pdpg;
2119
2120		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
2121		pmap_unwire_ptp(pmap, va, pdpg, free);
2122	}
2123	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
2124		/* We just released a PD, unhold the matching PDP */
2125		vm_page_t pdppg;
2126
2127		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
2128		pmap_unwire_ptp(pmap, va, pdppg, free);
2129	}
2130
2131	/*
2132	 * This is a release store so that the ordinary store unmapping
2133	 * the page table page is globally performed before TLB shoot-
2134	 * down is begun.
2135	 */
2136	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
2137
2138	/*
2139	 * Put the page on a list so that it is released only after
2140	 * *ALL* TLB shootdowns are done.
2141	 */
2142	pmap_add_delayed_free_list(m, free, TRUE);
2143}
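
/*
 * Editorial note (not part of the original source): the pindex ranges
 * tested above encode the level of the page table page being freed:
 * pindex < NUPDE identifies a page table (PTE) page, NUPDE <= pindex <
 * NUPDE + NUPDPE a page directory (PD) page, and pindex >= NUPDE +
 * NUPDPE a page directory pointer (PDP) page.  Freeing a lower-level
 * page drops a reference on its parent, which may cascade upward.
 */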
2144
2145/*
2146 * After removing a page table entry, this routine is used to
2147 * conditionally free the page, and manage the hold/wire counts.
2148 */
2149static int
2150pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2151    struct spglist *free)
2152{
2153	vm_page_t mpte;
2154
2155	if (va >= VM_MAXUSER_ADDRESS)
2156		return (0);
2157	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2158	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
2159	return (pmap_unwire_ptp(pmap, va, mpte, free));
2160}
2161
2162void
2163pmap_pinit0(pmap_t pmap)
2164{
2165
2166	PMAP_LOCK_INIT(pmap);
2167	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
2168	pmap->pm_cr3 = KPML4phys;
2169	pmap->pm_root.rt_root = 0;
2170	CPU_ZERO(&pmap->pm_active);
2171	CPU_ZERO(&pmap->pm_save);
2172	PCPU_SET(curpmap, pmap);
2173	TAILQ_INIT(&pmap->pm_pvchunk);
2174	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2175	pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
2176	pmap->pm_flags = pmap_flags;
2177}
2178
2179/*
2180 * Initialize a preallocated and zeroed pmap structure,
2181 * such as one in a vmspace structure.
2182 */
2183int
2184pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
2185{
2186	vm_page_t pml4pg;
2187	vm_paddr_t pml4phys;
2188	int i;
2189
2190	/*
2191	 * Allocate the PML4 page.
2192	 */
2193	while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2194	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
2195		VM_WAIT;
2196
2197	pml4phys = VM_PAGE_TO_PHYS(pml4pg);
2198	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
2199	pmap->pm_pcid = -1;
2200	pmap->pm_cr3 = ~0;	/* initialize to an invalid value */
2201
2202	if ((pml4pg->flags & PG_ZERO) == 0)
2203		pagezero(pmap->pm_pml4);
2204
2205	/*
2206	 * Do not install the host kernel mappings in the nested page
2207	 * tables. These mappings are meaningless in the guest physical
2208	 * address space.
2209	 */
2210	if ((pmap->pm_type = pm_type) == PT_X86) {
2211		pmap->pm_cr3 = pml4phys;
2212
2213		/* Wire in kernel global address entries. */
2214		for (i = 0; i < NKPML4E; i++) {
2215			pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) |
2216			    X86_PG_RW | X86_PG_V | PG_U;
2217		}
2218		for (i = 0; i < ndmpdpphys; i++) {
2219			pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) |
2220			    X86_PG_RW | X86_PG_V | PG_U;
2221		}
2222
2223		/* install self-referential address mapping entry(s) */
2224		pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
2225		    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2226
2227		if (pmap_pcid_enabled) {
2228			pmap->pm_pcid = alloc_unr(&pcid_unr);
2229			if (pmap->pm_pcid != -1)
2230				pmap->pm_cr3 |= pmap->pm_pcid;
2231		}
2232	}
2233
2234	pmap->pm_root.rt_root = 0;
2235	CPU_ZERO(&pmap->pm_active);
2236	TAILQ_INIT(&pmap->pm_pvchunk);
2237	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2238	pmap->pm_flags = flags;
2239	pmap->pm_eptgen = 0;
2240	CPU_ZERO(&pmap->pm_save);
2241
2242	return (1);
2243}
2244
2245int
2246pmap_pinit(pmap_t pmap)
2247{
2248
2249	return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
2250}
2251
2252/*
2253 * This routine is called if the desired page table page does not exist.
2254 *
2255 * If page table page allocation fails, this routine may sleep before
2256 * returning NULL.  It sleeps only if a lock pointer was given.
2257 *
2258 * Note: If a page allocation fails at page table level two or three,
2259 * one or two pages may be held during the wait, only to be released
2260 * afterwards.  This conservative approach makes it easy to argue
2261 * that no race conditions can occur.
2262 */
2263static vm_page_t
2264_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2265{
2266	vm_page_t m, pdppg, pdpg;
2267	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
2268
2269	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2270
2271	PG_A = pmap_accessed_bit(pmap);
2272	PG_M = pmap_modified_bit(pmap);
2273	PG_V = pmap_valid_bit(pmap);
2274	PG_RW = pmap_rw_bit(pmap);
2275
2276	/*
2277	 * Allocate a page table page.
2278	 */
2279	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
2280	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2281		if (lockp != NULL) {
2282			RELEASE_PV_LIST_LOCK(lockp);
2283			PMAP_UNLOCK(pmap);
2284			rw_runlock(&pvh_global_lock);
2285			VM_WAIT;
2286			rw_rlock(&pvh_global_lock);
2287			PMAP_LOCK(pmap);
2288		}
2289
2290		/*
2291		 * Indicate the need to retry.  While waiting, the page table
2292		 * page may have been allocated.
2293		 */
2294		return (NULL);
2295	}
2296	if ((m->flags & PG_ZERO) == 0)
2297		pmap_zero_page(m);
2298
2299	/*
2300	 * Map the pagetable page into the process address space, if
2301	 * it isn't already there.
2302	 */
2303
2304	if (ptepindex >= (NUPDE + NUPDPE)) {
2305		pml4_entry_t *pml4;
2306		vm_pindex_t pml4index;
2307
2308		/* Wire up a new PDPE page */
2309		pml4index = ptepindex - (NUPDE + NUPDPE);
2310		pml4 = &pmap->pm_pml4[pml4index];
2311		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2312
2313	} else if (ptepindex >= NUPDE) {
2314		vm_pindex_t pml4index;
2315		vm_pindex_t pdpindex;
2316		pml4_entry_t *pml4;
2317		pdp_entry_t *pdp;
2318
2319		/* Wire up a new PDE page */
2320		pdpindex = ptepindex - NUPDE;
2321		pml4index = pdpindex >> NPML4EPGSHIFT;
2322
2323		pml4 = &pmap->pm_pml4[pml4index];
2324		if ((*pml4 & PG_V) == 0) {
2325			/* Have to allocate a new pdp, recurse */
2326			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
2327			    lockp) == NULL) {
2328				--m->wire_count;
2329				atomic_subtract_int(&cnt.v_wire_count, 1);
2330				vm_page_free_zero(m);
2331				return (NULL);
2332			}
2333		} else {
2334			/* Add reference to pdp page */
2335			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
2336			pdppg->wire_count++;
2337		}
2338		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2339
2340		/* Now find the pdp page */
2341		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2342		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2343
2344	} else {
2345		vm_pindex_t pml4index;
2346		vm_pindex_t pdpindex;
2347		pml4_entry_t *pml4;
2348		pdp_entry_t *pdp;
2349		pd_entry_t *pd;
2350
2351		/* Wire up a new PTE page */
2352		pdpindex = ptepindex >> NPDPEPGSHIFT;
2353		pml4index = pdpindex >> NPML4EPGSHIFT;
2354
2355		/* First, find the pdp and check that it's valid. */
2356		pml4 = &pmap->pm_pml4[pml4index];
2357		if ((*pml4 & PG_V) == 0) {
2358			/* Have to allocate a new pd, recurse */
2359			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2360			    lockp) == NULL) {
2361				--m->wire_count;
2362				atomic_subtract_int(&cnt.v_wire_count, 1);
2363				vm_page_free_zero(m);
2364				return (NULL);
2365			}
2366			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2367			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2368		} else {
2369			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2370			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2371			if ((*pdp & PG_V) == 0) {
2372				/* Have to allocate a new pd, recurse */
2373				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2374				    lockp) == NULL) {
2375					--m->wire_count;
2376					atomic_subtract_int(&cnt.v_wire_count,
2377					    1);
2378					vm_page_free_zero(m);
2379					return (NULL);
2380				}
2381			} else {
2382				/* Add reference to the pd page */
2383				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
2384				pdpg->wire_count++;
2385			}
2386		}
2387		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
2388
2389		/* Now we know where the page directory page is */
2390		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
2391		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2392	}
2393
2394	pmap_resident_count_inc(pmap, 1);
2395
2396	return (m);
2397}
2398
2399static vm_page_t
2400pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2401{
2402	vm_pindex_t pdpindex, ptepindex;
2403	pdp_entry_t *pdpe, PG_V;
2404	vm_page_t pdpg;
2405
2406	PG_V = pmap_valid_bit(pmap);
2407
2408retry:
2409	pdpe = pmap_pdpe(pmap, va);
2410	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
2411		/* Add a reference to the pd page. */
2412		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
2413		pdpg->wire_count++;
2414	} else {
2415		/* Allocate a pd page. */
2416		ptepindex = pmap_pde_pindex(va);
2417		pdpindex = ptepindex >> NPDPEPGSHIFT;
2418		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
2419		if (pdpg == NULL && lockp != NULL)
2420			goto retry;
2421	}
2422	return (pdpg);
2423}
2424
2425static vm_page_t
2426pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2427{
2428	vm_pindex_t ptepindex;
2429	pd_entry_t *pd, PG_V;
2430	vm_page_t m;
2431
2432	PG_V = pmap_valid_bit(pmap);
2433
2434	/*
2435	 * Calculate pagetable page index
2436	 */
2437	ptepindex = pmap_pde_pindex(va);
2438retry:
2439	/*
2440	 * Get the page directory entry
2441	 */
2442	pd = pmap_pde(pmap, va);
2443
2444	/*
2445	 * This supports switching from a 2MB page to a
2446	 * normal 4K page.
2447	 */
2448	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
2449		if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
2450			/*
2451			 * Invalidation of the 2MB page mapping may have caused
2452			 * the deallocation of the underlying PD page.
2453			 */
2454			pd = NULL;
2455		}
2456	}
2457
2458	/*
2459	 * If the page table page is mapped, we just increment the
2460	 * hold count, and activate it.
2461	 */
2462	if (pd != NULL && (*pd & PG_V) != 0) {
2463		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
2464		m->wire_count++;
2465	} else {
2466		/*
2467		 * We get here if the pte page isn't mapped, or if it
2468		 * has been deallocated.
2469		 */
2470		m = _pmap_allocpte(pmap, ptepindex, lockp);
2471		if (m == NULL && lockp != NULL)
2472			goto retry;
2473	}
2474	return (m);
2475}
2476
2477
2478/***************************************************
2479 * Pmap allocation/deallocation routines.
2480 ***************************************************/
2481
2482/*
2483 * Release any resources held by the given physical map.
2484 * Called when a pmap initialized by pmap_pinit is being released.
2485 * Should only be called if the map contains no valid mappings.
2486 */
2487void
2488pmap_release(pmap_t pmap)
2489{
2490	vm_page_t m;
2491	int i;
2492
2493	KASSERT(pmap->pm_stats.resident_count == 0,
2494	    ("pmap_release: pmap resident count %ld != 0",
2495	    pmap->pm_stats.resident_count));
2496	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2497	    ("pmap_release: pmap has reserved page table page(s)"));
2498
2499	if (pmap_pcid_enabled) {
2500		/*
2501		 * Invalidate any remaining TLB entries, so that the
2502		 * PCID can be reused.
2503		 */
2504		pmap_invalidate_all(pmap);
2505	}
2506
2507	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
2508
2509	for (i = 0; i < NKPML4E; i++)	/* KVA */
2510		pmap->pm_pml4[KPML4BASE + i] = 0;
2511	for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
2512		pmap->pm_pml4[DMPML4I + i] = 0;
2513	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
2514
2515	m->wire_count--;
2516	atomic_subtract_int(&cnt.v_wire_count, 1);
2517	vm_page_free_zero(m);
2518	if (pmap->pm_pcid != -1)
2519		free_unr(&pcid_unr, pmap->pm_pcid);
2520}
2521
2522static int
2523kvm_size(SYSCTL_HANDLER_ARGS)
2524{
2525	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2526
2527	return sysctl_handle_long(oidp, &ksize, 0, req);
2528}
2529SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2530    0, 0, kvm_size, "LU", "Size of KVM");
2531
2532static int
2533kvm_free(SYSCTL_HANDLER_ARGS)
2534{
2535	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2536
2537	return sysctl_handle_long(oidp, &kfree, 0, req);
2538}
2539SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2540    0, 0, kvm_free, "LU", "Amount of KVM free");
2541
2542/*
2543 * grow the number of kernel page table entries, if needed
2544 */
2545void
2546pmap_growkernel(vm_offset_t addr)
2547{
2548	vm_paddr_t paddr;
2549	vm_page_t nkpg;
2550	pd_entry_t *pde, newpdir;
2551	pdp_entry_t *pdpe;
2552
2553	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2554
2555	/*
2556	 * Return if "addr" is within the range of kernel page table pages
2557	 * that were preallocated during pmap bootstrap.  Moreover, leave
2558	 * "kernel_vm_end" and the kernel page table as they were.
2559	 *
2560	 * The correctness of this action is based on the following
2561	 * argument: vm_map_findspace() allocates contiguous ranges of the
2562	 * kernel virtual address space.  It calls this function if a range
2563	 * ends after "kernel_vm_end".  If the kernel is mapped between
2564	 * "kernel_vm_end" and "addr", then the range cannot begin at
2565	 * "kernel_vm_end".  In fact, its beginning address cannot be less
2566	 * than the kernel.  Thus, there is no immediate need to allocate
2567	 * any new kernel page table pages between "kernel_vm_end" and
2568	 * "KERNBASE".
2569	 */
2570	if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
2571		return;
2572
2573	addr = roundup2(addr, NBPDR);
2574	if (addr - 1 >= kernel_map->max_offset)
2575		addr = kernel_map->max_offset;
2576	while (kernel_vm_end < addr) {
2577		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
2578		if ((*pdpe & X86_PG_V) == 0) {
2579			/* We need a new PDP entry */
2580			nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
2581			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
2582			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2583			if (nkpg == NULL)
2584				panic("pmap_growkernel: no memory to grow kernel");
2585			if ((nkpg->flags & PG_ZERO) == 0)
2586				pmap_zero_page(nkpg);
2587			paddr = VM_PAGE_TO_PHYS(nkpg);
2588			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
2589			    X86_PG_A | X86_PG_M);
2590			continue; /* try again */
2591		}
2592		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
2593		if ((*pde & X86_PG_V) != 0) {
2594			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2595			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2596				kernel_vm_end = kernel_map->max_offset;
2597				break;
2598			}
2599			continue;
2600		}
2601
2602		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
2603		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2604		    VM_ALLOC_ZERO);
2605		if (nkpg == NULL)
2606			panic("pmap_growkernel: no memory to grow kernel");
2607		if ((nkpg->flags & PG_ZERO) == 0)
2608			pmap_zero_page(nkpg);
2609		paddr = VM_PAGE_TO_PHYS(nkpg);
2610		newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2611		pde_store(pde, newpdir);
2612
2613		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2614		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2615			kernel_vm_end = kernel_map->max_offset;
2616			break;
2617		}
2618	}
2619}
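
/*
 * Editorial note (not part of the original source): kernel_vm_end is
 * advanced in NBPDR (2 MB) steps, one new page table page per step;
 * when the walk above crosses into an unpopulated PDP entry, a new
 * page directory page is installed first and the loop retries the same
 * address.
 */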
2620
2621
2622/***************************************************
2623 * page management routines.
2624 ***************************************************/
2625
2626CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2627CTASSERT(_NPCM == 3);
2628CTASSERT(_NPCPV == 168);
2629
2630static __inline struct pv_chunk *
2631pv_to_chunk(pv_entry_t pv)
2632{
2633
2634	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2635}
2636
2637#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2638
2639#define	PC_FREE0	0xfffffffffffffffful
2640#define	PC_FREE1	0xfffffffffffffffful
2641#define	PC_FREE2	0x000000fffffffffful
2642
2643static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
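
/*
 * Editorial note (not part of the original source): a pv_chunk occupies
 * exactly one 4 KB page (see the CTASSERTs above) and provides
 * _NPCPV == 168 pv entries, tracked by _NPCM == 3 64-bit bitmaps.
 * Since 168 == 64 + 64 + 40, PC_FREE0 and PC_FREE1 are all ones while
 * PC_FREE2 sets only the low 40 bits.
 */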
2644
2645#ifdef PV_STATS
2646static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2647
2648SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2649	"Current number of pv entry chunks");
2650SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2651	"Current number of pv entry chunks allocated");
2652SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2653	"Current number of pv entry chunk frees");
2654SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2655	"Number of times tried to get a chunk page but failed.");
2656
2657static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2658static int pv_entry_spare;
2659
2660SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2661	"Current number of pv entry frees");
2662SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2663	"Current number of pv entry allocs");
2664SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2665	"Current number of pv entries");
2666SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2667	"Current number of spare pv entries");
2668#endif
2669
2670/*
2671 * We are in a serious low memory condition.  Resort to
2672 * drastic measures to free some pages so we can allocate
2673 * another pv entry chunk.  We do not, however, unmap 2mpages,
2674 * because subsequent accesses would allocate per-page pv entries
2675 * until repromotion occurs, thereby exacerbating the shortage
2676 * of free pv entries.
2677 *
2678 * Returns NULL if PV entries were reclaimed from the specified
2679 * pmap.
2680 */
2681static vm_page_t
2682reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2683{
2684	struct pch new_tail;
2685	struct pv_chunk *pc;
2686	struct md_page *pvh;
2687	pd_entry_t *pde;
2688	pmap_t pmap;
2689	pt_entry_t *pte, tpte;
2690	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
2691	pv_entry_t pv;
2692	vm_offset_t va;
2693	vm_page_t m, m_pc;
2694	struct spglist free;
2695	uint64_t inuse;
2696	int bit, field, freed;
2697
2698	rw_assert(&pvh_global_lock, RA_LOCKED);
2699	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2700	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
2701	pmap = NULL;
2702	m_pc = NULL;
2703	PG_G = PG_A = PG_M = PG_RW = 0;
2704	SLIST_INIT(&free);
2705	TAILQ_INIT(&new_tail);
2706	mtx_lock(&pv_chunks_mutex);
2707	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
2708		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2709		mtx_unlock(&pv_chunks_mutex);
2710		if (pmap != pc->pc_pmap) {
2711			if (pmap != NULL) {
2712				pmap_invalidate_all(pmap);
2713				if (pmap != locked_pmap)
2714					PMAP_UNLOCK(pmap);
2715			}
2716			pmap = pc->pc_pmap;
2717			/* Avoid deadlock and lock recursion. */
2718			if (pmap > locked_pmap) {
2719				RELEASE_PV_LIST_LOCK(lockp);
2720				PMAP_LOCK(pmap);
2721			} else if (pmap != locked_pmap &&
2722			    !PMAP_TRYLOCK(pmap)) {
2723				pmap = NULL;
2724				TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2725				mtx_lock(&pv_chunks_mutex);
2726				continue;
2727			}
2728			PG_G = pmap_global_bit(pmap);
2729			PG_A = pmap_accessed_bit(pmap);
2730			PG_M = pmap_modified_bit(pmap);
2731			PG_RW = pmap_rw_bit(pmap);
2732		}
2733
2734		/*
2735		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2736		 */
2737		freed = 0;
2738		for (field = 0; field < _NPCM; field++) {
2739			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2740			    inuse != 0; inuse &= ~(1UL << bit)) {
2741				bit = bsfq(inuse);
2742				pv = &pc->pc_pventry[field * 64 + bit];
2743				va = pv->pv_va;
2744				pde = pmap_pde(pmap, va);
2745				if ((*pde & PG_PS) != 0)
2746					continue;
2747				pte = pmap_pde_to_pte(pde, va);
2748				if ((*pte & PG_W) != 0)
2749					continue;
2750				tpte = pte_load_clear(pte);
2751				if ((tpte & PG_G) != 0)
2752					pmap_invalidate_page(pmap, va);
2753				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2754				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2755					vm_page_dirty(m);
2756				if ((tpte & PG_A) != 0)
2757					vm_page_aflag_set(m, PGA_REFERENCED);
2758				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2759				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2760				m->md.pv_gen++;
2761				if (TAILQ_EMPTY(&m->md.pv_list) &&
2762				    (m->flags & PG_FICTITIOUS) == 0) {
2763					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2764					if (TAILQ_EMPTY(&pvh->pv_list)) {
2765						vm_page_aflag_clear(m,
2766						    PGA_WRITEABLE);
2767					}
2768				}
2769				pc->pc_map[field] |= 1UL << bit;
2770				pmap_unuse_pt(pmap, va, *pde, &free);
2771				freed++;
2772			}
2773		}
2774		if (freed == 0) {
2775			TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2776			mtx_lock(&pv_chunks_mutex);
2777			continue;
2778		}
2779		/* Every freed mapping is for a 4 KB page. */
2780		pmap_resident_count_dec(pmap, freed);
2781		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
2782		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
2783		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
2784		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2785		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
2786		    pc->pc_map[2] == PC_FREE2) {
2787			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2788			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2789			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2790			/* Entire chunk is free; return it. */
2791			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2792			dump_drop_page(m_pc->phys_addr);
2793			mtx_lock(&pv_chunks_mutex);
2794			break;
2795		}
2796		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2797		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2798		mtx_lock(&pv_chunks_mutex);
2799		/* One freed pv entry in locked_pmap is sufficient. */
2800		if (pmap == locked_pmap)
2801			break;
2802	}
2803	TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2804	mtx_unlock(&pv_chunks_mutex);
2805	if (pmap != NULL) {
2806		pmap_invalidate_all(pmap);
2807		if (pmap != locked_pmap)
2808			PMAP_UNLOCK(pmap);
2809	}
2810	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
2811		m_pc = SLIST_FIRST(&free);
2812		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2813		/* Recycle a freed page table page. */
2814		m_pc->wire_count = 1;
2815		atomic_add_int(&cnt.v_wire_count, 1);
2816	}
2817	pmap_free_zero_pages(&free);
2818	return (m_pc);
2819}
2820
2821/*
2822 * free the pv_entry back to the free list
2823 */
2824static void
2825free_pv_entry(pmap_t pmap, pv_entry_t pv)
2826{
2827	struct pv_chunk *pc;
2828	int idx, field, bit;
2829
2830	rw_assert(&pvh_global_lock, RA_LOCKED);
2831	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2832	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2833	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2834	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2835	pc = pv_to_chunk(pv);
2836	idx = pv - &pc->pc_pventry[0];
2837	field = idx / 64;
2838	bit = idx % 64;
2839	pc->pc_map[field] |= 1ul << bit;
2840	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
2841	    pc->pc_map[2] != PC_FREE2) {
2842		/* 98% of the time, pc is already at the head of the list. */
2843		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2844			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2845			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2846		}
2847		return;
2848	}
2849	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2850	free_pv_chunk(pc);
2851}
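
/*
 * Editorial note (not part of the original source): the index
 * arithmetic above maps a pv entry back to its bit in the chunk's
 * bitmaps.  For example, the entry at idx == 69 lives in pc_map[1]
 * (69 / 64 == 1), bit 5 (69 % 64 == 5); setting that bit marks the
 * entry free again.
 */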
2852
2853static void
2854free_pv_chunk(struct pv_chunk *pc)
2855{
2856	vm_page_t m;
2857
2858	mtx_lock(&pv_chunks_mutex);
2859 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2860	mtx_unlock(&pv_chunks_mutex);
2861	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2862	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2863	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2864	/* entire chunk is free, return it */
2865	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2866	dump_drop_page(m->phys_addr);
2867	vm_page_unwire(m, 0);
2868	vm_page_free(m);
2869}
2870
2871/*
2872 * Returns a new PV entry, allocating a new PV chunk from the system when
2873 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
2874 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
2875 * returned.
2876 *
2877 * The given PV list lock may be released.
2878 */
2879static pv_entry_t
2880get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2881{
2882	int bit, field;
2883	pv_entry_t pv;
2884	struct pv_chunk *pc;
2885	vm_page_t m;
2886
2887	rw_assert(&pvh_global_lock, RA_LOCKED);
2888	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2889	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2890retry:
2891	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2892	if (pc != NULL) {
2893		for (field = 0; field < _NPCM; field++) {
2894			if (pc->pc_map[field]) {
2895				bit = bsfq(pc->pc_map[field]);
2896				break;
2897			}
2898		}
2899		if (field < _NPCM) {
2900			pv = &pc->pc_pventry[field * 64 + bit];
2901			pc->pc_map[field] &= ~(1ul << bit);
2902			/* If this was the last item, move it to tail */
2903			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
2904			    pc->pc_map[2] == 0) {
2905				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2906				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2907				    pc_list);
2908			}
2909			PV_STAT(atomic_add_long(&pv_entry_count, 1));
2910			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2911			return (pv);
2912		}
2913	}
2914	/* No free items, allocate another chunk */
2915	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2916	    VM_ALLOC_WIRED);
2917	if (m == NULL) {
2918		if (lockp == NULL) {
2919			PV_STAT(pc_chunk_tryfail++);
2920			return (NULL);
2921		}
2922		m = reclaim_pv_chunk(pmap, lockp);
2923		if (m == NULL)
2924			goto retry;
2925	}
2926	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2927	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2928	dump_add_page(m->phys_addr);
2929	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2930	pc->pc_pmap = pmap;
2931	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
2932	pc->pc_map[1] = PC_FREE1;
2933	pc->pc_map[2] = PC_FREE2;
2934	mtx_lock(&pv_chunks_mutex);
2935	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2936	mtx_unlock(&pv_chunks_mutex);
2937	pv = &pc->pc_pventry[0];
2938	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2939	PV_STAT(atomic_add_long(&pv_entry_count, 1));
2940	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2941	return (pv);
2942}
2943
2944/*
2945 * Returns the number of one bits within the given PV chunk map element.
2946 */
2947static int
2948popcnt_pc_map_elem(uint64_t elem)
2949{
2950	int count;
2951
2952	/*
2953	 * This simple method of counting the one bits performs well because
2954	 * the given element typically contains more zero bits than one bits.
2955	 */
2956	count = 0;
2957	for (; elem != 0; elem &= elem - 1)
2958		count++;
2959	return (count);
2960}
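
/*
 * Editorial note (not part of the original source): the loop above is
 * the classic "clear the lowest set bit" population count; for example,
 * elem == 0x14 takes exactly two iterations (0x14 -> 0x10 -> 0).  Its
 * cost is proportional to the number of one bits, which is why it works
 * well when most bits are zero.
 */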
2961
2962/*
2963 * Ensure that the number of spare PV entries in the specified pmap meets or
2964 * exceeds the given count, "needed".
2965 *
2966 * The given PV list lock may be released.
2967 */
2968static void
2969reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2970{
2971	struct pch new_tail;
2972	struct pv_chunk *pc;
2973	int avail, free;
2974	vm_page_t m;
2975
2976	rw_assert(&pvh_global_lock, RA_LOCKED);
2977	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2978	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2979
2980	/*
2981	 * Newly allocated PV chunks must be stored in a private list until
2982	 * the required number of PV chunks have been allocated.  Otherwise,
2983	 * reclaim_pv_chunk() could recycle one of these chunks.  In contrast,
2984	 * these chunks must be added to the pmap's own chunk list upon allocation.
2985	 */
2986	TAILQ_INIT(&new_tail);
2987retry:
2988	avail = 0;
2989	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
2990		if ((cpu_feature2 & CPUID2_POPCNT) == 0) {
2991			free = popcnt_pc_map_elem(pc->pc_map[0]);
2992			free += popcnt_pc_map_elem(pc->pc_map[1]);
2993			free += popcnt_pc_map_elem(pc->pc_map[2]);
2994		} else {
2995			free = popcntq(pc->pc_map[0]);
2996			free += popcntq(pc->pc_map[1]);
2997			free += popcntq(pc->pc_map[2]);
2998		}
2999		if (free == 0)
3000			break;
3001		avail += free;
3002		if (avail >= needed)
3003			break;
3004	}
3005	for (; avail < needed; avail += _NPCPV) {
3006		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3007		    VM_ALLOC_WIRED);
3008		if (m == NULL) {
3009			m = reclaim_pv_chunk(pmap, lockp);
3010			if (m == NULL)
3011				goto retry;
3012		}
3013		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3014		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3015		dump_add_page(m->phys_addr);
3016		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3017		pc->pc_pmap = pmap;
3018		pc->pc_map[0] = PC_FREE0;
3019		pc->pc_map[1] = PC_FREE1;
3020		pc->pc_map[2] = PC_FREE2;
3021		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3022		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
3023		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3024	}
3025	if (!TAILQ_EMPTY(&new_tail)) {
3026		mtx_lock(&pv_chunks_mutex);
3027		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
3028		mtx_unlock(&pv_chunks_mutex);
3029	}
3030}
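
/*
 * Editorial note (not part of the original source): callers size the
 * reservation to the worst case they may need while holding the pmap
 * lock; for example, pmap_demote_pde_locked() below reserves
 * NPTEPG - 1 == 511 spare entries before demoting a managed 2MB
 * mapping, so that pmap_pv_demote_pde() cannot fail.
 */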
3031
3032/*
3033 * First find and then remove the pv entry for the specified pmap and virtual
3034 * address from the specified pv list.  Returns the pv entry if found and NULL
3035 * otherwise.  This operation can be performed on pv lists for either 4KB or
3036 * 2MB page mappings.
3037 */
3038static __inline pv_entry_t
3039pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3040{
3041	pv_entry_t pv;
3042
3043	rw_assert(&pvh_global_lock, RA_LOCKED);
3044	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3045		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3046			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3047			pvh->pv_gen++;
3048			break;
3049		}
3050	}
3051	return (pv);
3052}
3053
3054/*
3055 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3056 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3057 * entries for each of the 4KB page mappings.
3058 */
3059static void
3060pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3061    struct rwlock **lockp)
3062{
3063	struct md_page *pvh;
3064	struct pv_chunk *pc;
3065	pv_entry_t pv;
3066	vm_offset_t va_last;
3067	vm_page_t m;
3068	int bit, field;
3069
3070	rw_assert(&pvh_global_lock, RA_LOCKED);
3071	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3072	KASSERT((pa & PDRMASK) == 0,
3073	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
3074	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3075
3076	/*
3077	 * Transfer the 2mpage's pv entry for this mapping to the first
3078	 * page's pv list.  Once this transfer begins, the pv list lock
3079	 * must not be released until the last pv entry is reinstantiated.
3080	 */
3081	pvh = pa_to_pvh(pa);
3082	va = trunc_2mpage(va);
3083	pv = pmap_pvh_remove(pvh, pmap, va);
3084	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
3085	m = PHYS_TO_VM_PAGE(pa);
3086	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3087	m->md.pv_gen++;
3088	/* Instantiate the remaining NPTEPG - 1 pv entries. */
3089	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
3090	va_last = va + NBPDR - PAGE_SIZE;
3091	for (;;) {
3092		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3093		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
3094		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
3095		for (field = 0; field < _NPCM; field++) {
3096			while (pc->pc_map[field]) {
3097				bit = bsfq(pc->pc_map[field]);
3098				pc->pc_map[field] &= ~(1ul << bit);
3099				pv = &pc->pc_pventry[field * 64 + bit];
3100				va += PAGE_SIZE;
3101				pv->pv_va = va;
3102				m++;
3103				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3104			    ("pmap_pv_demote_pde: page %p is not managed", m));
3105				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3106				m->md.pv_gen++;
3107				if (va == va_last)
3108					goto out;
3109			}
3110		}
3111		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3112		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3113	}
3114out:
3115	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
3116		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3117		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3118	}
3119	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
3120	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
3121}
3122
3123/*
3124 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
3125 * replace the many pv entries for the 4KB page mappings by a single pv entry
3126 * for the 2MB page mapping.
3127 */
3128static void
3129pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3130    struct rwlock **lockp)
3131{
3132	struct md_page *pvh;
3133	pv_entry_t pv;
3134	vm_offset_t va_last;
3135	vm_page_t m;
3136
3137	rw_assert(&pvh_global_lock, RA_LOCKED);
3138	KASSERT((pa & PDRMASK) == 0,
3139	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
3140	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3141
3142	/*
3143	 * Transfer the first page's pv entry for this mapping to the 2mpage's
3144	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
3145	 * a transfer avoids the possibility that get_pv_entry() calls
3146	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
3147	 * mappings that is being promoted.
3148	 */
3149	m = PHYS_TO_VM_PAGE(pa);
3150	va = trunc_2mpage(va);
3151	pv = pmap_pvh_remove(&m->md, pmap, va);
3152	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
3153	pvh = pa_to_pvh(pa);
3154	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3155	pvh->pv_gen++;
3156	/* Free the remaining NPTEPG - 1 pv entries. */
3157	va_last = va + NBPDR - PAGE_SIZE;
3158	do {
3159		m++;
3160		va += PAGE_SIZE;
3161		pmap_pvh_free(&m->md, pmap, va);
3162	} while (va < va_last);
3163}
3164
3165/*
3166 * First find and then destroy the pv entry for the specified pmap and virtual
3167 * address.  This operation can be performed on pv lists for either 4KB or 2MB
3168 * page mappings.
3169 */
3170static void
3171pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3172{
3173	pv_entry_t pv;
3174
3175	pv = pmap_pvh_remove(pvh, pmap, va);
3176	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3177	free_pv_entry(pmap, pv);
3178}
3179
3180/*
3181 * Conditionally create the PV entry for a 4KB page mapping if the required
3182 * memory can be allocated without resorting to reclamation.
3183 */
3184static boolean_t
3185pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3186    struct rwlock **lockp)
3187{
3188	pv_entry_t pv;
3189
3190	rw_assert(&pvh_global_lock, RA_LOCKED);
3191	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3192	/* Pass NULL instead of the lock pointer to disable reclamation. */
3193	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3194		pv->pv_va = va;
3195		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3196		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3197		m->md.pv_gen++;
3198		return (TRUE);
3199	} else
3200		return (FALSE);
3201}
3202
3203/*
3204 * Conditionally create the PV entry for a 2MB page mapping if the required
3205 * memory can be allocated without resorting to reclamation.
3206 */
3207static boolean_t
3208pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3209    struct rwlock **lockp)
3210{
3211	struct md_page *pvh;
3212	pv_entry_t pv;
3213
3214	rw_assert(&pvh_global_lock, RA_LOCKED);
3215	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3216	/* Pass NULL instead of the lock pointer to disable reclamation. */
3217	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3218		pv->pv_va = va;
3219		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3220		pvh = pa_to_pvh(pa);
3221		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3222		pvh->pv_gen++;
3223		return (TRUE);
3224	} else
3225		return (FALSE);
3226}
3227
3228/*
3229 * Fills a page table page with mappings to consecutive physical pages.
3230 */
3231static void
3232pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
3233{
3234	pt_entry_t *pte;
3235
3236	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
3237		*pte = newpte;
3238		newpte += PAGE_SIZE;
3239	}
3240}
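
/*
 * Editorial note (not part of the original source): pmap_fill_ptp()
 * writes all NPTEPG (512) entries of the new page table page, advancing
 * the physical address by PAGE_SIZE each time, so the 512 resulting
 * 4 KB mappings cover exactly the 2 MB that the superpage being demoted
 * used to map.
 */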
3241
3242/*
3243 * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
3244 * mapping is invalidated.
3245 */
3246static boolean_t
3247pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3248{
3249	struct rwlock *lock;
3250	boolean_t rv;
3251
3252	lock = NULL;
3253	rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
3254	if (lock != NULL)
3255		rw_wunlock(lock);
3256	return (rv);
3257}
3258
3259static boolean_t
3260pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3261    struct rwlock **lockp)
3262{
3263	pd_entry_t newpde, oldpde;
3264	pt_entry_t *firstpte, newpte;
3265	pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
3266	vm_paddr_t mptepa;
3267	vm_page_t mpte;
3268	struct spglist free;
3269	int PG_PTE_CACHE;
3270
3271	PG_G = pmap_global_bit(pmap);
3272	PG_A = pmap_accessed_bit(pmap);
3273	PG_M = pmap_modified_bit(pmap);
3274	PG_RW = pmap_rw_bit(pmap);
3275	PG_V = pmap_valid_bit(pmap);
3276	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
3277
3278	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3279	oldpde = *pde;
3280	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
3281	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
3282	if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
3283	    NULL)
3284		pmap_remove_pt_page(pmap, mpte);
3285	else {
3286		KASSERT((oldpde & PG_W) == 0,
3287		    ("pmap_demote_pde: page table page for a wired mapping"
3288		    " is missing"));
3289
3290		/*
3291		 * Invalidate the 2MB page mapping and return "failure" if the
3292		 * mapping was never accessed or the allocation of the new
3293		 * page table page fails.  If the 2MB page mapping belongs to
3294		 * the direct map region of the kernel's address space, then
3295		 * the page allocation request specifies the highest possible
3296		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
3297		 * normal.  Page table pages are preallocated for every other
3298		 * part of the kernel address space, so the direct map region
3299		 * is the only part of the kernel address space that must be
3300		 * handled here.
3301		 */
3302		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
3303		    pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
3304		    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
3305		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
3306			SLIST_INIT(&free);
3307			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
3308			    lockp);
3309			pmap_invalidate_page(pmap, trunc_2mpage(va));
3310			pmap_free_zero_pages(&free);
3311			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
3312			    " in pmap %p", va, pmap);
3313			return (FALSE);
3314		}
3315		if (va < VM_MAXUSER_ADDRESS)
3316			pmap_resident_count_inc(pmap, 1);
3317	}
3318	mptepa = VM_PAGE_TO_PHYS(mpte);
3319	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
3320	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
3321	KASSERT((oldpde & PG_A) != 0,
3322	    ("pmap_demote_pde: oldpde is missing PG_A"));
3323	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
3324	    ("pmap_demote_pde: oldpde is missing PG_M"));
3325	newpte = oldpde & ~PG_PS;
3326	newpte = pmap_swap_pat(pmap, newpte);
3327
3328	/*
3329	 * If the page table page is new, initialize it.
3330	 */
3331	if (mpte->wire_count == 1) {
3332		mpte->wire_count = NPTEPG;
3333		pmap_fill_ptp(firstpte, newpte);
3334	}
3335	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
3336	    ("pmap_demote_pde: firstpte and newpte map different physical"
3337	    " addresses"));
3338
3339	/*
3340	 * If the mapping has changed attributes, update the page table
3341	 * entries.
3342	 */
3343	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
3344		pmap_fill_ptp(firstpte, newpte);
3345
3346	/*
3347	 * The spare PV entries must be reserved prior to demoting the
3348	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
3349	 * of the PDE and the PV lists will be inconsistent, which can result
3350	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
3351	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
3352	 * PV entry for the 2MB page mapping that is being demoted.
3353	 */
3354	if ((oldpde & PG_MANAGED) != 0)
3355		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
3356
3357	/*
3358	 * Demote the mapping.  This pmap is locked.  The old PDE has
3359	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
3360	 * set.  Thus, there is no danger of a race with another
3361	 * processor changing the setting of PG_A and/or PG_M between
3362	 * the read above and the store below.
3363	 */
3364	if (workaround_erratum383)
3365		pmap_update_pde(pmap, va, pde, newpde);
3366	else
3367		pde_store(pde, newpde);
3368
3369	/*
3370	 * Invalidate a stale recursive mapping of the page table page.
3371	 */
3372	if (va >= VM_MAXUSER_ADDRESS)
3373		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3374
3375	/*
3376	 * Demote the PV entry.
3377	 */
3378	if ((oldpde & PG_MANAGED) != 0)
3379		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
3380
3381	atomic_add_long(&pmap_pde_demotions, 1);
3382	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
3383	    " in pmap %p", va, pmap);
3384	return (TRUE);
3385}
3386
3387/*
3388 * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
3389 */
3390static void
3391pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3392{
3393	pd_entry_t newpde;
3394	vm_paddr_t mptepa;
3395	vm_page_t mpte;
3396
3397	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3398	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3399	mpte = pmap_lookup_pt_page(pmap, va);
3400	if (mpte == NULL)
3401		panic("pmap_remove_kernel_pde: Missing pt page.");
3402
3403	pmap_remove_pt_page(pmap, mpte);
3404	mptepa = VM_PAGE_TO_PHYS(mpte);
3405	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
3406
3407	/*
3408	 * Initialize the page table page.
3409	 */
3410	pagezero((void *)PHYS_TO_DMAP(mptepa));
3411
3412	/*
3413	 * Demote the mapping.
3414	 */
3415	if (workaround_erratum383)
3416		pmap_update_pde(pmap, va, pde, newpde);
3417	else
3418		pde_store(pde, newpde);
3419
3420	/*
3421	 * Invalidate a stale recursive mapping of the page table page.
3422	 */
3423	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3424}
3425
3426/*
3427 * pmap_remove_pde: unmap a 2MB superpage from a process's address space
3428 */
3429static int
3430pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
3431    struct spglist *free, struct rwlock **lockp)
3432{
3433	struct md_page *pvh;
3434	pd_entry_t oldpde;
3435	vm_offset_t eva, va;
3436	vm_page_t m, mpte;
3437	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
3438
3439	PG_G = pmap_global_bit(pmap);
3440	PG_A = pmap_accessed_bit(pmap);
3441	PG_M = pmap_modified_bit(pmap);
3442	PG_RW = pmap_rw_bit(pmap);
3443
3444	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3445	KASSERT((sva & PDRMASK) == 0,
3446	    ("pmap_remove_pde: sva is not 2mpage aligned"));
3447	oldpde = pte_load_clear(pdq);
3448	if (oldpde & PG_W)
3449		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
3450
3451	/*
3452	 * Machines that don't support invlpg also don't support
3453	 * PG_G.
3454	 */
3455	if (oldpde & PG_G)
3456		pmap_invalidate_page(kernel_pmap, sva);
3457	pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
3458	if (oldpde & PG_MANAGED) {
3459		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
3460		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
3461		pmap_pvh_free(pvh, pmap, sva);
3462		eva = sva + NBPDR;
3463		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3464		    va < eva; va += PAGE_SIZE, m++) {
3465			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3466				vm_page_dirty(m);
3467			if (oldpde & PG_A)
3468				vm_page_aflag_set(m, PGA_REFERENCED);
3469			if (TAILQ_EMPTY(&m->md.pv_list) &&
3470			    TAILQ_EMPTY(&pvh->pv_list))
3471				vm_page_aflag_clear(m, PGA_WRITEABLE);
3472		}
3473	}
3474	if (pmap == kernel_pmap) {
3475		pmap_remove_kernel_pde(pmap, pdq, sva);
3476	} else {
3477		mpte = pmap_lookup_pt_page(pmap, sva);
3478		if (mpte != NULL) {
3479			pmap_remove_pt_page(pmap, mpte);
3480			pmap_resident_count_dec(pmap, 1);
3481			KASSERT(mpte->wire_count == NPTEPG,
3482			    ("pmap_remove_pde: pte page wire count error"));
3483			mpte->wire_count = 0;
3484			pmap_add_delayed_free_list(mpte, free, FALSE);
3485			atomic_subtract_int(&cnt.v_wire_count, 1);
3486		}
3487	}
3488	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
3489}
3490
3491/*
3492 * pmap_remove_pte: do the things to unmap a page in a process
3493 * pmap_remove_pte: unmap a single 4KB page from a process's address space
3494static int
3495pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
3496    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
3497{
3498	struct md_page *pvh;
3499	pt_entry_t oldpte, PG_A, PG_M, PG_RW;
3500	vm_page_t m;
3501
3502	PG_A = pmap_accessed_bit(pmap);
3503	PG_M = pmap_modified_bit(pmap);
3504	PG_RW = pmap_rw_bit(pmap);
3505
3506	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3507	oldpte = pte_load_clear(ptq);
3508	if (oldpte & PG_W)
3509		pmap->pm_stats.wired_count -= 1;
3510	pmap_resident_count_dec(pmap, 1);
3511	if (oldpte & PG_MANAGED) {
3512		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
3513		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3514			vm_page_dirty(m);
3515		if (oldpte & PG_A)
3516			vm_page_aflag_set(m, PGA_REFERENCED);
3517		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3518		pmap_pvh_free(&m->md, pmap, va);
3519		if (TAILQ_EMPTY(&m->md.pv_list) &&
3520		    (m->flags & PG_FICTITIOUS) == 0) {
3521			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3522			if (TAILQ_EMPTY(&pvh->pv_list))
3523				vm_page_aflag_clear(m, PGA_WRITEABLE);
3524		}
3525	}
3526	return (pmap_unuse_pt(pmap, va, ptepde, free));
3527}
3528
3529/*
3530 * Remove a single page from a process address space
3531 */
3532static void
3533pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
3534    struct spglist *free)
3535{
3536	struct rwlock *lock;
3537	pt_entry_t *pte, PG_V;
3538
3539	PG_V = pmap_valid_bit(pmap);
3540	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3541	if ((*pde & PG_V) == 0)
3542		return;
3543	pte = pmap_pde_to_pte(pde, va);
3544	if ((*pte & PG_V) == 0)
3545		return;
3546	lock = NULL;
3547	pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
3548	if (lock != NULL)
3549		rw_wunlock(lock);
3550	pmap_invalidate_page(pmap, va);
3551}
3552
3553/*
3554 *	Remove the given range of addresses from the specified map.
3555 *
3556 *	It is assumed that the start and end are properly
3557 *	rounded to the page size.
3558 */
3559void
3560pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3561{
3562	struct rwlock *lock;
3563	vm_offset_t va, va_next;
3564	pml4_entry_t *pml4e;
3565	pdp_entry_t *pdpe;
3566	pd_entry_t ptpaddr, *pde;
3567	pt_entry_t *pte, PG_G, PG_V;
3568	struct spglist free;
3569	int anyvalid;
3570
3571	PG_G = pmap_global_bit(pmap);
3572	PG_V = pmap_valid_bit(pmap);
3573
3574	/*
3575	 * Perform an unsynchronized read.  This is, however, safe.
3576	 */
3577	if (pmap->pm_stats.resident_count == 0)
3578		return;
3579
3580	anyvalid = 0;
3581	SLIST_INIT(&free);
3582
3583	rw_rlock(&pvh_global_lock);
3584	PMAP_LOCK(pmap);
3585
3586	/*
3587	 * Special handling for removing a single page: it is a very
3588	 * common operation, and we can easily short-circuit some
3589	 * code for it.
3590	 */
3591	if (sva + PAGE_SIZE == eva) {
3592		pde = pmap_pde(pmap, sva);
3593		if (pde && (*pde & PG_PS) == 0) {
3594			pmap_remove_page(pmap, sva, pde, &free);
3595			goto out;
3596		}
3597	}
3598
3599	lock = NULL;
3600	for (; sva < eva; sva = va_next) {
3601
3602		if (pmap->pm_stats.resident_count == 0)
3603			break;
3604
3605		pml4e = pmap_pml4e(pmap, sva);
3606		if ((*pml4e & PG_V) == 0) {
3607			va_next = (sva + NBPML4) & ~PML4MASK;
3608			if (va_next < sva)
3609				va_next = eva;
3610			continue;
3611		}
3612
3613		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3614		if ((*pdpe & PG_V) == 0) {
3615			va_next = (sva + NBPDP) & ~PDPMASK;
3616			if (va_next < sva)
3617				va_next = eva;
3618			continue;
3619		}
3620
3621		/*
3622		 * Calculate index for next page table.
3623		 */
3624		va_next = (sva + NBPDR) & ~PDRMASK;
3625		if (va_next < sva)
3626			va_next = eva;
3627
3628		pde = pmap_pdpe_to_pde(pdpe, sva);
3629		ptpaddr = *pde;
3630
3631		/*
3632		 * Weed out invalid mappings.
3633		 */
3634		if (ptpaddr == 0)
3635			continue;
3636
3637		/*
3638		 * Check for large page.
3639		 */
3640		if ((ptpaddr & PG_PS) != 0) {
3641			/*
3642			 * Are we removing the entire large page?  If not,
3643			 * demote the mapping and fall through.
3644			 */
3645			if (sva + NBPDR == va_next && eva >= va_next) {
3646				/*
3647				 * The TLB entry for a PG_G mapping is
3648				 * invalidated by pmap_remove_pde().
3649				 */
3650				if ((ptpaddr & PG_G) == 0)
3651					anyvalid = 1;
3652				pmap_remove_pde(pmap, pde, sva, &free, &lock);
3653				continue;
3654			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
3655			    &lock)) {
3656				/* The large page mapping was destroyed. */
3657				continue;
3658			} else
3659				ptpaddr = *pde;
3660		}
3661
3662		/*
3663		 * Limit our scan to either the end of the va represented
3664		 * by the current page table page, or to the end of the
3665		 * range being removed.
3666		 */
3667		if (va_next > eva)
3668			va_next = eva;
3669
3670		va = va_next;
3671		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3672		    sva += PAGE_SIZE) {
3673			if (*pte == 0) {
3674				if (va != va_next) {
3675					pmap_invalidate_range(pmap, va, sva);
3676					va = va_next;
3677				}
3678				continue;
3679			}
3680			if ((*pte & PG_G) == 0)
3681				anyvalid = 1;
3682			else if (va == va_next)
3683				va = sva;
3684			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
3685			    &lock)) {
3686				sva += PAGE_SIZE;
3687				break;
3688			}
3689		}
3690		if (va != va_next)
3691			pmap_invalidate_range(pmap, va, sva);
3692	}
3693	if (lock != NULL)
3694		rw_wunlock(lock);
3695out:
3696	if (anyvalid)
3697		pmap_invalidate_all(pmap);
3698	rw_runlock(&pvh_global_lock);
3699	PMAP_UNLOCK(pmap);
3700	pmap_free_zero_pages(&free);
3701}
3702
3703/*
3704 *	Routine:	pmap_remove_all
3705 *	Function:
3706 *		Removes this physical page from
3707 *		all physical maps in which it resides.
3708 *		Reflects back modify bits to the pager.
3709 *
3710 *	Notes:
3711 *		Original versions of this routine were very
3712 *		inefficient because they iteratively called
3713 *		pmap_remove(), which is slow.
3714 */
3715
3716void
3717pmap_remove_all(vm_page_t m)
3718{
3719	struct md_page *pvh;
3720	pv_entry_t pv;
3721	pmap_t pmap;
3722	pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
3723	pd_entry_t *pde;
3724	vm_offset_t va;
3725	struct spglist free;
3726
3727	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3728	    ("pmap_remove_all: page %p is not managed", m));
3729	SLIST_INIT(&free);
3730	rw_wlock(&pvh_global_lock);
3731	if ((m->flags & PG_FICTITIOUS) != 0)
3732		goto small_mappings;
3733	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
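	/*
	 * Demote every 2MB page mapping of the page first, so that only
	 * 4KB page mappings remain to be destroyed below.
	 */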
3734	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3735		pmap = PV_PMAP(pv);
3736		PMAP_LOCK(pmap);
3737		va = pv->pv_va;
3738		pde = pmap_pde(pmap, va);
3739		(void)pmap_demote_pde(pmap, pde, va);
3740		PMAP_UNLOCK(pmap);
3741	}
3742small_mappings:
3743	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3744		pmap = PV_PMAP(pv);
3745		PMAP_LOCK(pmap);
3746		PG_A = pmap_accessed_bit(pmap);
3747		PG_M = pmap_modified_bit(pmap);
3748		PG_RW = pmap_rw_bit(pmap);
3749		pmap_resident_count_dec(pmap, 1);
3750		pde = pmap_pde(pmap, pv->pv_va);
3751		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3752		    " a 2mpage in page %p's pv list", m));
3753		pte = pmap_pde_to_pte(pde, pv->pv_va);
3754		tpte = pte_load_clear(pte);
3755		if (tpte & PG_W)
3756			pmap->pm_stats.wired_count--;
3757		if (tpte & PG_A)
3758			vm_page_aflag_set(m, PGA_REFERENCED);
3759
3760		/*
3761		 * Update the vm_page_t clean and reference bits.
3762		 */
3763		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3764			vm_page_dirty(m);
3765		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
3766		pmap_invalidate_page(pmap, pv->pv_va);
3767		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3768		m->md.pv_gen++;
3769		free_pv_entry(pmap, pv);
3770		PMAP_UNLOCK(pmap);
3771	}
3772	vm_page_aflag_clear(m, PGA_WRITEABLE);
3773	rw_wunlock(&pvh_global_lock);
3774	pmap_free_zero_pages(&free);
3775}
3776
3777/*
3778 * pmap_protect_pde: apply the requested protection to a 2MB page mapping.
3779 */
3780static boolean_t
3781pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3782{
3783	pd_entry_t newpde, oldpde;
3784	vm_offset_t eva, va;
3785	vm_page_t m;
3786	boolean_t anychanged;
3787	pt_entry_t PG_G, PG_M, PG_RW;
3788
3789	PG_G = pmap_global_bit(pmap);
3790	PG_M = pmap_modified_bit(pmap);
3791	PG_RW = pmap_rw_bit(pmap);
3792
3793	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3794	KASSERT((sva & PDRMASK) == 0,
3795	    ("pmap_protect_pde: sva is not 2mpage aligned"));
3796	anychanged = FALSE;
3797retry:
3798	oldpde = newpde = *pde;
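	/*
	 * If the 2MB page mapping is managed, writable, and dirty, mark
	 * each of its constituent 4KB pages dirty, since PG_M may be
	 * cleared below.
	 */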
3799	if (oldpde & PG_MANAGED) {
3800		eva = sva + NBPDR;
3801		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3802		    va < eva; va += PAGE_SIZE, m++)
3803			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3804				vm_page_dirty(m);
3805	}
3806	if ((prot & VM_PROT_WRITE) == 0)
3807		newpde &= ~(PG_RW | PG_M);
3808	if ((prot & VM_PROT_EXECUTE) == 0)
3809		newpde |= pg_nx;
3810	if (newpde != oldpde) {
3811		if (!atomic_cmpset_long(pde, oldpde, newpde))
3812			goto retry;
3813		if (oldpde & PG_G)
3814			pmap_invalidate_page(pmap, sva);
3815		else
3816			anychanged = TRUE;
3817	}
3818	return (anychanged);
3819}
3820
3821/*
3822 *	Set the physical protection on the
3823 *	specified range of this map as requested.
3824 */
3825void
3826pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3827{
3828	vm_offset_t va_next;
3829	pml4_entry_t *pml4e;
3830	pdp_entry_t *pdpe;
3831	pd_entry_t ptpaddr, *pde;
3832	pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
3833	boolean_t anychanged, pv_lists_locked;
3834
3835	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
3836		pmap_remove(pmap, sva, eva);
3837		return;
3838	}
3839
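	/*
	 * If neither write nor execute permission is being removed, there
	 * is nothing for this function to do.
	 */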
3840	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3841	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3842		return;
3843
3844	PG_G = pmap_global_bit(pmap);
3845	PG_M = pmap_modified_bit(pmap);
3846	PG_V = pmap_valid_bit(pmap);
3847	PG_RW = pmap_rw_bit(pmap);
3848	pv_lists_locked = FALSE;
3849resume:
3850	anychanged = FALSE;
3851
3852	PMAP_LOCK(pmap);
3853	for (; sva < eva; sva = va_next) {
3854
3855		pml4e = pmap_pml4e(pmap, sva);
3856		if ((*pml4e & PG_V) == 0) {
3857			va_next = (sva + NBPML4) & ~PML4MASK;
3858			if (va_next < sva)
3859				va_next = eva;
3860			continue;
3861		}
3862
3863		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3864		if ((*pdpe & PG_V) == 0) {
3865			va_next = (sva + NBPDP) & ~PDPMASK;
3866			if (va_next < sva)
3867				va_next = eva;
3868			continue;
3869		}
3870
3871		va_next = (sva + NBPDR) & ~PDRMASK;
3872		if (va_next < sva)
3873			va_next = eva;
3874
3875		pde = pmap_pdpe_to_pde(pdpe, sva);
3876		ptpaddr = *pde;
3877
3878		/*
3879		 * Weed out invalid mappings.
3880		 */
3881		if (ptpaddr == 0)
3882			continue;
3883
3884		/*
3885		 * Check for large page.
3886		 */
3887		if ((ptpaddr & PG_PS) != 0) {
3888			/*
3889			 * Are we protecting the entire large page?  If not,
3890			 * demote the mapping and fall through.
3891			 */
3892			if (sva + NBPDR == va_next && eva >= va_next) {
3893				/*
3894				 * The TLB entry for a PG_G mapping is
3895				 * invalidated by pmap_protect_pde().
3896				 */
3897				if (pmap_protect_pde(pmap, pde, sva, prot))
3898					anychanged = TRUE;
3899				continue;
3900			} else {
3901				if (!pv_lists_locked) {
3902					pv_lists_locked = TRUE;
3903					if (!rw_try_rlock(&pvh_global_lock)) {
3904						if (anychanged)
3905							pmap_invalidate_all(
3906							    pmap);
3907						PMAP_UNLOCK(pmap);
3908						rw_rlock(&pvh_global_lock);
3909						goto resume;
3910					}
3911				}
3912				if (!pmap_demote_pde(pmap, pde, sva)) {
3913					/*
3914					 * The large page mapping was
3915					 * destroyed.
3916					 */
3917					continue;
3918				}
3919			}
3920		}
3921
3922		if (va_next > eva)
3923			va_next = eva;
3924
3925		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3926		    sva += PAGE_SIZE) {
3927			pt_entry_t obits, pbits;
3928			vm_page_t m;
3929
3930retry:
3931			obits = pbits = *pte;
3932			if ((pbits & PG_V) == 0)
3933				continue;
3934
3935			if ((prot & VM_PROT_WRITE) == 0) {
3936				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3937				    (PG_MANAGED | PG_M | PG_RW)) {
3938					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3939					vm_page_dirty(m);
3940				}
3941				pbits &= ~(PG_RW | PG_M);
3942			}
3943			if ((prot & VM_PROT_EXECUTE) == 0)
3944				pbits |= pg_nx;
3945
3946			if (pbits != obits) {
3947				if (!atomic_cmpset_long(pte, obits, pbits))
3948					goto retry;
3949				if (obits & PG_G)
3950					pmap_invalidate_page(pmap, sva);
3951				else
3952					anychanged = TRUE;
3953			}
3954		}
3955	}
3956	if (anychanged)
3957		pmap_invalidate_all(pmap);
3958	if (pv_lists_locked)
3959		rw_runlock(&pvh_global_lock);
3960	PMAP_UNLOCK(pmap);
3961}
3962
3963/*
3964 * Tries to promote the 512, contiguous 4KB page mappings that are within a
3965 * single page table page (PTP) to a single 2MB page mapping.  For promotion
3966 * to occur, two conditions must be met: (1) the 4KB page mappings must map
3967 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
3968 * identical characteristics.
3969 */
3970static void
3971pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3972    struct rwlock **lockp)
3973{
3974	pd_entry_t newpde;
3975	pt_entry_t *firstpte, oldpte, pa, *pte;
3976	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
3977	vm_offset_t oldpteva;
3978	vm_page_t mpte;
3979	int PG_PTE_CACHE;
3980
3981	PG_A = pmap_accessed_bit(pmap);
3982	PG_G = pmap_global_bit(pmap);
3983	PG_M = pmap_modified_bit(pmap);
3984	PG_V = pmap_valid_bit(pmap);
3985	PG_RW = pmap_rw_bit(pmap);
3986	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
3987
3988	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3989
3990	/*
3991	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3992	 * either invalid, unused, or does not map the first 4KB physical page
3993	 * within a 2MB page.
3994	 */
3995	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
3996setpde:
3997	newpde = *firstpte;
3998	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3999		atomic_add_long(&pmap_pde_p_failures, 1);
4000		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4001		    " in pmap %p", va, pmap);
4002		return;
4003	}
4004	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
4005		/*
4006		 * When PG_M is already clear, PG_RW can be cleared without
4007		 * a TLB invalidation.
4008		 */
4009		if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
4010			goto setpde;
4011		newpde &= ~PG_RW;
4012	}
4013
4014	/*
4015	 * Examine each of the other PTEs in the specified PTP.  Abort if this
4016	 * PTE maps an unexpected 4KB physical page or does not have identical
4017	 * characteristics to the first PTE.
4018	 */
4019	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
4020	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
4021setpte:
4022		oldpte = *pte;
4023		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
4024			atomic_add_long(&pmap_pde_p_failures, 1);
4025			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4026			    " in pmap %p", va, pmap);
4027			return;
4028		}
4029		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
4030			/*
4031			 * When PG_M is already clear, PG_RW can be cleared
4032			 * without a TLB invalidation.
4033			 */
4034			if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
4035				goto setpte;
4036			oldpte &= ~PG_RW;
4037			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
4038			    (va & ~PDRMASK);
4039			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
4040			    " in pmap %p", oldpteva, pmap);
4041		}
4042		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
4043			atomic_add_long(&pmap_pde_p_failures, 1);
4044			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4045			    " in pmap %p", va, pmap);
4046			return;
4047		}
4048		pa -= PAGE_SIZE;
4049	}
4050
4051	/*
4052	 * Save the page table page in its current state until the PDE
4053	 * mapping the superpage is demoted by pmap_demote_pde() or
4054	 * destroyed by pmap_remove_pde().
4055	 */
4056	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4057	KASSERT(mpte >= vm_page_array &&
4058	    mpte < &vm_page_array[vm_page_array_size],
4059	    ("pmap_promote_pde: page table page is out of range"));
4060	KASSERT(mpte->pindex == pmap_pde_pindex(va),
4061	    ("pmap_promote_pde: page table page's pindex is wrong"));
4062	if (pmap_insert_pt_page(pmap, mpte)) {
4063		atomic_add_long(&pmap_pde_p_failures, 1);
4064		CTR2(KTR_PMAP,
4065		    "pmap_promote_pde: failure for va %#lx in pmap %p", va,
4066		    pmap);
4067		return;
4068	}
4069
4070	/*
4071	 * Promote the pv entries.
4072	 */
4073	if ((newpde & PG_MANAGED) != 0)
4074		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
4075
4076	/*
4077	 * Propagate the PAT index to its proper position.
4078	 */
4079	newpde = pmap_swap_pat(pmap, newpde);
4080
4081	/*
4082	 * Map the superpage.
4083	 */
4084	if (workaround_erratum383)
4085		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
4086	else
4087		pde_store(pde, PG_PS | newpde);
4088
4089	atomic_add_long(&pmap_pde_promotions, 1);
4090	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
4091	    " in pmap %p", va, pmap);
4092}
4093
4094/*
4095 *	Insert the given physical page (p) at
4096 *	the specified virtual address (v) in the
4097 *	target physical map with the protection requested.
4098 *
4099 *	If specified, the page will be wired down, meaning
4100 *	that the related pte cannot be reclaimed.
4101 *
4102 *	NB:  This is the only routine which MAY NOT lazy-evaluate
4103 *	or lose information.  That is, this routine must actually
4104 *	insert this page into the given map NOW.
4105 */
4106void
4107pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
4108    vm_prot_t prot, boolean_t wired)
4109{
4110	struct rwlock *lock;
4111	pd_entry_t *pde;
4112	pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
4113	pt_entry_t newpte, origpte;
4114	pv_entry_t pv;
4115	vm_paddr_t opa, pa;
4116	vm_page_t mpte, om;
4117
4118	PG_A = pmap_accessed_bit(pmap);
4119	PG_G = pmap_global_bit(pmap);
4120	PG_M = pmap_modified_bit(pmap);
4121	PG_V = pmap_valid_bit(pmap);
4122	PG_RW = pmap_rw_bit(pmap);
4123
4124	va = trunc_page(va);
4125	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
4126	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
4127	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
4128	    va));
4129	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
4130	    va >= kmi.clean_eva,
4131	    ("pmap_enter: managed mapping within the clean submap"));
4132	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
4133		VM_OBJECT_ASSERT_WLOCKED(m->object);
4134	pa = VM_PAGE_TO_PHYS(m);
4135	newpte = (pt_entry_t)(pa | PG_A | PG_V);
4136	if ((access & VM_PROT_WRITE) != 0)
4137		newpte |= PG_M;
4138	if ((prot & VM_PROT_WRITE) != 0)
4139		newpte |= PG_RW;
4140	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
4141	    ("pmap_enter: access includes VM_PROT_WRITE but prot doesn't"));
4142	if ((prot & VM_PROT_EXECUTE) == 0)
4143		newpte |= pg_nx;
4144	if (wired)
4145		newpte |= PG_W;
4146	if (va < VM_MAXUSER_ADDRESS)
4147		newpte |= PG_U;
4148	if (pmap == kernel_pmap)
4149		newpte |= PG_G;
4150	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
4151
4152	/*
4153	 * Set modified bit gratuitously for writeable mappings if
4154	 * the page is unmanaged. We do not want to take a fault
4155	 * to do the dirty bit accounting for these mappings.
4156	 */
4157	if ((m->oflags & VPO_UNMANAGED) != 0) {
4158		if ((newpte & PG_RW) != 0)
4159			newpte |= PG_M;
4160	}
4161
4162	mpte = NULL;
4163
4164	lock = NULL;
4165	rw_rlock(&pvh_global_lock);
4166	PMAP_LOCK(pmap);
4167
4168	/*
4169	 * If the page table page is not resident,
4170	 * it is created here.
4171	 */
4172retry:
4173	pde = pmap_pde(pmap, va);
4174	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
4175	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
4176		pte = pmap_pde_to_pte(pde, va);
4177		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
4178			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4179			mpte->wire_count++;
4180		}
4181	} else if (va < VM_MAXUSER_ADDRESS) {
4182		/*
4183		 * We get here if the PTE page is not mapped or has been
4184		 * deallocated; allocate it now.
4185		 */
4186		mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), &lock);
4187		goto retry;
4188	} else
4189		panic("pmap_enter: invalid page directory va=%#lx", va);
4190
4191	origpte = *pte;
4192
4193	/*
4194	 * Is the specified virtual address already mapped?
4195	 */
4196	if ((origpte & PG_V) != 0) {
4197		/*
4198		 * Wiring change, just update stats. We don't worry about
4199		 * wiring PT pages as they remain resident as long as there
4200		 * are valid mappings in them. Hence, if a user page is wired,
4201		 * the PT page will be also.
4202		 */
4203		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
4204			pmap->pm_stats.wired_count++;
4205		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
4206			pmap->pm_stats.wired_count--;
4207
4208		/*
4209		 * Remove the extra PT page reference.
4210		 */
4211		if (mpte != NULL) {
4212			mpte->wire_count--;
4213			KASSERT(mpte->wire_count > 0,
4214			    ("pmap_enter: missing reference to page table page,"
4215			     " va: 0x%lx", va));
4216		}
4217
4218		/*
4219		 * Has the physical page changed?
4220		 */
4221		opa = origpte & PG_FRAME;
4222		if (opa == pa) {
4223			/*
4224			 * No, might be a protection or wiring change.
4225			 */
4226			if ((origpte & PG_MANAGED) != 0) {
4227				newpte |= PG_MANAGED;
4228				if ((newpte & PG_RW) != 0)
4229					vm_page_aflag_set(m, PGA_WRITEABLE);
4230			}
4231			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
4232				goto unchanged;
4233			goto validate;
4234		}
4235	} else {
4236		/*
4237		 * Increment the counters.
4238		 */
4239		if ((newpte & PG_W) != 0)
4240			pmap->pm_stats.wired_count++;
4241		pmap_resident_count_inc(pmap, 1);
4242	}
4243
4244	/*
4245	 * Enter on the PV list if part of our managed memory.
4246	 */
4247	if ((m->oflags & VPO_UNMANAGED) == 0) {
4248		newpte |= PG_MANAGED;
4249		pv = get_pv_entry(pmap, &lock);
4250		pv->pv_va = va;
4251		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
4252		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4253		m->md.pv_gen++;
4254		if ((newpte & PG_RW) != 0)
4255			vm_page_aflag_set(m, PGA_WRITEABLE);
4256	}
4257
4258	/*
4259	 * Update the PTE.
4260	 */
4261	if ((origpte & PG_V) != 0) {
4262validate:
4263		origpte = pte_load_store(pte, newpte);
4264		opa = origpte & PG_FRAME;
4265		if (opa != pa) {
4266			if ((origpte & PG_MANAGED) != 0) {
4267				om = PHYS_TO_VM_PAGE(opa);
4268				if ((origpte & (PG_M | PG_RW)) == (PG_M |
4269				    PG_RW))
4270					vm_page_dirty(om);
4271				if ((origpte & PG_A) != 0)
4272					vm_page_aflag_set(om, PGA_REFERENCED);
4273				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
4274				pmap_pvh_free(&om->md, pmap, va);
4275				if ((om->aflags & PGA_WRITEABLE) != 0 &&
4276				    TAILQ_EMPTY(&om->md.pv_list) &&
4277				    ((om->flags & PG_FICTITIOUS) != 0 ||
4278				    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
4279					vm_page_aflag_clear(om, PGA_WRITEABLE);
4280			}
4281		} else if ((newpte & PG_M) == 0 && (origpte & (PG_M |
4282		    PG_RW)) == (PG_M | PG_RW)) {
4283			if ((origpte & PG_MANAGED) != 0)
4284				vm_page_dirty(m);
4285
4286			/*
4287			 * Although the PTE may still have PG_RW set, TLB
4288			 * invalidation may nonetheless be required because
4289			 * the PTE no longer has PG_M set.
4290			 */
4291		} else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
4292			/*
4293			 * This PTE change does not require TLB invalidation.
4294			 */
4295			goto unchanged;
4296		}
4297		if ((origpte & PG_A) != 0)
4298			pmap_invalidate_page(pmap, va);
4299	} else
4300		pte_store(pte, newpte);
4301
4302unchanged:
4303
4304	/*
4305	 * If both the page table page and the reservation are fully
4306	 * populated, then attempt promotion.
4307	 */
4308	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
4309	    pmap_ps_enabled(pmap) &&
4310	    (m->flags & PG_FICTITIOUS) == 0 &&
4311	    vm_reserv_level_iffullpop(m) == 0)
4312		pmap_promote_pde(pmap, pde, va, &lock);
4313
4314	if (lock != NULL)
4315		rw_wunlock(lock);
4316	rw_runlock(&pvh_global_lock);
4317	PMAP_UNLOCK(pmap);
4318}
4319
4320/*
4321 * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
4322 * otherwise.  Fails if (1) a page table page cannot be allocated without
4323 * blocking, (2) a mapping already exists at the specified virtual address, or
4324 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
4325 */
4326static boolean_t
4327pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4328    struct rwlock **lockp)
4329{
4330	pd_entry_t *pde, newpde;
4331	pt_entry_t PG_V;
4332	vm_page_t mpde;
4333	struct spglist free;
4334
4335	PG_V = pmap_valid_bit(pmap);
4336	rw_assert(&pvh_global_lock, RA_LOCKED);
4337	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4338
4339	if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
4340		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4341		    " in pmap %p", va, pmap);
4342		return (FALSE);
4343	}
4344	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
4345	pde = &pde[pmap_pde_index(va)];
4346	if ((*pde & PG_V) != 0) {
4347		KASSERT(mpde->wire_count > 1,
4348		    ("pmap_enter_pde: mpde's wire count is too low"));
4349		mpde->wire_count--;
4350		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4351		    " in pmap %p", va, pmap);
4352		return (FALSE);
4353	}
4354	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
4355	    PG_PS | PG_V;
4356	if ((m->oflags & VPO_UNMANAGED) == 0) {
4357		newpde |= PG_MANAGED;
4358
4359		/*
4360		 * Abort this mapping if its PV entry could not be created.
4361		 */
4362		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
4363		    lockp)) {
4364			SLIST_INIT(&free);
4365			if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
4366				pmap_invalidate_page(pmap, va);
4367				pmap_free_zero_pages(&free);
4368			}
4369			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4370			    " in pmap %p", va, pmap);
4371			return (FALSE);
4372		}
4373	}
4374	if ((prot & VM_PROT_EXECUTE) == 0)
4375		newpde |= pg_nx;
4376	if (va < VM_MAXUSER_ADDRESS)
4377		newpde |= PG_U;
4378
4379	/*
4380	 * Increment counters.
4381	 */
4382	pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4383
4384	/*
4385	 * Map the superpage.
4386	 */
4387	pde_store(pde, newpde);
4388
4389	atomic_add_long(&pmap_pde_mappings, 1);
4390	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
4391	    " in pmap %p", va, pmap);
4392	return (TRUE);
4393}
4394
4395/*
4396 * Maps a sequence of resident pages belonging to the same object.
4397 * The sequence begins with the given page m_start.  This page is
4398 * mapped at the given virtual address start.  Each subsequent page is
4399 * mapped at a virtual address that is offset from start by the same
4400 * amount as the page is offset from m_start within the object.  The
4401 * last page in the sequence is the page with the largest offset from
4402 * m_start that can be mapped at a virtual address less than the given
4403 * virtual address end.  Not every virtual page between start and end
4404 * is mapped; only those for which a resident page exists with the
4405 * corresponding offset from m_start are mapped.
4406 */
4407void
4408pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4409    vm_page_t m_start, vm_prot_t prot)
4410{
4411	struct rwlock *lock;
4412	vm_offset_t va;
4413	vm_page_t m, mpte;
4414	vm_pindex_t diff, psize;
4415
4416	VM_OBJECT_ASSERT_LOCKED(m_start->object);
4417
4418	psize = atop(end - start);
4419	mpte = NULL;
4420	m = m_start;
4421	lock = NULL;
4422	rw_rlock(&pvh_global_lock);
4423	PMAP_LOCK(pmap);
4424	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4425		va = start + ptoa(diff);
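		/*
		 * Attempt a 2MB page mapping if superpages are enabled,
		 * the virtual and physical addresses are 2MB aligned, the
		 * whole superpage fits below "end", and the reservation
		 * is fully populated; otherwise, fall back to a 4KB
		 * mapping.
		 */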
4426		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
4427		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
4428		    pmap_ps_enabled(pmap) &&
4429		    vm_reserv_level_iffullpop(m) == 0 &&
4430		    pmap_enter_pde(pmap, va, m, prot, &lock))
4431			m = &m[NBPDR / PAGE_SIZE - 1];
4432		else
4433			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
4434			    mpte, &lock);
4435		m = TAILQ_NEXT(m, listq);
4436	}
4437	if (lock != NULL)
4438		rw_wunlock(lock);
4439	rw_runlock(&pvh_global_lock);
4440	PMAP_UNLOCK(pmap);
4441}
4442
4443/*
4444 * This code makes some *MAJOR* assumptions:
4445 * 1. The current pmap and the given pmap exist.
4446 * 2. The mapping is not wired.
4447 * 3. Only read access is required.
4448 * 4. No page table pages.
4449 * In return, it is *MUCH* faster than pmap_enter()...
4450 */
4451
4452void
4453pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4454{
4455	struct rwlock *lock;
4456
4457	lock = NULL;
4458	rw_rlock(&pvh_global_lock);
4459	PMAP_LOCK(pmap);
4460	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
4461	if (lock != NULL)
4462		rw_wunlock(lock);
4463	rw_runlock(&pvh_global_lock);
4464	PMAP_UNLOCK(pmap);
4465}
4466
4467static vm_page_t
4468pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
4469    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
4470{
4471	struct spglist free;
4472	pt_entry_t *pte, PG_V;
4473	vm_paddr_t pa;
4474
4475	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
4476	    (m->oflags & VPO_UNMANAGED) != 0,
4477	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
4478	PG_V = pmap_valid_bit(pmap);
4479	rw_assert(&pvh_global_lock, RA_LOCKED);
4480	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4481
4482	/*
4483	 * If the page table page is not resident,
4484	 * it is created here.
4485	 */
4486	if (va < VM_MAXUSER_ADDRESS) {
4487		vm_pindex_t ptepindex;
4488		pd_entry_t *ptepa;
4489
4490		/*
4491		 * Calculate pagetable page index
4492		 */
4493		ptepindex = pmap_pde_pindex(va);
4494		if (mpte && (mpte->pindex == ptepindex)) {
4495			mpte->wire_count++;
4496		} else {
4497			/*
4498			 * Get the page directory entry
4499			 */
4500			ptepa = pmap_pde(pmap, va);
4501
4502			/*
4503			 * If the page table page is mapped, we just increment
4504			 * the hold count, and activate it.  Otherwise, we
4505			 * attempt to allocate a page table page.  If this
4506			 * attempt fails, we don't retry.  Instead, we give up.
4507			 */
4508			if (ptepa && (*ptepa & PG_V) != 0) {
4509				if (*ptepa & PG_PS)
4510					return (NULL);
4511				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
4512				mpte->wire_count++;
4513			} else {
4514				/*
4515				 * Pass NULL instead of the PV list lock
4516				 * pointer, because we don't intend to sleep.
4517				 */
4518				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
4519				if (mpte == NULL)
4520					return (mpte);
4521			}
4522		}
4523		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
4524		pte = &pte[pmap_pte_index(va)];
4525	} else {
4526		mpte = NULL;
4527		pte = vtopte(va);
4528	}
4529	if (*pte) {
4530		if (mpte != NULL) {
4531			mpte->wire_count--;
4532			mpte = NULL;
4533		}
4534		return (mpte);
4535	}
4536
4537	/*
4538	 * Enter on the PV list if part of our managed memory.
4539	 */
4540	if ((m->oflags & VPO_UNMANAGED) == 0 &&
4541	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
4542		if (mpte != NULL) {
4543			SLIST_INIT(&free);
4544			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
4545				pmap_invalidate_page(pmap, va);
4546				pmap_free_zero_pages(&free);
4547			}
4548			mpte = NULL;
4549		}
4550		return (mpte);
4551	}
4552
4553	/*
4554	 * Increment counters
4555	 */
4556	pmap_resident_count_inc(pmap, 1);
4557
4558	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
4559	if ((prot & VM_PROT_EXECUTE) == 0)
4560		pa |= pg_nx;
4561
4562	/*
4563	 * Now validate mapping with RO protection
4564	 */
4565	if ((m->oflags & VPO_UNMANAGED) != 0)
4566		pte_store(pte, pa | PG_V | PG_U);
4567	else
4568		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
4569	return (mpte);
4570}
4571
4572/*
4573 * Make a temporary mapping for a physical address.  This is only intended
4574 * to be used for panic dumps.
4575 */
4576void *
4577pmap_kenter_temporary(vm_paddr_t pa, int i)
4578{
4579	vm_offset_t va;
4580
4581	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
4582	pmap_kenter(va, pa);
4583	invlpg(va);
4584	return ((void *)crashdumpmap);
4585}
4586
4587/*
4588 * This code maps large physical mmap regions into the processor
4589 * address space using 2MB page mappings.  Some shortcuts are taken:
4590 * the mappings are only an optimization, so failures are tolerated.
4591 */
4592void
4593pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
4594    vm_pindex_t pindex, vm_size_t size)
4595{
4596	pd_entry_t *pde;
4597	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
4598	vm_paddr_t pa, ptepa;
4599	vm_page_t p, pdpg;
4600	int pat_mode;
4601
4602	PG_A = pmap_accessed_bit(pmap);
4603	PG_M = pmap_modified_bit(pmap);
4604	PG_V = pmap_valid_bit(pmap);
4605	PG_RW = pmap_rw_bit(pmap);
4606
4607	VM_OBJECT_ASSERT_WLOCKED(object);
4608	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4609	    ("pmap_object_init_pt: non-device object"));
4610	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
4611		if (!pmap_ps_enabled(pmap))
4612			return;
4613		if (!vm_object_populate(object, pindex, pindex + atop(size)))
4614			return;
4615		p = vm_page_lookup(object, pindex);
4616		KASSERT(p->valid == VM_PAGE_BITS_ALL,
4617		    ("pmap_object_init_pt: invalid page %p", p));
4618		pat_mode = p->md.pat_mode;
4619
4620		/*
4621		 * Abort the mapping if the first page is not physically
4622		 * aligned to a 2MB page boundary.
4623		 */
4624		ptepa = VM_PAGE_TO_PHYS(p);
4625		if (ptepa & (NBPDR - 1))
4626			return;
4627
4628		/*
4629		 * Skip the first page.  Abort the mapping if the rest of
4630		 * the pages are not physically contiguous or have differing
4631		 * memory attributes.
4632		 */
4633		p = TAILQ_NEXT(p, listq);
4634		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
4635		    pa += PAGE_SIZE) {
4636			KASSERT(p->valid == VM_PAGE_BITS_ALL,
4637			    ("pmap_object_init_pt: invalid page %p", p));
4638			if (pa != VM_PAGE_TO_PHYS(p) ||
4639			    pat_mode != p->md.pat_mode)
4640				return;
4641			p = TAILQ_NEXT(p, listq);
4642		}
4643
4644		/*
4645		 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
4646		 * "size" is a multiple of 2M, adding the PAT setting to "pa"
4647		 * will not affect the termination of this loop.
4648		 */
4649		PMAP_LOCK(pmap);
4650		for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
4651		    pa < ptepa + size; pa += NBPDR) {
4652			pdpg = pmap_allocpde(pmap, addr, NULL);
4653			if (pdpg == NULL) {
4654				/*
4655				 * The creation of mappings below is only an
4656				 * optimization.  If a page directory page
4657				 * cannot be allocated without blocking,
4658				 * continue on to the next mapping rather than
4659				 * blocking.
4660				 */
4661				addr += NBPDR;
4662				continue;
4663			}
4664			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
4665			pde = &pde[pmap_pde_index(addr)];
4666			if ((*pde & PG_V) == 0) {
4667				pde_store(pde, pa | PG_PS | PG_M | PG_A |
4668				    PG_U | PG_RW | PG_V);
4669				pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4670				atomic_add_long(&pmap_pde_mappings, 1);
4671			} else {
4672				/* Continue on if the PDE is already valid. */
4673				pdpg->wire_count--;
4674				KASSERT(pdpg->wire_count > 0,
4675				    ("pmap_object_init_pt: missing reference "
4676				    "to page directory page, va: 0x%lx", addr));
4677			}
4678			addr += NBPDR;
4679		}
4680		PMAP_UNLOCK(pmap);
4681	}
4682}
4683
4684/*
4685 *	Routine:	pmap_change_wiring
4686 *	Function:	Change the wiring attribute for a map/virtual-address
4687 *			pair.
4688 *	In/out conditions:
4689 *			The mapping must already exist in the pmap.
4690 */
4691void
4692pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
4693{
4694	pd_entry_t *pde;
4695	pt_entry_t *pte;
4696	boolean_t pv_lists_locked;
4697
4698	pv_lists_locked = FALSE;
4699
4700	/*
4701	 * Wiring is not a hardware characteristic so there is no need to
4702	 * invalidate the TLB.
4703	 */
4704retry:
4705	PMAP_LOCK(pmap);
4706	pde = pmap_pde(pmap, va);
4707	if ((*pde & PG_PS) != 0) {
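		/*
		 * If the requested wiring state differs from that of the
		 * 2MB page mapping, demote the mapping so that the 4KB
		 * page table entry for "va" can be updated below.
		 */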
4708		if (!wired != ((*pde & PG_W) == 0)) {
4709			if (!pv_lists_locked) {
4710				pv_lists_locked = TRUE;
4711				if (!rw_try_rlock(&pvh_global_lock)) {
4712					PMAP_UNLOCK(pmap);
4713					rw_rlock(&pvh_global_lock);
4714					goto retry;
4715				}
4716			}
4717			if (!pmap_demote_pde(pmap, pde, va))
4718				panic("pmap_change_wiring: demotion failed");
4719		} else
4720			goto out;
4721	}
4722	pte = pmap_pde_to_pte(pde, va);
4723	if (wired && (*pte & PG_W) == 0) {
4724		pmap->pm_stats.wired_count++;
4725		atomic_set_long(pte, PG_W);
4726	} else if (!wired && (*pte & PG_W) != 0) {
4727		pmap->pm_stats.wired_count--;
4728		atomic_clear_long(pte, PG_W);
4729	}
4730out:
4731	if (pv_lists_locked)
4732		rw_runlock(&pvh_global_lock);
4733	PMAP_UNLOCK(pmap);
4734}
4735
4736/*
4737 *	Copy the range specified by src_addr/len
4738 *	from the source map to the range dst_addr/len
4739 *	in the destination map.
4740 *
4741 *	This routine is only advisory and need not do anything.
4742 */
4743
4744void
4745pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4746    vm_offset_t src_addr)
4747{
4748	struct rwlock *lock;
4749	struct spglist free;
4750	vm_offset_t addr;
4751	vm_offset_t end_addr = src_addr + len;
4752	vm_offset_t va_next;
4753	pt_entry_t PG_A, PG_M, PG_V;
4754
4755	if (dst_addr != src_addr)
4756		return;
4757
4758	if (dst_pmap->pm_type != src_pmap->pm_type)
4759		return;
4760
4761	/*
4762	 * EPT page table entries that require emulation of A/D bits are
4763	 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
4764	 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
4765	 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
4766	 * implementations flag an EPT misconfiguration for exec-only
4767	 * mappings we skip this function entirely for emulated pmaps.
4768	 */
4769	if (pmap_emulate_ad_bits(dst_pmap))
4770		return;
4771
4772	lock = NULL;
4773	rw_rlock(&pvh_global_lock);
4774	if (dst_pmap < src_pmap) {
4775		PMAP_LOCK(dst_pmap);
4776		PMAP_LOCK(src_pmap);
4777	} else {
4778		PMAP_LOCK(src_pmap);
4779		PMAP_LOCK(dst_pmap);
4780	}
4781
4782	PG_A = pmap_accessed_bit(dst_pmap);
4783	PG_M = pmap_modified_bit(dst_pmap);
4784	PG_V = pmap_valid_bit(dst_pmap);
4785
4786	for (addr = src_addr; addr < end_addr; addr = va_next) {
4787		pt_entry_t *src_pte, *dst_pte;
4788		vm_page_t dstmpde, dstmpte, srcmpte;
4789		pml4_entry_t *pml4e;
4790		pdp_entry_t *pdpe;
4791		pd_entry_t srcptepaddr, *pde;
4792
4793		KASSERT(addr < UPT_MIN_ADDRESS,
4794		    ("pmap_copy: invalid to pmap_copy page tables"));
4795
4796		pml4e = pmap_pml4e(src_pmap, addr);
4797		if ((*pml4e & PG_V) == 0) {
4798			va_next = (addr + NBPML4) & ~PML4MASK;
4799			if (va_next < addr)
4800				va_next = end_addr;
4801			continue;
4802		}
4803
4804		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
4805		if ((*pdpe & PG_V) == 0) {
4806			va_next = (addr + NBPDP) & ~PDPMASK;
4807			if (va_next < addr)
4808				va_next = end_addr;
4809			continue;
4810		}
4811
4812		va_next = (addr + NBPDR) & ~PDRMASK;
4813		if (va_next < addr)
4814			va_next = end_addr;
4815
4816		pde = pmap_pdpe_to_pde(pdpe, addr);
4817		srcptepaddr = *pde;
4818		if (srcptepaddr == 0)
4819			continue;
4820
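		/*
		 * A 2MB page mapping is copied only when the entire
		 * superpage lies within the copied range and the
		 * destination PDE is not already in use.
		 */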
4821		if (srcptepaddr & PG_PS) {
4822			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
4823				continue;
4824			dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
4825			if (dstmpde == NULL)
4826				break;
4827			pde = (pd_entry_t *)
4828			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
4829			pde = &pde[pmap_pde_index(addr)];
4830			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
4831			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4832			    PG_PS_FRAME, &lock))) {
4833				*pde = srcptepaddr & ~PG_W;
4834				pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
4835			} else
4836				dstmpde->wire_count--;
4837			continue;
4838		}
4839
4840		srcptepaddr &= PG_FRAME;
4841		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
4842		KASSERT(srcmpte->wire_count > 0,
4843		    ("pmap_copy: source page table page is unused"));
4844
4845		if (va_next > end_addr)
4846			va_next = end_addr;
4847
4848		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
4849		src_pte = &src_pte[pmap_pte_index(addr)];
4850		dstmpte = NULL;
4851		while (addr < va_next) {
4852			pt_entry_t ptetemp;
4853			ptetemp = *src_pte;
4854			/*
4855			 * Only mappings of managed pages are copied.
4856			 */
4857			if ((ptetemp & PG_MANAGED) != 0) {
4858				if (dstmpte != NULL &&
4859				    dstmpte->pindex == pmap_pde_pindex(addr))
4860					dstmpte->wire_count++;
4861				else if ((dstmpte = pmap_allocpte(dst_pmap,
4862				    addr, NULL)) == NULL)
4863					goto out;
4864				dst_pte = (pt_entry_t *)
4865				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
4866				dst_pte = &dst_pte[pmap_pte_index(addr)];
4867				if (*dst_pte == 0 &&
4868				    pmap_try_insert_pv_entry(dst_pmap, addr,
4869				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
4870				    &lock)) {
4871					/*
4872					 * Clear the wired, modified, and
4873					 * accessed (referenced) bits
4874					 * during the copy.
4875					 */
4876					*dst_pte = ptetemp & ~(PG_W | PG_M |
4877					    PG_A);
4878					pmap_resident_count_inc(dst_pmap, 1);
4879				} else {
4880					SLIST_INIT(&free);
4881					if (pmap_unwire_ptp(dst_pmap, addr,
4882					    dstmpte, &free)) {
4883						pmap_invalidate_page(dst_pmap,
4884						    addr);
4885						pmap_free_zero_pages(&free);
4886					}
4887					goto out;
4888				}
4889				if (dstmpte->wire_count >= srcmpte->wire_count)
4890					break;
4891			}
4892			addr += PAGE_SIZE;
4893			src_pte++;
4894		}
4895	}
4896out:
4897	if (lock != NULL)
4898		rw_wunlock(lock);
4899	rw_runlock(&pvh_global_lock);
4900	PMAP_UNLOCK(src_pmap);
4901	PMAP_UNLOCK(dst_pmap);
4902}
4903
4904/*
4905 *	pmap_zero_page zeros the specified hardware page through the
4906 *	direct map, so no transient kernel mapping is needed.
4907 */
4908void
4909pmap_zero_page(vm_page_t m)
4910{
4911	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4912
4913	pagezero((void *)va);
4914}
4915
4916/*
4917 *	pmap_zero_page_area zeros the specified area of a hardware page
4918 *	through the direct map.
4919 *
4920 *	off and size may not cover an area beyond a single hardware page.
4921 */
4922void
4923pmap_zero_page_area(vm_page_t m, int off, int size)
4924{
4925	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4926
4927	if (off == 0 && size == PAGE_SIZE)
4928		pagezero((void *)va);
4929	else
4930		bzero((char *)va + off, size);
4931}
4932
4933/*
4934 *	pmap_zero_page_idle zeros the specified hardware page through
4935 *	the direct map, like pmap_zero_page.  This variant is intended
4936 *	to be called from the vm_pagezero process only and outside of
4937 *	Giant.
4938 */
4939void
4940pmap_zero_page_idle(vm_page_t m)
4941{
4942	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4943
4944	pagezero((void *)va);
4945}
4946
4947/*
4948 *	pmap_copy_page copies the specified (machine independent)
4949 *	page by accessing both the source and the destination pages
4950 *	through the direct map, one machine dependent page at a
4951 *	time.
4952 */
4953void
4954pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
4955{
4956	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
4957	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
4958
4959	pagecopy((void *)src, (void *)dst);
4960}
4961
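/*
 * Unmapped I/O buffers are supported: physical pages can be accessed
 * through the direct map without transient kernel mappings.
 */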
4962int unmapped_buf_allowed = 1;
4963
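/*
 *	pmap_copy_pages copies "xfersize" bytes from the physical pages
 *	"ma", starting at byte offset "a_offset", to the physical pages
 *	"mb", starting at byte offset "b_offset", accessing both through
 *	the direct map.  The copy is split at page boundaries.
 */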
4964void
4965pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4966    vm_offset_t b_offset, int xfersize)
4967{
4968	void *a_cp, *b_cp;
4969	vm_offset_t a_pg_offset, b_pg_offset;
4970	int cnt;
4971
4972	while (xfersize > 0) {
4973		a_pg_offset = a_offset & PAGE_MASK;
4974		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4975		a_cp = (char *)PHYS_TO_DMAP(ma[a_offset >> PAGE_SHIFT]->
4976		    phys_addr) + a_pg_offset;
4977		b_pg_offset = b_offset & PAGE_MASK;
4978		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4979		b_cp = (char *)PHYS_TO_DMAP(mb[b_offset >> PAGE_SHIFT]->
4980		    phys_addr) + b_pg_offset;
4981		bcopy(a_cp, b_cp, cnt);
4982		a_offset += cnt;
4983		b_offset += cnt;
4984		xfersize -= cnt;
4985	}
4986}
4987
4988/*
4989 * Returns true if the pmap's pv is one of the first
4990 * 16 pvs linked to from this page.  This count may
4991 * be changed upwards or downwards in the future; it
4992 * is only necessary that true be returned for a small
4993 * subset of pmaps for proper page aging.
4994 */
4995boolean_t
4996pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4997{
4998	struct md_page *pvh;
4999	struct rwlock *lock;
5000	pv_entry_t pv;
5001	int loops = 0;
5002	boolean_t rv;
5003
5004	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5005	    ("pmap_page_exists_quick: page %p is not managed", m));
5006	rv = FALSE;
5007	rw_rlock(&pvh_global_lock);
5008	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5009	rw_rlock(lock);
5010	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5011		if (PV_PMAP(pv) == pmap) {
5012			rv = TRUE;
5013			break;
5014		}
5015		loops++;
5016		if (loops >= 16)
5017			break;
5018	}
5019	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5020		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5021		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5022			if (PV_PMAP(pv) == pmap) {
5023				rv = TRUE;
5024				break;
5025			}
5026			loops++;
5027			if (loops >= 16)
5028				break;
5029		}
5030	}
5031	rw_runlock(lock);
5032	rw_runlock(&pvh_global_lock);
5033	return (rv);
5034}
5035
5036/*
5037 *	pmap_page_wired_mappings:
5038 *
5039 *	Return the number of managed mappings to the given physical page
5040 *	that are wired.
5041 */
5042int
5043pmap_page_wired_mappings(vm_page_t m)
5044{
5045	struct rwlock *lock;
5046	struct md_page *pvh;
5047	pmap_t pmap;
5048	pt_entry_t *pte;
5049	pv_entry_t pv;
5050	int count, md_gen, pvh_gen;
5051
5052	if ((m->oflags & VPO_UNMANAGED) != 0)
5053		return (0);
5054	rw_rlock(&pvh_global_lock);
5055	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5056	rw_rlock(lock);
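	/*
	 * When a pmap lock cannot be acquired without blocking, the pv
	 * list lock is dropped and reacquired around PMAP_LOCK(); the
	 * generation counts detect concurrent pv list changes and force
	 * a restart.
	 */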
5057restart:
5058	count = 0;
5059	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5060		pmap = PV_PMAP(pv);
5061		if (!PMAP_TRYLOCK(pmap)) {
5062			md_gen = m->md.pv_gen;
5063			rw_runlock(lock);
5064			PMAP_LOCK(pmap);
5065			rw_rlock(lock);
5066			if (md_gen != m->md.pv_gen) {
5067				PMAP_UNLOCK(pmap);
5068				goto restart;
5069			}
5070		}
5071		pte = pmap_pte(pmap, pv->pv_va);
5072		if ((*pte & PG_W) != 0)
5073			count++;
5074		PMAP_UNLOCK(pmap);
5075	}
5076	if ((m->flags & PG_FICTITIOUS) == 0) {
5077		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5078		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5079			pmap = PV_PMAP(pv);
5080			if (!PMAP_TRYLOCK(pmap)) {
5081				md_gen = m->md.pv_gen;
5082				pvh_gen = pvh->pv_gen;
5083				rw_runlock(lock);
5084				PMAP_LOCK(pmap);
5085				rw_rlock(lock);
5086				if (md_gen != m->md.pv_gen ||
5087				    pvh_gen != pvh->pv_gen) {
5088					PMAP_UNLOCK(pmap);
5089					goto restart;
5090				}
5091			}
5092			pte = pmap_pde(pmap, pv->pv_va);
5093			if ((*pte & PG_W) != 0)
5094				count++;
5095			PMAP_UNLOCK(pmap);
5096		}
5097	}
5098	rw_runlock(lock);
5099	rw_runlock(&pvh_global_lock);
5100	return (count);
5101}
5102
5103/*
5104 * Returns TRUE if the given page is mapped individually or as part of
5105 * a 2mpage.  Otherwise, returns FALSE.
5106 */
5107boolean_t
5108pmap_page_is_mapped(vm_page_t m)
5109{
5110	struct rwlock *lock;
5111	boolean_t rv;
5112
5113	if ((m->oflags & VPO_UNMANAGED) != 0)
5114		return (FALSE);
5115	rw_rlock(&pvh_global_lock);
5116	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5117	rw_rlock(lock);
5118	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5119	    ((m->flags & PG_FICTITIOUS) == 0 &&
5120	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
5121	rw_runlock(lock);
5122	rw_runlock(&pvh_global_lock);
5123	return (rv);
5124}
5125
5126/*
5127 * Destroy all managed, non-wired mappings in the given user-space
5128 * pmap.  This pmap cannot be active on any processor besides the
5129 * caller.
5130 *
5131 * This function cannot be applied to the kernel pmap.  Moreover, it
5132 * is not intended for general use.  It is only to be used during
5133 * process termination.  Consequently, it can be implemented in ways
5134 * that make it faster than pmap_remove().  First, it can more quickly
5135 * destroy mappings by iterating over the pmap's collection of PV
5136 * entries, rather than searching the page table.  Second, it doesn't
5137 * have to test and clear the page table entries atomically, because
5138 * no processor is currently accessing the user address space.  In
5139 * particular, a page table entry's dirty bit won't change state once
5140 * this function starts.
5141 */
5142void
5143pmap_remove_pages(pmap_t pmap)
5144{
5145	pd_entry_t ptepde;
5146	pt_entry_t *pte, tpte;
5147	pt_entry_t PG_M, PG_RW, PG_V;
5148	struct spglist free;
5149	vm_page_t m, mpte, mt;
5150	pv_entry_t pv;
5151	struct md_page *pvh;
5152	struct pv_chunk *pc, *npc;
5153	struct rwlock *lock;
5154	int64_t bit;
5155	uint64_t inuse, bitmask;
5156	int allfree, field, freed, idx;
5157	boolean_t superpage;
5158	vm_paddr_t pa;
5159
5160	/*
5161	 * Assert that the given pmap is only active on the current
5162	 * CPU.  Unfortunately, we cannot block another CPU from
5163	 * activating the pmap while this function is executing.
5164	 */
5165	KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
5166#ifdef INVARIANTS
5167	{
5168		cpuset_t other_cpus;
5169
5170		other_cpus = all_cpus;
5171		critical_enter();
5172		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
5173		CPU_AND(&other_cpus, &pmap->pm_active);
5174		critical_exit();
5175		KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
5176	}
5177#endif
5178
5179	lock = NULL;
5180	PG_M = pmap_modified_bit(pmap);
5181	PG_V = pmap_valid_bit(pmap);
5182	PG_RW = pmap_rw_bit(pmap);
5183
5184	SLIST_INIT(&free);
5185	rw_rlock(&pvh_global_lock);
5186	PMAP_LOCK(pmap);
5187	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5188		allfree = 1;
5189		freed = 0;
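		/*
		 * Scan the chunk's allocation bitmaps; each bit set in
		 * "inuse" identifies a pv entry that is currently in use.
		 */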
5190		for (field = 0; field < _NPCM; field++) {
5191			inuse = ~pc->pc_map[field] & pc_freemask[field];
5192			while (inuse != 0) {
5193				bit = bsfq(inuse);
5194				bitmask = 1UL << bit;
5195				idx = field * 64 + bit;
5196				pv = &pc->pc_pventry[idx];
5197				inuse &= ~bitmask;
5198
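				/*
				 * Walk from the PDPE down to the PDE for
				 * this pv entry's virtual address; for a
				 * 4KB page mapping, descend one more level
				 * to the PTE below.
				 */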
5199				pte = pmap_pdpe(pmap, pv->pv_va);
5200				ptepde = *pte;
5201				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
5202				tpte = *pte;
5203				if ((tpte & (PG_PS | PG_V)) == PG_V) {
5204					superpage = FALSE;
5205					ptepde = tpte;
5206					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
5207					    PG_FRAME);
5208					pte = &pte[pmap_pte_index(pv->pv_va)];
5209					tpte = *pte;
5210				} else {
5211					/*
5212					 * Keep track of whether 'tpte' is a
5213					 * superpage explicitly instead of
5214					 * relying on PG_PS being set.
5215					 *
5216					 * This is because PG_PS is numerically
5217					 * identical to PG_PTE_PAT and thus a
5218					 * regular page could be mistaken for
5219					 * a superpage.
5220					 */
5221					superpage = TRUE;
5222				}
5223
5224				if ((tpte & PG_V) == 0) {
5225					panic("bad pte va %lx pte %lx",
5226					    pv->pv_va, tpte);
5227				}
5228
5229/*
5230 * We cannot remove wired pages from a process' mapping at this time
5231 */
5232				if (tpte & PG_W) {
5233					allfree = 0;
5234					continue;
5235				}
5236
5237				if (superpage)
5238					pa = tpte & PG_PS_FRAME;
5239				else
5240					pa = tpte & PG_FRAME;
5241
5242				m = PHYS_TO_VM_PAGE(pa);
5243				KASSERT(m->phys_addr == pa,
5244				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5245				    m, (uintmax_t)m->phys_addr,
5246				    (uintmax_t)tpte));
5247
5248				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5249				    m < &vm_page_array[vm_page_array_size],
5250				    ("pmap_remove_pages: bad tpte %#jx",
5251				    (uintmax_t)tpte));
5252
5253				pte_clear(pte);
5254
5255				/*
5256				 * Update the vm_page_t clean/reference bits.
5257				 */
5258				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5259					if (superpage) {
5260						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5261							vm_page_dirty(mt);
5262					} else
5263						vm_page_dirty(m);
5264				}
5265
5266				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5267
5268				/* Mark free */
5269				pc->pc_map[field] |= bitmask;
5270				if (superpage) {
5271					pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
5272					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
5273					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5274					pvh->pv_gen++;
5275					if (TAILQ_EMPTY(&pvh->pv_list)) {
5276						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5277							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
5278							    TAILQ_EMPTY(&mt->md.pv_list))
5279								vm_page_aflag_clear(mt, PGA_WRITEABLE);
5280					}
5281					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
5282					if (mpte != NULL) {
5283						pmap_remove_pt_page(pmap, mpte);
5284						pmap_resident_count_dec(pmap, 1);
5285						KASSERT(mpte->wire_count == NPTEPG,
5286						    ("pmap_remove_pages: pte page wire count error"));
5287						mpte->wire_count = 0;
5288						pmap_add_delayed_free_list(mpte, &free, FALSE);
5289						atomic_subtract_int(&cnt.v_wire_count, 1);
5290					}
5291				} else {
5292					pmap_resident_count_dec(pmap, 1);
5293					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5294					m->md.pv_gen++;
5295					if ((m->aflags & PGA_WRITEABLE) != 0 &&
5296					    TAILQ_EMPTY(&m->md.pv_list) &&
5297					    (m->flags & PG_FICTITIOUS) == 0) {
5298						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5299						if (TAILQ_EMPTY(&pvh->pv_list))
5300							vm_page_aflag_clear(m, PGA_WRITEABLE);
5301					}
5302				}
5303				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
5304				freed++;
5305			}
5306		}
5307		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5308		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5309		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5310		if (allfree) {
5311			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5312			free_pv_chunk(pc);
5313		}
5314	}
5315	if (lock != NULL)
5316		rw_wunlock(lock);
5317	pmap_invalidate_all(pmap);
5318	rw_runlock(&pvh_global_lock);
5319	PMAP_UNLOCK(pmap);
5320	pmap_free_zero_pages(&free);
5321}
5322
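/*
 *	pmap_page_test_mappings returns TRUE if some mapping of the given
 *	page passes the requested test: "accessed" requires both PG_V and
 *	PG_A, and "modified" requires both PG_RW and PG_M.  Both 4KB and
 *	2MB page mappings are examined.
 */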
5323static boolean_t
5324pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
5325{
5326	struct rwlock *lock;
5327	pv_entry_t pv;
5328	struct md_page *pvh;
5329	pt_entry_t *pte, mask;
5330	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
5331	pmap_t pmap;
5332	int md_gen, pvh_gen;
5333	boolean_t rv;
5334
5335	rv = FALSE;
5336	rw_rlock(&pvh_global_lock);
5337	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5338	rw_rlock(lock);
5339restart:
5340	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5341		pmap = PV_PMAP(pv);
5342		if (!PMAP_TRYLOCK(pmap)) {
5343			md_gen = m->md.pv_gen;
5344			rw_runlock(lock);
5345			PMAP_LOCK(pmap);
5346			rw_rlock(lock);
5347			if (md_gen != m->md.pv_gen) {
5348				PMAP_UNLOCK(pmap);
5349				goto restart;
5350			}
5351		}
5352		pte = pmap_pte(pmap, pv->pv_va);
5353		mask = 0;
5354		if (modified) {
5355			PG_M = pmap_modified_bit(pmap);
5356			PG_RW = pmap_rw_bit(pmap);
5357			mask |= PG_RW | PG_M;
5358		}
5359		if (accessed) {
5360			PG_A = pmap_accessed_bit(pmap);
5361			PG_V = pmap_valid_bit(pmap);
5362			mask |= PG_V | PG_A;
5363		}
5364		rv = (*pte & mask) == mask;
5365		PMAP_UNLOCK(pmap);
5366		if (rv)
5367			goto out;
5368	}
5369	if ((m->flags & PG_FICTITIOUS) == 0) {
5370		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5371		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5372			pmap = PV_PMAP(pv);
5373			if (!PMAP_TRYLOCK(pmap)) {
5374				md_gen = m->md.pv_gen;
5375				pvh_gen = pvh->pv_gen;
5376				rw_runlock(lock);
5377				PMAP_LOCK(pmap);
5378				rw_rlock(lock);
5379				if (md_gen != m->md.pv_gen ||
5380				    pvh_gen != pvh->pv_gen) {
5381					PMAP_UNLOCK(pmap);
5382					goto restart;
5383				}
5384			}
5385			pte = pmap_pde(pmap, pv->pv_va);
5386			mask = 0;
5387			if (modified) {
5388				PG_M = pmap_modified_bit(pmap);
5389				PG_RW = pmap_rw_bit(pmap);
5390				mask |= PG_RW | PG_M;
5391			}
5392			if (accessed) {
5393				PG_A = pmap_accessed_bit(pmap);
5394				PG_V = pmap_valid_bit(pmap);
5395				mask |= PG_V | PG_A;
5396			}
5397			rv = (*pte & mask) == mask;
5398			PMAP_UNLOCK(pmap);
5399			if (rv)
5400				goto out;
5401		}
5402	}
5403out:
5404	rw_runlock(lock);
5405	rw_runlock(&pvh_global_lock);
5406	return (rv);
5407}
5408
5409/*
5410 *	pmap_is_modified:
5411 *
5412 *	Return whether or not the specified physical page was modified
5413 *	in any physical maps.
5414 */
5415boolean_t
5416pmap_is_modified(vm_page_t m)
5417{
5418
5419	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5420	    ("pmap_is_modified: page %p is not managed", m));
5421
5422	/*
5423	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5424	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
5425	 * is clear, no PTEs can have PG_M set.
5426	 */
5427	VM_OBJECT_ASSERT_WLOCKED(m->object);
5428	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5429		return (FALSE);
5430	return (pmap_page_test_mappings(m, FALSE, TRUE));
5431}
5432
5433/*
5434 *	pmap_is_prefaultable:
5435 *
5436 *	Return whether or not the specified virtual address is eligible
5437 *	for prefault.
5438 */
5439boolean_t
5440pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
5441{
5442	pd_entry_t *pde;
5443	pt_entry_t *pte, PG_V;
5444	boolean_t rv;
5445
5446	PG_V = pmap_valid_bit(pmap);
5447	rv = FALSE;
5448	PMAP_LOCK(pmap);
5449	pde = pmap_pde(pmap, addr);
5450	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
5451		pte = pmap_pde_to_pte(pde, addr);
5452		rv = (*pte & PG_V) == 0;
5453	}
5454	PMAP_UNLOCK(pmap);
5455	return (rv);
5456}
5457
5458/*
5459 *	pmap_is_referenced:
5460 *
5461 *	Return whether or not the specified physical page was referenced
5462 *	in any physical maps.
5463 */
5464boolean_t
5465pmap_is_referenced(vm_page_t m)
5466{
5467
5468	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5469	    ("pmap_is_referenced: page %p is not managed", m));
5470	return (pmap_page_test_mappings(m, TRUE, FALSE));
5471}
5472
5473/*
5474 * Clear the write and modified bits in each of the given page's mappings.
5475 */
5476void
5477pmap_remove_write(vm_page_t m)
5478{
5479	struct md_page *pvh;
5480	pmap_t pmap;
5481	struct rwlock *lock;
5482	pv_entry_t next_pv, pv;
5483	pd_entry_t *pde;
5484	pt_entry_t oldpte, *pte, PG_M, PG_RW;
5485	vm_offset_t va;
5486	int pvh_gen, md_gen;
5487
5488	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5489	    ("pmap_remove_write: page %p is not managed", m));
5490
5491	/*
5492	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5493	 * set by another thread while the object is locked.  Thus,
5494	 * if PGA_WRITEABLE is clear, no page table entries need updating.
5495	 */
5496	VM_OBJECT_ASSERT_WLOCKED(m->object);
5497	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5498		return;
5499	rw_rlock(&pvh_global_lock);
5500	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5501	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5502retry_pv_loop:
5503	rw_wlock(lock);
5504	if ((m->flags & PG_FICTITIOUS) != 0)
5505		goto small_mappings;
5506	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5507		pmap = PV_PMAP(pv);
5508		if (!PMAP_TRYLOCK(pmap)) {
5509			pvh_gen = pvh->pv_gen;
5510			rw_wunlock(lock);
5511			PMAP_LOCK(pmap);
5512			rw_wlock(lock);
5513			if (pvh_gen != pvh->pv_gen) {
5514				PMAP_UNLOCK(pmap);
5515				rw_wunlock(lock);
5516				goto retry_pv_loop;
5517			}
5518		}
5519		PG_RW = pmap_rw_bit(pmap);
5520		va = pv->pv_va;
5521		pde = pmap_pde(pmap, va);
5522		if ((*pde & PG_RW) != 0)
5523			(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
5524		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5525		    ("inconsistent pv lock %p %p for page %p",
5526		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5527		PMAP_UNLOCK(pmap);
5528	}
5529small_mappings:
5530	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5531		pmap = PV_PMAP(pv);
5532		if (!PMAP_TRYLOCK(pmap)) {
5533			pvh_gen = pvh->pv_gen;
5534			md_gen = m->md.pv_gen;
5535			rw_wunlock(lock);
5536			PMAP_LOCK(pmap);
5537			rw_wlock(lock);
5538			if (pvh_gen != pvh->pv_gen ||
5539			    md_gen != m->md.pv_gen) {
5540				PMAP_UNLOCK(pmap);
5541				rw_wunlock(lock);
5542				goto retry_pv_loop;
5543			}
5544		}
5545		PG_M = pmap_modified_bit(pmap);
5546		PG_RW = pmap_rw_bit(pmap);
5547		pde = pmap_pde(pmap, pv->pv_va);
5548		KASSERT((*pde & PG_PS) == 0,
5549		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
5550		    m));
5551		pte = pmap_pde_to_pte(pde, pv->pv_va);
5552retry:
5553		oldpte = *pte;
5554		if (oldpte & PG_RW) {
5555			if (!atomic_cmpset_long(pte, oldpte, oldpte &
5556			    ~(PG_RW | PG_M)))
5557				goto retry;
5558			if ((oldpte & PG_M) != 0)
5559				vm_page_dirty(m);
5560			pmap_invalidate_page(pmap, pv->pv_va);
5561		}
5562		PMAP_UNLOCK(pmap);
5563	}
5564	rw_wunlock(lock);
5565	vm_page_aflag_clear(m, PGA_WRITEABLE);
5566	rw_runlock(&pvh_global_lock);
5567}
5568
5569static __inline boolean_t
5570safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
5571{
5572
5573	if (!pmap_emulate_ad_bits(pmap))
5574		return (TRUE);
5575
5576	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
5577
5578	/*
5579	 * RWX = 010 or 110 will cause an unconditional EPT misconfiguration
5580	 * so we don't let the referenced (aka EPT_PG_READ) bit be cleared
5581	 * if the EPT_PG_WRITE bit is set.
5582	 */
5583	if ((pte & EPT_PG_WRITE) != 0)
5584		return (FALSE);
5585
5586	/*
5587	 * RWX = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
5588	 */
5589	if ((pte & EPT_PG_EXECUTE) == 0 ||
5590	    ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
5591		return (TRUE);
5592	else
5593		return (FALSE);
5594}
5595
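/*
 * An upper bound on the number of reference bits that a single call to
 * pmap_ts_referenced() will clear or count before returning.
 */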
5596#define	PMAP_TS_REFERENCED_MAX	5
5597
5598/*
5599 *	pmap_ts_referenced:
5600 *
5601 *	Return a count of reference bits for a page, clearing those bits.
5602 *	It is not necessary for every reference bit to be cleared, but it
5603 *	is necessary that 0 only be returned when there are truly no
5604 *	reference bits set.
5605 *
5606 *	XXX: The exact number of bits to check and clear is a matter that
5607 *	should be tested and standardized at some point in the future for
5608 *	optimal aging of shared pages.
5609 */
5610int
5611pmap_ts_referenced(vm_page_t m)
5612{
5613	struct md_page *pvh;
5614	pv_entry_t pv, pvf;
5615	pmap_t pmap;
5616	struct rwlock *lock;
5617	pd_entry_t oldpde, *pde;
5618	pt_entry_t *pte, PG_A;
5619	vm_offset_t va;
5620	vm_paddr_t pa;
5621	int cleared, md_gen, not_cleared, pvh_gen;
5622	struct spglist free;
5623	boolean_t demoted;
5624
5625	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5626	    ("pmap_ts_referenced: page %p is not managed", m));
5627	SLIST_INIT(&free);
5628	cleared = 0;
5629	pa = VM_PAGE_TO_PHYS(m);
5630	lock = PHYS_TO_PV_LIST_LOCK(pa);
5631	pvh = pa_to_pvh(pa);
5632	rw_rlock(&pvh_global_lock);
5633	rw_wlock(lock);
5634retry:
5635	not_cleared = 0;
5636	if ((m->flags & PG_FICTITIOUS) != 0 ||
5637	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
5638		goto small_mappings;
5639	pv = pvf;
5640	do {
5641		if (pvf == NULL)
5642			pvf = pv;
5643		pmap = PV_PMAP(pv);
5644		if (!PMAP_TRYLOCK(pmap)) {
5645			pvh_gen = pvh->pv_gen;
5646			rw_wunlock(lock);
5647			PMAP_LOCK(pmap);
5648			rw_wlock(lock);
5649			if (pvh_gen != pvh->pv_gen) {
5650				PMAP_UNLOCK(pmap);
5651				goto retry;
5652			}
5653		}
5654		PG_A = pmap_accessed_bit(pmap);
5655		va = pv->pv_va;
5656		pde = pmap_pde(pmap, pv->pv_va);
5657		oldpde = *pde;
5658		if ((*pde & PG_A) != 0) {
5659			/*
5660			 * Since this reference bit is shared by 512 4KB
5661			 * pages, it should not be cleared every time it is
5662			 * tested.  Apply a simple "hash" function on the
5663			 * physical page number, the virtual superpage number,
5664			 * and the pmap address to select one 4KB page out of
5665			 * the 512 on which testing the reference bit will
5666			 * result in clearing that reference bit.  This
5667			 * function is designed to avoid the selection of the
5668			 * same 4KB page for every 2MB page mapping.
5669			 *
5670			 * On demotion, a mapping that hasn't been referenced
5671			 * is simply destroyed.  To avoid the possibility of a
5672			 * subsequent page fault on a demoted wired mapping,
5673			 * always leave its reference bit set.  Moreover,
5674			 * since the superpage is wired, the current state of
5675			 * its reference bit won't affect page replacement.
5676			 */
5677			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
5678			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
5679			    (*pde & PG_W) == 0) {
5680				if (safe_to_clear_referenced(pmap, oldpde)) {
5681					atomic_clear_long(pde, PG_A);
5682					pmap_invalidate_page(pmap, pv->pv_va);
5683					demoted = FALSE;
5684				} else if (pmap_demote_pde_locked(pmap, pde,
5685				    pv->pv_va, &lock)) {
5686					/*
5687					 * Remove the mapping to a single page
5688					 * so that a subsequent access may
5689					 * repromote.  Since the underlying
5690					 * page table page is fully populated,
5691					 * this removal never frees a page
5692					 * table page.
5693					 */
5694					demoted = TRUE;
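					/*
					 * Compute the virtual address,
					 * within the demoted mapping, of
					 * the 4KB page that maps "m".
					 */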
5695					va += VM_PAGE_TO_PHYS(m) - (oldpde &
5696					    PG_PS_FRAME);
5697					pte = pmap_pde_to_pte(pde, va);
5698					pmap_remove_pte(pmap, pte, va, *pde,
5699					    NULL, &lock);
5700					pmap_invalidate_page(pmap, va);
5701				} else
5702					demoted = TRUE;
5703
5704				if (demoted) {
5705					/*
5706					 * The superpage mapping was removed
5707					 * entirely and therefore 'pv' is no
5708					 * longer valid.
5709					 */
5710					if (pvf == pv)
5711						pvf = NULL;
5712					pv = NULL;
5713				}
5714				cleared++;
5715				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5716				    ("inconsistent pv lock %p %p for page %p",
5717				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5718			} else
5719				not_cleared++;
5720		}
5721		PMAP_UNLOCK(pmap);
5722		/* Rotate the PV list if it has more than one entry. */
5723		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
5724			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5725			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
5726			pvh->pv_gen++;
5727		}
5728		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
5729			goto out;
5730	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
5731small_mappings:
5732	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
5733		goto out;
5734	pv = pvf;
5735	do {
5736		if (pvf == NULL)
5737			pvf = pv;
5738		pmap = PV_PMAP(pv);
5739		if (!PMAP_TRYLOCK(pmap)) {
5740			pvh_gen = pvh->pv_gen;
5741			md_gen = m->md.pv_gen;
5742			rw_wunlock(lock);
5743			PMAP_LOCK(pmap);
5744			rw_wlock(lock);
5745			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5746				PMAP_UNLOCK(pmap);
5747				goto retry;
5748			}
5749		}
5750		PG_A = pmap_accessed_bit(pmap);
5751		pde = pmap_pde(pmap, pv->pv_va);
5752		KASSERT((*pde & PG_PS) == 0,
5753		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
5754		    m));
5755		pte = pmap_pde_to_pte(pde, pv->pv_va);
5756		if ((*pte & PG_A) != 0) {
5757			if (safe_to_clear_referenced(pmap, *pte)) {
5758				atomic_clear_long(pte, PG_A);
5759				pmap_invalidate_page(pmap, pv->pv_va);
5760				cleared++;
5761			} else if ((*pte & PG_W) == 0) {
5762				/*
5763				 * Wired pages cannot be paged out, so
5764				 * accessed bit emulation for them is
5765				 * wasted effort; do the hard work for
5766				 * unwired pages only.
5767				 */
5768				pmap_remove_pte(pmap, pte, pv->pv_va,
5769				    *pde, &free, &lock);
5770				pmap_invalidate_page(pmap, pv->pv_va);
5771				cleared++;
5772				if (pvf == pv)
5773					pvf = NULL;
5774				pv = NULL;
5775				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5776				    ("inconsistent pv lock %p %p for page %p",
5777				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5778			} else
5779				not_cleared++;
5780		}
5781		PMAP_UNLOCK(pmap);
5782		/* Rotate the PV list if it has more than one entry. */
5783		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
5784			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5785			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5786			m->md.pv_gen++;
5787		}
5788	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
5789	    not_cleared < PMAP_TS_REFERENCED_MAX);
5790out:
5791	rw_wunlock(lock);
5792	rw_runlock(&pvh_global_lock);
5793	pmap_free_zero_pages(&free);
5794	return (cleared + not_cleared);
5795}
5796
5797/*
5798 *	Apply the given advice to the specified range of addresses within the
5799 *	given pmap.  Depending on the advice, clear the referenced and/or
5800 *	modified flags in each mapping and set the mapped page's dirty field.
5801 */
5802void
5803pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
5804{
5805	struct rwlock *lock;
5806	pml4_entry_t *pml4e;
5807	pdp_entry_t *pdpe;
5808	pd_entry_t oldpde, *pde;
5809	pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
5810	vm_offset_t va_next;
5811	vm_page_t m;
5812	boolean_t anychanged, pv_lists_locked;
5813
5814	if (advice != MADV_DONTNEED && advice != MADV_FREE)
5815		return;
5816
5817	/*
5818	 * A/D bit emulation requires an alternate code path when clearing
5819	 * the modified and accessed bits below. Since this function is
5820	 * advisory in nature, we skip it entirely for pmaps that require
5821	 * A/D bit emulation.
5822	 */
5823	if (pmap_emulate_ad_bits(pmap))
5824		return;
5825
5826	PG_A = pmap_accessed_bit(pmap);
5827	PG_G = pmap_global_bit(pmap);
5828	PG_M = pmap_modified_bit(pmap);
5829	PG_V = pmap_valid_bit(pmap);
5830	PG_RW = pmap_rw_bit(pmap);
5831
5832	pv_lists_locked = FALSE;
5833resume:
5834	anychanged = FALSE;
5835	PMAP_LOCK(pmap);
5836	for (; sva < eva; sva = va_next) {
5837		pml4e = pmap_pml4e(pmap, sva);
5838		if ((*pml4e & PG_V) == 0) {
5839			va_next = (sva + NBPML4) & ~PML4MASK;
5840			if (va_next < sva)
5841				va_next = eva;
5842			continue;
5843		}
5844		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
5845		if ((*pdpe & PG_V) == 0) {
5846			va_next = (sva + NBPDP) & ~PDPMASK;
5847			if (va_next < sva)
5848				va_next = eva;
5849			continue;
5850		}
5851		va_next = (sva + NBPDR) & ~PDRMASK;
5852		if (va_next < sva)
5853			va_next = eva;
5854		pde = pmap_pdpe_to_pde(pdpe, sva);
5855		oldpde = *pde;
5856		if ((oldpde & PG_V) == 0)
5857			continue;
5858		else if ((oldpde & PG_PS) != 0) {
5859			if ((oldpde & PG_MANAGED) == 0)
5860				continue;
5861			if (!pv_lists_locked) {
5862				pv_lists_locked = TRUE;
5863				if (!rw_try_rlock(&pvh_global_lock)) {
5864					if (anychanged)
5865						pmap_invalidate_all(pmap);
5866					PMAP_UNLOCK(pmap);
5867					rw_rlock(&pvh_global_lock);
5868					goto resume;
5869				}
5870			}
5871			lock = NULL;
5872			if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
5873				if (lock != NULL)
5874					rw_wunlock(lock);
5875
5876				/*
5877				 * The large page mapping was destroyed.
5878				 */
5879				continue;
5880			}
5881
5882			/*
5883			 * Unless the page mappings are wired, remove the
5884			 * mapping to a single page so that a subsequent
5885			 * access may repromote.  Since the underlying page
5886			 * table page is fully populated, this removal never
5887			 * frees a page table page.
5888			 */
5889			if ((oldpde & PG_W) == 0) {
5890				pte = pmap_pde_to_pte(pde, sva);
5891				KASSERT((*pte & PG_V) != 0,
5892				    ("pmap_advise: invalid PTE"));
5893				pmap_remove_pte(pmap, pte, sva, *pde, NULL,
5894				    &lock);
5895				anychanged = TRUE;
5896			}
5897			if (lock != NULL)
5898				rw_wunlock(lock);
5899		}
5900		if (va_next > eva)
5901			va_next = eva;
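		/*
		 * Clear the accessed bit in each resident, managed 4KB
		 * mapping within the range.  For MADV_DONTNEED, transfer a
		 * writeable, modified mapping's dirty state to the vm_page
		 * before clearing the modified bit; for MADV_FREE, the
		 * modified bit is simply discarded.
		 */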
5902		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
5903		    sva += PAGE_SIZE) {
5904			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
5905			    PG_V))
5906				continue;
5907			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5908				if (advice == MADV_DONTNEED) {
5909					/*
5910					 * Future calls to pmap_is_modified()
5911					 * can be avoided by making the page
5912					 * dirty now.
5913					 */
5914					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
5915					vm_page_dirty(m);
5916				}
5917				atomic_clear_long(pte, PG_M | PG_A);
5918			} else if ((*pte & PG_A) != 0)
5919				atomic_clear_long(pte, PG_A);
5920			else
5921				continue;
5922			if ((*pte & PG_G) != 0)
5923				pmap_invalidate_page(pmap, sva);
5924			else
5925				anychanged = TRUE;
5926		}
5927	}
5928	if (anychanged)
5929		pmap_invalidate_all(pmap);
5930	if (pv_lists_locked)
5931		rw_runlock(&pvh_global_lock);
5932	PMAP_UNLOCK(pmap);
5933}
5934
5935/*
5936 *	Clear the modify bits on the specified physical page.
5937 */
5938void
5939pmap_clear_modify(vm_page_t m)
5940{
5941	struct md_page *pvh;
5942	pmap_t pmap;
5943	pv_entry_t next_pv, pv;
5944	pd_entry_t oldpde, *pde;
5945	pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
5946	struct rwlock *lock;
5947	vm_offset_t va;
5948	int md_gen, pvh_gen;
5949
5950	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5951	    ("pmap_clear_modify: page %p is not managed", m));
5952	VM_OBJECT_ASSERT_WLOCKED(m->object);
5953	KASSERT(!vm_page_xbusied(m),
5954	    ("pmap_clear_modify: page %p is exclusive busied", m));
5955
5956	/*
5957	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
5958	 * If the object containing the page is locked and the page is not
5959	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
5960	 */
5961	if ((m->aflags & PGA_WRITEABLE) == 0)
5962		return;
5963	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5964	rw_rlock(&pvh_global_lock);
5965	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5966	rw_wlock(lock);
5967restart:
5968	if ((m->flags & PG_FICTITIOUS) != 0)
5969		goto small_mappings;
5970	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5971		pmap = PV_PMAP(pv);
5972		if (!PMAP_TRYLOCK(pmap)) {
5973			pvh_gen = pvh->pv_gen;
5974			rw_wunlock(lock);
5975			PMAP_LOCK(pmap);
5976			rw_wlock(lock);
5977			if (pvh_gen != pvh->pv_gen) {
5978				PMAP_UNLOCK(pmap);
5979				goto restart;
5980			}
5981		}
5982		PG_M = pmap_modified_bit(pmap);
5983		PG_V = pmap_valid_bit(pmap);
5984		PG_RW = pmap_rw_bit(pmap);
5985		va = pv->pv_va;
5986		pde = pmap_pde(pmap, va);
5987		oldpde = *pde;
5988		if ((oldpde & PG_RW) != 0) {
5989			if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
5990				if ((oldpde & PG_W) == 0) {
5991					/*
5992					 * Write protect the mapping to a
5993					 * single page so that a subsequent
5994					 * write access may repromote.
5995					 */
5996					va += VM_PAGE_TO_PHYS(m) - (oldpde &
5997					    PG_PS_FRAME);
5998					pte = pmap_pde_to_pte(pde, va);
5999					oldpte = *pte;
6000					if ((oldpte & PG_V) != 0) {
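						/*
						 * Atomically clear PG_M and
						 * PG_RW, rereading the PTE
						 * until the update succeeds.
						 */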
6001						while (!atomic_cmpset_long(pte,
6002						    oldpte,
6003						    oldpte & ~(PG_M | PG_RW)))
6004							oldpte = *pte;
6005						vm_page_dirty(m);
6006						pmap_invalidate_page(pmap, va);
6007					}
6008				}
6009			}
6010		}
6011		PMAP_UNLOCK(pmap);
6012	}
6013small_mappings:
6014	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6015		pmap = PV_PMAP(pv);
6016		if (!PMAP_TRYLOCK(pmap)) {
6017			md_gen = m->md.pv_gen;
6018			pvh_gen = pvh->pv_gen;
6019			rw_wunlock(lock);
6020			PMAP_LOCK(pmap);
6021			rw_wlock(lock);
6022			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6023				PMAP_UNLOCK(pmap);
6024				goto restart;
6025			}
6026		}
6027		PG_M = pmap_modified_bit(pmap);
6028		PG_RW = pmap_rw_bit(pmap);
6029		pde = pmap_pde(pmap, pv->pv_va);
6030		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
6031		    " a 2mpage in page %p's pv list", m));
6032		pte = pmap_pde_to_pte(pde, pv->pv_va);
6033		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6034			atomic_clear_long(pte, PG_M);
6035			pmap_invalidate_page(pmap, pv->pv_va);
6036		}
6037		PMAP_UNLOCK(pmap);
6038	}
6039	rw_wunlock(lock);
6040	rw_runlock(&pvh_global_lock);
6041}
6042
6043/*
6044 * Miscellaneous support routines follow
6045 */
6046
6047/* Adjust the cache mode for a 4KB page mapped via a PTE. */
6048static __inline void
6049pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
6050{
6051	u_int opte, npte;
6052
6053	/*
6054	 * The cache mode bits are all in the low 32-bits of the
6055	 * PTE, so we can just spin on updating the low 32-bits.
6056	 */
6057	do {
6058		opte = *(u_int *)pte;
6059		npte = opte & ~mask;
6060		npte |= cache_bits;
6061	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
6062}
6063
6064/* Adjust the cache mode for a 2MB page mapped via a PDE. */
6065static __inline void
6066pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
6067{
6068	u_int opde, npde;
6069
6070	/*
6071	 * The cache mode bits are all in the low 32-bits of the
6072	 * PDE, so we can just spin on updating the low 32-bits.
6073	 */
6074	do {
6075		opde = *(u_int *)pde;
6076		npde = opde & ~mask;
6077		npde |= cache_bits;
6078	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
6079}
6080
6081/*
6082 * Map a set of physical memory pages into the kernel virtual
6083 * address space. Return a pointer to where it is mapped. This
6084 * routine is intended to be used for mapping device memory,
6085 * NOT real memory.
6086 */
6087void *
6088pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
6089{
6090	vm_offset_t va, offset;
6091	vm_size_t tmpsize;
6092
6093	/*
6094	 * If the specified range of physical addresses fits within the direct
6095	 * map window, use the direct map.
6096	 */
6097	if (pa < dmaplimit && pa + size < dmaplimit) {
6098		va = PHYS_TO_DMAP(pa);
6099		if (!pmap_change_attr(va, size, mode))
6100			return ((void *)va);
6101	}
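	/*
	 * Otherwise, allocate kernel virtual address space and map each
	 * 4KB page with the requested memory attribute.
	 */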
6102	offset = pa & PAGE_MASK;
6103	size = round_page(offset + size);
6104	va = kva_alloc(size);
6105	if (!va)
6106		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
6107	pa = trunc_page(pa);
6108	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
6109		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
6110	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
6111	pmap_invalidate_cache_range(va, va + tmpsize);
6112	return ((void *)(va + offset));
6113}
6114
6115void *
6116pmap_mapdev(vm_paddr_t pa, vm_size_t size)
6117{
6118
6119	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
6120}
6121
6122void *
6123pmap_mapbios(vm_paddr_t pa, vm_size_t size)
6124{
6125
6126	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
6127}
6128
6129void
6130pmap_unmapdev(vm_offset_t va, vm_size_t size)
6131{
6132	vm_offset_t base, offset;
6133
6134	/* If pmap_mapdev() gave out a direct map address, do nothing. */
6135	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
6136		return;
6137	base = trunc_page(va);
6138	offset = va & PAGE_MASK;
6139	size = round_page(offset + size);
6140	kva_free(base, size);
6141}
6142
6143/*
6144 * Tries to demote a 1GB page mapping.
6145 */
6146static boolean_t
6147pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
6148{
6149	pdp_entry_t newpdpe, oldpdpe;
6150	pd_entry_t *firstpde, newpde, *pde;
6151	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
6152	vm_paddr_t mpdepa;
6153	vm_page_t mpde;
6154
6155	PG_A = pmap_accessed_bit(pmap);
6156	PG_M = pmap_modified_bit(pmap);
6157	PG_V = pmap_valid_bit(pmap);
6158	PG_RW = pmap_rw_bit(pmap);
6159
6160	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6161	oldpdpe = *pdpe;
6162	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
6163	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
6164	if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
6165	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
6166		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
6167		    " in pmap %p", va, pmap);
6168		return (FALSE);
6169	}
6170	mpdepa = VM_PAGE_TO_PHYS(mpde);
6171	firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
6172	newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
6173	KASSERT((oldpdpe & PG_A) != 0,
6174	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
6175	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
6176	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
6177	newpde = oldpdpe;
6178
6179	/*
6180	 * Initialize the page directory page.
6181	 */
6182	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
6183		*pde = newpde;
6184		newpde += NBPDR;
6185	}
6186
6187	/*
6188	 * Demote the mapping.
6189	 */
6190	*pdpe = newpdpe;
6191
6192	/*
6193	 * Invalidate a stale recursive mapping of the page directory page.
6194	 */
6195	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
6196
6197	pmap_pdpe_demotions++;
6198	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
6199	    " in pmap %p", va, pmap);
6200	return (TRUE);
6201}
6202
6203/*
6204 * Sets the memory attribute for the specified page.
6205 */
6206void
6207pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
6208{
6209
6210	m->md.pat_mode = ma;
6211
6212	/*
6213	 * If "m" is a normal page, update its direct mapping.  This update
6214	 * can be relied upon to perform any cache operations that are
6215	 * required for data coherence.
6216	 */
6217	if ((m->flags & PG_FICTITIOUS) == 0 &&
6218	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
6219	    m->md.pat_mode))
6220		panic("memory attribute change on the direct map failed");
6221}
6222
6223/*
6224 * Changes the specified virtual address range's memory type to that given by
6225 * the parameter "mode".  The specified virtual address range must be
6226 * completely contained within either the direct map or the kernel map.  If
6227 * the virtual address range is contained within the kernel map, then the
6228 * memory type for each of the corresponding ranges of the direct map is also
6229 * changed.  (The corresponding ranges of the direct map are those ranges that
6230 * map the same physical pages as the specified virtual address range.)  These
6231 * changes to the direct map are necessary because Intel describes the
6232 * behavior of their processors as "undefined" if two or more mappings to the
6233 * same physical page have different memory types.
6234 *
6235 * Returns zero if the change completed successfully, and either EINVAL or
6236 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
6237 * of the virtual address range was not mapped, and ENOMEM is returned if
6238 * there was insufficient memory available to complete the change.  In the
6239 * latter case, the memory type may have been changed on some part of the
6240 * virtual address range or the direct map.
6241 */
6242int
6243pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
6244{
6245	int error;
6246
6247	PMAP_LOCK(kernel_pmap);
6248	error = pmap_change_attr_locked(va, size, mode);
6249	PMAP_UNLOCK(kernel_pmap);
6250	return (error);
6251}
6252
6253static int
6254pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
6255{
6256	vm_offset_t base, offset, tmpva;
6257	vm_paddr_t pa_start, pa_end;
6258	pdp_entry_t *pdpe;
6259	pd_entry_t *pde;
6260	pt_entry_t *pte;
6261	int cache_bits_pte, cache_bits_pde, error;
6262	boolean_t changed;
6263
6264	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6265	base = trunc_page(va);
6266	offset = va & PAGE_MASK;
6267	size = round_page(offset + size);
6268
6269	/*
6270	 * Only supported on kernel virtual addresses, including the direct
6271	 * map but excluding the recursive map.
6272	 */
6273	if (base < DMAP_MIN_ADDRESS)
6274		return (EINVAL);
6275
6276	cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
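	/*
	 * Compute the PAT/PCD/PWT cache mode bits for "mode" in both the
	 * large page (PDE) and 4KB page (PTE) entry formats.
	 */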
6277	cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
6278	changed = FALSE;
6279
6280	/*
6281	 * Pages that aren't mapped aren't supported.  Also break down 2MB pages
6282	 * into 4KB pages if required.
6283	 */
6284	for (tmpva = base; tmpva < base + size; ) {
6285		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6286		if (*pdpe == 0)
6287			return (EINVAL);
6288		if (*pdpe & PG_PS) {
6289			/*
6290			 * If the current 1GB page already has the required
6291			 * memory type, then we need not demote this page. Just
6292			 * increment tmpva to the next 1GB page frame.
6293			 */
6294			if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
6295				tmpva = trunc_1gpage(tmpva) + NBPDP;
6296				continue;
6297			}
6298
6299			/*
6300			 * If the current offset aligns with a 1GB page frame
6301			 * and there is at least 1GB left within the range, then
6302			 * we need not break down this page into 2MB pages.
6303			 */
6304			if ((tmpva & PDPMASK) == 0 &&
6305			    tmpva + PDPMASK < base + size) {
6306				tmpva += NBPDP;
6307				continue;
6308			}
6309			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
6310				return (ENOMEM);
6311		}
6312		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6313		if (*pde == 0)
6314			return (EINVAL);
6315		if (*pde & PG_PS) {
6316			/*
6317			 * If the current 2MB page already has the required
6318			 * memory type, then we need not demote this page. Just
6319			 * increment tmpva to the next 2MB page frame.
6320			 */
6321			if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
6322				tmpva = trunc_2mpage(tmpva) + NBPDR;
6323				continue;
6324			}
6325
6326			/*
6327			 * If the current offset aligns with a 2MB page frame
6328			 * and there is at least 2MB left within the range, then
6329			 * we need not break down this page into 4KB pages.
6330			 */
6331			if ((tmpva & PDRMASK) == 0 &&
6332			    tmpva + PDRMASK < base + size) {
6333				tmpva += NBPDR;
6334				continue;
6335			}
6336			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
6337				return (ENOMEM);
6338		}
6339		pte = pmap_pde_to_pte(pde, tmpva);
6340		if (*pte == 0)
6341			return (EINVAL);
6342		tmpva += PAGE_SIZE;
6343	}
6344	error = 0;
6345
6346	/*
6347	 * Ok, all the pages exist, so run through them updating their
6348	 * cache mode if required.
6349	 */
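	/*
	 * While walking the kernel map, accumulate runs of contiguous
	 * physical pages so that the aliasing direct map entries can be
	 * updated with as few recursive calls as possible.
	 */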
6350	pa_start = pa_end = 0;
6351	for (tmpva = base; tmpva < base + size; ) {
6352		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6353		if (*pdpe & PG_PS) {
6354			if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
6355				pmap_pde_attr(pdpe, cache_bits_pde,
6356				    X86_PG_PDE_CACHE);
6357				changed = TRUE;
6358			}
6359			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6360				if (pa_start == pa_end) {
6361					/* Start physical address run. */
6362					pa_start = *pdpe & PG_PS_FRAME;
6363					pa_end = pa_start + NBPDP;
6364				} else if (pa_end == (*pdpe & PG_PS_FRAME))
6365					pa_end += NBPDP;
6366				else {
6367					/* Run ended, update direct map. */
6368					error = pmap_change_attr_locked(
6369					    PHYS_TO_DMAP(pa_start),
6370					    pa_end - pa_start, mode);
6371					if (error != 0)
6372						break;
6373					/* Start physical address run. */
6374					pa_start = *pdpe & PG_PS_FRAME;
6375					pa_end = pa_start + NBPDP;
6376				}
6377			}
6378			tmpva = trunc_1gpage(tmpva) + NBPDP;
6379			continue;
6380		}
6381		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6382		if (*pde & PG_PS) {
6383			if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
6384				pmap_pde_attr(pde, cache_bits_pde,
6385				    X86_PG_PDE_CACHE);
6386				changed = TRUE;
6387			}
6388			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6389				if (pa_start == pa_end) {
6390					/* Start physical address run. */
6391					pa_start = *pde & PG_PS_FRAME;
6392					pa_end = pa_start + NBPDR;
6393				} else if (pa_end == (*pde & PG_PS_FRAME))
6394					pa_end += NBPDR;
6395				else {
6396					/* Run ended, update direct map. */
6397					error = pmap_change_attr_locked(
6398					    PHYS_TO_DMAP(pa_start),
6399					    pa_end - pa_start, mode);
6400					if (error != 0)
6401						break;
6402					/* Start physical address run. */
6403					pa_start = *pde & PG_PS_FRAME;
6404					pa_end = pa_start + NBPDR;
6405				}
6406			}
6407			tmpva = trunc_2mpage(tmpva) + NBPDR;
6408		} else {
6409			pte = pmap_pde_to_pte(pde, tmpva);
6410			if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
6411				pmap_pte_attr(pte, cache_bits_pte,
6412				    X86_PG_PTE_CACHE);
6413				changed = TRUE;
6414			}
6415			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6416				if (pa_start == pa_end) {
6417					/* Start physical address run. */
6418					pa_start = *pte & PG_FRAME;
6419					pa_end = pa_start + PAGE_SIZE;
6420				} else if (pa_end == (*pte & PG_FRAME))
6421					pa_end += PAGE_SIZE;
6422				else {
6423					/* Run ended, update direct map. */
6424					error = pmap_change_attr_locked(
6425					    PHYS_TO_DMAP(pa_start),
6426					    pa_end - pa_start, mode);
6427					if (error != 0)
6428						break;
6429					/* Start physical address run. */
6430					pa_start = *pte & PG_FRAME;
6431					pa_end = pa_start + PAGE_SIZE;
6432				}
6433			}
6434			tmpva += PAGE_SIZE;
6435		}
6436	}
6437	if (error == 0 && pa_start != pa_end)
6438		error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
6439		    pa_end - pa_start, mode);
6440
6441	/*
6442	 * Flush the TLB and CPU caches if required so that no stale
6443	 * translations or cached data remain for the updated range.
6444	 */
6445	if (changed) {
6446		pmap_invalidate_range(kernel_pmap, base, tmpva);
6447		pmap_invalidate_cache_range(base, tmpva);
6448	}
6449	return (error);
6450}
6451
6452/*
6453 * Demotes any mapping within the direct map region that covers more than the
6454 * specified range of physical addresses.  This range's size must be a power
6455 * of two and its starting address must be a multiple of its size.  Since the
6456 * demotion does not change any attributes of the mapping, a TLB invalidation
6457 * is not mandatory.  The caller may, however, request a TLB invalidation.
6458 */
6459void
6460pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
6461{
6462	pdp_entry_t *pdpe;
6463	pd_entry_t *pde;
6464	vm_offset_t va;
6465	boolean_t changed;
6466
6467	if (len == 0)
6468		return;
6469	KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
6470	KASSERT((base & (len - 1)) == 0,
6471	    ("pmap_demote_DMAP: base is not a multiple of len"));
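	/*
	 * A properly aligned range of NBPDP (1GB) or more is necessarily
	 * covered by whole 1GB and 2MB mappings, so only smaller ranges
	 * can require demotion.
	 */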
6472	if (len < NBPDP && base < dmaplimit) {
6473		va = PHYS_TO_DMAP(base);
6474		changed = FALSE;
6475		PMAP_LOCK(kernel_pmap);
6476		pdpe = pmap_pdpe(kernel_pmap, va);
6477		if ((*pdpe & X86_PG_V) == 0)
6478			panic("pmap_demote_DMAP: invalid PDPE");
6479		if ((*pdpe & PG_PS) != 0) {
6480			if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
6481				panic("pmap_demote_DMAP: PDPE failed");
6482			changed = TRUE;
6483		}
6484		if (len < NBPDR) {
6485			pde = pmap_pdpe_to_pde(pdpe, va);
6486			if ((*pde & X86_PG_V) == 0)
6487				panic("pmap_demote_DMAP: invalid PDE");
6488			if ((*pde & PG_PS) != 0) {
6489				if (!pmap_demote_pde(kernel_pmap, pde, va))
6490					panic("pmap_demote_DMAP: PDE failed");
6491				changed = TRUE;
6492			}
6493		}
6494		if (changed && invalidate)
6495			pmap_invalidate_page(kernel_pmap, va);
6496		PMAP_UNLOCK(kernel_pmap);
6497	}
6498}
6499
6500/*
6501 * Perform the pmap work for mincore(2).
6502 */
6503int
6504pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
6505{
6506	pd_entry_t *pdep;
6507	pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
6508	vm_paddr_t pa;
6509	int val;
6510
6511	PG_A = pmap_accessed_bit(pmap);
6512	PG_M = pmap_modified_bit(pmap);
6513	PG_V = pmap_valid_bit(pmap);
6514	PG_RW = pmap_rw_bit(pmap);
6515
6516	PMAP_LOCK(pmap);
6517retry:
6518	pdep = pmap_pde(pmap, addr);
6519	if (pdep != NULL && (*pdep & PG_V)) {
6520		if (*pdep & PG_PS) {
6521			pte = *pdep;
6522			/* Compute the physical address of the 4KB page. */
6523			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
6524			    PG_FRAME;
6525			val = MINCORE_SUPER;
6526		} else {
6527			pte = *pmap_pde_to_pte(pdep, addr);
6528			pa = pte & PG_FRAME;
6529			val = 0;
6530		}
6531	} else {
6532		pte = 0;
6533		pa = 0;
6534		val = 0;
6535	}
6536	if ((pte & PG_V) != 0) {
6537		val |= MINCORE_INCORE;
6538		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6539			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
6540		if ((pte & PG_A) != 0)
6541			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
6542	}
6543	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
6544	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
6545	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
6546		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
6547		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
6548			goto retry;
6549	} else
6550		PA_UNLOCK_COND(*locked_pa);
6551	PMAP_UNLOCK(pmap);
6552	return (val);
6553}
6554
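/*
 * Make the given thread's pmap active on the current CPU: update the
 * pmap's active CPU set, load the new page table base into %cr3, and
 * record the pmap as the CPU's current pmap.
 */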
6555void
6556pmap_activate(struct thread *td)
6557{
6558	pmap_t	pmap, oldpmap;
6559	u_int	cpuid;
6560
6561	critical_enter();
6562	pmap = vmspace_pmap(td->td_proc->p_vmspace);
6563	oldpmap = PCPU_GET(curpmap);
6564	cpuid = PCPU_GET(cpuid);
6565#ifdef SMP
6566	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
6567	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
6568	CPU_SET_ATOMIC(cpuid, &pmap->pm_save);
6569#else
6570	CPU_CLR(cpuid, &oldpmap->pm_active);
6571	CPU_SET(cpuid, &pmap->pm_active);
6572	CPU_SET(cpuid, &pmap->pm_save);
6573#endif
6574	td->td_pcb->pcb_cr3 = pmap->pm_cr3;
6575	load_cr3(pmap->pm_cr3);
6576	PCPU_SET(curpmap, pmap);
6577	critical_exit();
6578}
6579
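/*
 * Instruction caches on amd64 are kept coherent by the hardware, so no
 * explicit synchronization is required and this function is a no-op.
 */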
6580void
6581pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
6582{
6583}
6584
6585/*
6586 *	Increase the starting virtual address of the given mapping if a
6587 *	different alignment might result in more superpage mappings.
6588 */
6589void
6590pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
6591    vm_offset_t *addr, vm_size_t size)
6592{
6593	vm_offset_t superpage_offset;
6594
6595	if (size < NBPDR)
6596		return;
6597	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
6598		offset += ptoa(object->pg_color);
6599	superpage_offset = offset & PDRMASK;
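	/*
	 * Give up if the realigned mapping could not contain a full
	 * superpage, or if "*addr" already has the desired alignment
	 * relative to a superpage boundary.
	 */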
6600	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
6601	    (*addr & PDRMASK) == superpage_offset)
6602		return;
6603	if ((*addr & PDRMASK) < superpage_offset)
6604		*addr = (*addr & ~PDRMASK) + superpage_offset;
6605	else
6606		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
6607}
6608
6609#ifdef INVARIANTS
6610static unsigned long num_dirty_emulations;
6611SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
6612	     &num_dirty_emulations, 0, NULL);
6613
6614static unsigned long num_accessed_emulations;
6615SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
6616	     &num_accessed_emulations, 0, NULL);
6617
6618static unsigned long num_superpage_accessed_emulations;
6619SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
6620	     &num_superpage_accessed_emulations, 0, NULL);
6621
6622static unsigned long ad_emulation_superpage_promotions;
6623SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
6624	     &ad_emulation_superpage_promotions, 0, NULL);
6625#endif	/* INVARIANTS */
6626
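/*
 * Handle a fault taken because the pmap emulates the accessed and dirty
 * bits in software.  Set PG_A (and PG_M for a write fault) in the
 * relevant entry and return 0, or return -1 if the fault cannot be
 * resolved here and must be handled by the caller.
 */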
6627int
6628pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
6629{
6630	int rv;
6631	struct rwlock *lock;
6632	vm_page_t m, mpte;
6633	pd_entry_t *pde;
6634	pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
6635	boolean_t pv_lists_locked;
6636
6637	KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
6638	    ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
6639
6640	if (!pmap_emulate_ad_bits(pmap))
6641		return (-1);
6642
6643	PG_A = pmap_accessed_bit(pmap);
6644	PG_M = pmap_modified_bit(pmap);
6645	PG_V = pmap_valid_bit(pmap);
6646	PG_RW = pmap_rw_bit(pmap);
6647
6648	rv = -1;
6649	lock = NULL;
6650	pv_lists_locked = FALSE;
6651retry:
6652	PMAP_LOCK(pmap);
6653
6654	pde = pmap_pde(pmap, va);
6655	if (pde == NULL || (*pde & PG_V) == 0)
6656		goto done;
6657
6658	if ((*pde & PG_PS) != 0) {
6659		if (ftype == VM_PROT_READ) {
6660#ifdef INVARIANTS
6661			atomic_add_long(&num_superpage_accessed_emulations, 1);
6662#endif
6663			*pde |= PG_A;
6664			rv = 0;
6665		}
6666		goto done;
6667	}
6668
6669	pte = pmap_pde_to_pte(pde, va);
6670	if ((*pte & PG_V) == 0)
6671		goto done;
6672
6673	if (ftype == VM_PROT_WRITE) {
6674		if ((*pte & PG_RW) == 0)
6675			goto done;
6676		*pte |= PG_M;
6677	}
6678	*pte |= PG_A;
6679
6680	/* try to promote the mapping */
6681	if (va < VM_MAXUSER_ADDRESS)
6682		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
6683	else
6684		mpte = NULL;
6685
6686	m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
6687
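	/*
	 * Promote only if the 4KB mappings' page table page is fully
	 * populated (wire_count == NPTEPG), superpages are enabled for
	 * this pmap, the page is not fictitious, and the underlying
	 * reservation is fully populated.
	 */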
6688	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
6689	    pmap_ps_enabled(pmap) &&
6690	    (m->flags & PG_FICTITIOUS) == 0 &&
6691	    vm_reserv_level_iffullpop(m) == 0) {
6692		if (!pv_lists_locked) {
6693			pv_lists_locked = TRUE;
6694			if (!rw_try_rlock(&pvh_global_lock)) {
6695				PMAP_UNLOCK(pmap);
6696				rw_rlock(&pvh_global_lock);
6697				goto retry;
6698			}
6699		}
6700		pmap_promote_pde(pmap, pde, va, &lock);
6701#ifdef INVARIANTS
6702		atomic_add_long(&ad_emulation_superpage_promotions, 1);
6703#endif
6704	}
6705#ifdef INVARIANTS
6706	if (ftype == VM_PROT_WRITE)
6707		atomic_add_long(&num_dirty_emulations, 1);
6708	else
6709		atomic_add_long(&num_accessed_emulations, 1);
6710#endif
6711	rv = 0;		/* success */
6712done:
6713	if (lock != NULL)
6714		rw_wunlock(lock);
6715	if (pv_lists_locked)
6716		rw_runlock(&pvh_global_lock);
6717	PMAP_UNLOCK(pmap);
6718	return (rv);
6719}
6720
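/*
 * Copy the paging-structure entries that translate "va", from the PML4
 * entry down to the PTE, into "ptr" and report in "*num" how many
 * entries were copied.  The walk stops after the first invalid or
 * superpage entry.
 */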
6721void
6722pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
6723{
6724	pml4_entry_t *pml4;
6725	pdp_entry_t *pdp;
6726	pd_entry_t *pde;
6727	pt_entry_t *pte, PG_V;
6728	int idx;
6729
6730	idx = 0;
6731	PG_V = pmap_valid_bit(pmap);
6732	PMAP_LOCK(pmap);
6733
6734	pml4 = pmap_pml4e(pmap, va);
6735	ptr[idx++] = *pml4;
6736	if ((*pml4 & PG_V) == 0)
6737		goto done;
6738
6739	pdp = pmap_pml4e_to_pdpe(pml4, va);
6740	ptr[idx++] = *pdp;
6741	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
6742		goto done;
6743
6744	pde = pmap_pdpe_to_pde(pdp, va);
6745	ptr[idx++] = *pde;
6746	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
6747		goto done;
6748
6749	pte = pmap_pde_to_pte(pde, va);
6750	ptr[idx++] = *pte;
6751
6752done:
6753	PMAP_UNLOCK(pmap);
6754	*num = idx;
6755}
6756
6757#include "opt_ddb.h"
6758#ifdef DDB
6759#include <ddb/ddb.h>
6760
6761DB_SHOW_COMMAND(pte, pmap_print_pte)
6762{
6763	pmap_t pmap;
6764	pml4_entry_t *pml4;
6765	pdp_entry_t *pdp;
6766	pd_entry_t *pde;
6767	pt_entry_t *pte, PG_V;
6768	vm_offset_t va;
6769
6770	if (have_addr) {
6771		va = (vm_offset_t)addr;
6772		pmap = PCPU_GET(curpmap); /* XXX */
6773	} else {
6774		db_printf("show pte addr\n");
6775		return;
6776	}
6777	PG_V = pmap_valid_bit(pmap);
6778	pml4 = pmap_pml4e(pmap, va);
6779	db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
6780	if ((*pml4 & PG_V) == 0) {
6781		db_printf("\n");
6782		return;
6783	}
6784	pdp = pmap_pml4e_to_pdpe(pml4, va);
6785	db_printf(" pdpe %#016lx", *pdp);
6786	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
6787		db_printf("\n");
6788		return;
6789	}
6790	pde = pmap_pdpe_to_pde(pdp, va);
6791	db_printf(" pde %#016lx", *pde);
6792	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
6793		db_printf("\n");
6794		return;
6795	}
6796	pte = pmap_pde_to_pte(pde, va);
6797	db_printf(" pte %#016lx\n", *pte);
6798}
6799
6800DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
6801{
6802	vm_paddr_t a;
6803
6804	if (have_addr) {
6805		a = (vm_paddr_t)addr;
6806		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
6807	} else {
6808		db_printf("show phys2dmap addr\n");
6809	}
6810}
6811#endif
6812