1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * the Systems Programming Group of the University of Utah Computer
15 * Science Department and William Jolitz of UUNET Technologies Inc.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 *    must display the following acknowledgement:
27 *	This product includes software developed by the University of
28 *	California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 *    may be used to endorse or promote products derived from this software
31 *    without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * SUCH DAMAGE.
44 *
45 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
46 */
47/*-
48 * Copyright (c) 2003 Networks Associates Technology, Inc.
49 * All rights reserved.
50 *
51 * This software was developed for the FreeBSD Project by Jake Burkholder,
52 * Safeport Network Services, and Network Associates Laboratories, the
53 * Security Research Division of Network Associates, Inc. under
54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55 * CHATS research program.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 *    notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 *    notice, this list of conditions and the following disclaimer in the
64 *    documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 */
78
79#define	AMD64_NPT_AWARE
80
81#include <sys/cdefs.h>
82__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/pmap.c 267964 2014-06-27 17:22:18Z jhb $");
83
84/*
85 *	Manages physical address maps.
86 *
87 *	Since the information managed by this module is
88 *	also stored by the logical address mapping module,
89 *	this module may throw away valid virtual-to-physical
90 *	mappings at almost any time.  However, invalidations
91 *	of virtual-to-physical mappings must be done as
92 *	requested.
93 *
94 *	In order to cope with hardware architectures which
95 *	make virtual-to-physical map invalidates expensive,
96 *	this module may delay invalidation or reduced-protection
97 *	operations until such time as they are actually
98 *	necessary.  This module is given full information as
99 *	to which processors are currently using which maps,
100 *	and to when physical maps must be made correct.
101 */
102
103#include "opt_pmap.h"
104#include "opt_vm.h"
105
106#include <sys/param.h>
107#include <sys/bus.h>
108#include <sys/systm.h>
109#include <sys/kernel.h>
110#include <sys/ktr.h>
111#include <sys/lock.h>
112#include <sys/malloc.h>
113#include <sys/mman.h>
114#include <sys/mutex.h>
115#include <sys/proc.h>
116#include <sys/rwlock.h>
117#include <sys/sx.h>
118#include <sys/vmmeter.h>
119#include <sys/sched.h>
120#include <sys/sysctl.h>
121#include <sys/_unrhdr.h>
122#include <sys/smp.h>
123
124#include <vm/vm.h>
125#include <vm/vm_param.h>
126#include <vm/vm_kern.h>
127#include <vm/vm_page.h>
128#include <vm/vm_map.h>
129#include <vm/vm_object.h>
130#include <vm/vm_extern.h>
131#include <vm/vm_pageout.h>
132#include <vm/vm_pager.h>
133#include <vm/vm_radix.h>
134#include <vm/vm_reserv.h>
135#include <vm/uma.h>
136
137#include <machine/intr_machdep.h>
138#include <machine/apicvar.h>
139#include <machine/cpu.h>
140#include <machine/cputypes.h>
141#include <machine/md_var.h>
142#include <machine/pcb.h>
143#include <machine/specialreg.h>
144#ifdef SMP
145#include <machine/smp.h>
146#endif
147
148static __inline boolean_t
149pmap_emulate_ad_bits(pmap_t pmap)
150{
151
152	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
153}
154
155static __inline pt_entry_t
156pmap_valid_bit(pmap_t pmap)
157{
158	pt_entry_t mask;
159
160	switch (pmap->pm_type) {
161	case PT_X86:
162		mask = X86_PG_V;
163		break;
164	case PT_EPT:
165		if (pmap_emulate_ad_bits(pmap))
166			mask = EPT_PG_EMUL_V;
167		else
168			mask = EPT_PG_READ;
169		break;
170	default:
171		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
172	}
173
174	return (mask);
175}
176
177static __inline pt_entry_t
178pmap_rw_bit(pmap_t pmap)
179{
180	pt_entry_t mask;
181
182	switch (pmap->pm_type) {
183	case PT_X86:
184		mask = X86_PG_RW;
185		break;
186	case PT_EPT:
187		if (pmap_emulate_ad_bits(pmap))
188			mask = EPT_PG_EMUL_RW;
189		else
190			mask = EPT_PG_WRITE;
191		break;
192	default:
193		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
194	}
195
196	return (mask);
197}
198
199static __inline pt_entry_t
200pmap_global_bit(pmap_t pmap)
201{
202	pt_entry_t mask;
203
204	switch (pmap->pm_type) {
205	case PT_X86:
206		mask = X86_PG_G;
207		break;
208	case PT_EPT:
209		mask = 0;
210		break;
211	default:
212		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
213	}
214
215	return (mask);
216}
217
218static __inline pt_entry_t
219pmap_accessed_bit(pmap_t pmap)
220{
221	pt_entry_t mask;
222
223	switch (pmap->pm_type) {
224	case PT_X86:
225		mask = X86_PG_A;
226		break;
227	case PT_EPT:
228		if (pmap_emulate_ad_bits(pmap))
229			mask = EPT_PG_READ;
230		else
231			mask = EPT_PG_A;
232		break;
233	default:
234		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
235	}
236
237	return (mask);
238}
239
240static __inline pt_entry_t
241pmap_modified_bit(pmap_t pmap)
242{
243	pt_entry_t mask;
244
245	switch (pmap->pm_type) {
246	case PT_X86:
247		mask = X86_PG_M;
248		break;
249	case PT_EPT:
250		if (pmap_emulate_ad_bits(pmap))
251			mask = EPT_PG_WRITE;
252		else
253			mask = EPT_PG_M;
254		break;
255	default:
256		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
257	}
258
259	return (mask);
260}
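
/*
 * A quick reference for the accessors above (derived from their switch
 * statements; the EPT columns assume the EPT_PG_* definitions from pmap.h):
 *
 *	accessor		PT_X86		PT_EPT (A/D emul.)	PT_EPT
 *	pmap_valid_bit()	X86_PG_V	EPT_PG_EMUL_V		EPT_PG_READ
 *	pmap_rw_bit()		X86_PG_RW	EPT_PG_EMUL_RW		EPT_PG_WRITE
 *	pmap_global_bit()	X86_PG_G	0			0
 *	pmap_accessed_bit()	X86_PG_A	EPT_PG_READ		EPT_PG_A
 *	pmap_modified_bit()	X86_PG_M	EPT_PG_WRITE		EPT_PG_M
 */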
261
262#if !defined(DIAGNOSTIC)
263#ifdef __GNUC_GNU_INLINE__
264#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
265#else
266#define PMAP_INLINE	extern inline
267#endif
268#else
269#define PMAP_INLINE
270#endif
271
272#ifdef PV_STATS
273#define PV_STAT(x)	do { x ; } while (0)
274#else
275#define PV_STAT(x)	do { } while (0)
276#endif
277
278#define	pa_index(pa)	((pa) >> PDRSHIFT)
279#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
280
281#define	NPV_LIST_LOCKS	MAXCPU
282
283#define	PHYS_TO_PV_LIST_LOCK(pa)	\
284			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
285
286#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
287	struct rwlock **_lockp = (lockp);		\
288	struct rwlock *_new_lock;			\
289							\
290	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
291	if (_new_lock != *_lockp) {			\
292		if (*_lockp != NULL)			\
293			rw_wunlock(*_lockp);		\
294		*_lockp = _new_lock;			\
295		rw_wlock(*_lockp);			\
296	}						\
297} while (0)
298
299#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
300			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
301
302#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
303	struct rwlock **_lockp = (lockp);		\
304							\
305	if (*_lockp != NULL) {				\
306		rw_wunlock(*_lockp);			\
307		*_lockp = NULL;				\
308	}						\
309} while (0)
310
311#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
312			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
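
/*
 * Illustrative usage sketch (not part of the original file; the helper name
 * is hypothetical and the body is compiled out).  It shows the lock-pointer
 * idiom that the macros above are built for: start with a NULL pointer,
 * (re)acquire the bucket lock covering each page as needed, and drop
 * whatever is held at the end.
 */
#if 0
static void
pv_list_lock_sketch(vm_page_t m)
{
	struct rwlock *lock;

	lock = NULL;
	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
	/* ... manipulate m's pv list while the bucket lock is write-held ... */
	RELEASE_PV_LIST_LOCK(&lock);
}
#endif
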
313
314struct pmap kernel_pmap_store;
315
316vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
317vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
318
319int nkpt;
320SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
321    "Number of kernel page table pages allocated on bootup");
322
323static int ndmpdp;
324vm_paddr_t dmaplimit;
325vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
326pt_entry_t pg_nx;
327
328static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
329
330static int pat_works = 1;
331SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
332    "Is page attribute table fully functional?");
333
334static int pg_ps_enabled = 1;
335SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
336    "Are large page mappings enabled?");
337
338#define	PAT_INDEX_SIZE	8
339static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
340
341static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
342static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
343u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
344u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
345
346static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
347static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
348static int		ndmpdpphys;	/* number of DMPDPphys pages */
349
350static struct rwlock_padalign pvh_global_lock;
351
352/*
353 * Data for the pv entry allocation mechanism
354 */
355static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
356static struct mtx pv_chunks_mutex;
357static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
358static struct md_page *pv_table;
359
360/*
361 * All those kernel PT submaps that BSD is so fond of
362 */
363pt_entry_t *CMAP1 = 0;
364caddr_t CADDR1 = 0;
365
366static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
367
368static struct unrhdr pcid_unr;
369static struct mtx pcid_mtx;
370int pmap_pcid_enabled = 0;
371SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
372    0, "Is TLB Context ID enabled?");
373int invpcid_works = 0;
374SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
375    "Is the invpcid instruction available?");
376
377static int
378pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
379{
380	int i;
381	uint64_t res;
382
383	res = 0;
384	CPU_FOREACH(i) {
385		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
386	}
387	return (sysctl_handle_64(oidp, &res, 0, req));
388}
389SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
390    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
391    "Count of saved TLB context on switch");
392
393/*
394 * Crashdump maps.
395 */
396static caddr_t crashdumpmap;
397
398static void	free_pv_chunk(struct pv_chunk *pc);
399static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
400static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
401static int	popcnt_pc_map_elem(uint64_t elem);
402static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
403static void	reserve_pv_entries(pmap_t pmap, int needed,
404		    struct rwlock **lockp);
405static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
406		    struct rwlock **lockp);
407static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
408		    struct rwlock **lockp);
409static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
410		    struct rwlock **lockp);
411static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
412static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
413		    vm_offset_t va);
414
415static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
416static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
417static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
418    vm_offset_t va, struct rwlock **lockp);
419static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
420    vm_offset_t va);
421static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
422    vm_prot_t prot, struct rwlock **lockp);
423static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
424    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
425static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
426static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
427static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
428static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
429static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
430static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
431    struct rwlock **lockp);
432static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
433    vm_prot_t prot);
434static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
435static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
436    struct spglist *free, struct rwlock **lockp);
437static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
438    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
439static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
440static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
441    struct spglist *free);
442static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
443    vm_page_t m, struct rwlock **lockp);
444static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
445    pd_entry_t newpde);
446static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
447
448static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
449		struct rwlock **lockp);
450static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
451		struct rwlock **lockp);
452static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
453		struct rwlock **lockp);
454
455static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
456    struct spglist *free);
457static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
458static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
459
460/*
461 * Move the kernel virtual free pointer to the next
462 * 2MB.  This is used to help improve performance
463 * by using a large (2MB) page for much of the kernel
464 * (.text, .data, .bss).
465 */
466static vm_offset_t
467pmap_kmem_choose(vm_offset_t addr)
468{
469	vm_offset_t newaddr = addr;
470
471	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
472	return (newaddr);
473}
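
/*
 * Worked example (illustrative): with NBPDR == 2MB (0x200000), an input of
 * 0xffffffff80e12345 yields
 *
 *	(0xffffffff80e12345 + 0x1fffff) & ~0x1fffff == 0xffffffff81000000,
 *
 * i.e. the next 2MB boundary; an address that is already 2MB-aligned is
 * returned unchanged.
 */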
474
475/********************/
476/* Inline functions */
477/********************/
478
479/* Return a non-clipped PD index for a given VA */
480static __inline vm_pindex_t
481pmap_pde_pindex(vm_offset_t va)
482{
483	return (va >> PDRSHIFT);
484}
485
486
487/* Return various clipped indexes for a given VA */
488static __inline vm_pindex_t
489pmap_pte_index(vm_offset_t va)
490{
491
492	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
493}
494
495static __inline vm_pindex_t
496pmap_pde_index(vm_offset_t va)
497{
498
499	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
500}
501
502static __inline vm_pindex_t
503pmap_pdpe_index(vm_offset_t va)
504{
505
506	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
507}
508
509static __inline vm_pindex_t
510pmap_pml4e_index(vm_offset_t va)
511{
512
513	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
514}
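
/*
 * Worked example (illustrative): for va == 0xffffffff80000000 (KERNBASE),
 * the 48 implemented virtual address bits split into four 9-bit indexes
 * above the 12-bit page offset, so the functions above return
 *
 *	pmap_pml4e_index(va) == 511	(bits 47-39)
 *	pmap_pdpe_index(va)  == 510	(bits 38-30)
 *	pmap_pde_index(va)   == 0	(bits 29-21)
 *	pmap_pte_index(va)   == 0	(bits 20-12)
 *
 * which is why the kernel image lives in the last PML4 slot.
 */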
515
516/* Return a pointer to the PML4 slot that corresponds to a VA */
517static __inline pml4_entry_t *
518pmap_pml4e(pmap_t pmap, vm_offset_t va)
519{
520
521	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
522}
523
524/* Return a pointer to the PDP slot that corresponds to a VA */
525static __inline pdp_entry_t *
526pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
527{
528	pdp_entry_t *pdpe;
529
530	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
531	return (&pdpe[pmap_pdpe_index(va)]);
532}
533
534/* Return a pointer to the PDP slot that corresponds to a VA */
535static __inline pdp_entry_t *
536pmap_pdpe(pmap_t pmap, vm_offset_t va)
537{
538	pml4_entry_t *pml4e;
539	pt_entry_t PG_V;
540
541	PG_V = pmap_valid_bit(pmap);
542	pml4e = pmap_pml4e(pmap, va);
543	if ((*pml4e & PG_V) == 0)
544		return (NULL);
545	return (pmap_pml4e_to_pdpe(pml4e, va));
546}
547
548/* Return a pointer to the PD slot that corresponds to a VA */
549static __inline pd_entry_t *
550pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
551{
552	pd_entry_t *pde;
553
554	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
555	return (&pde[pmap_pde_index(va)]);
556}
557
558/* Return a pointer to the PD slot that corresponds to a VA */
559static __inline pd_entry_t *
560pmap_pde(pmap_t pmap, vm_offset_t va)
561{
562	pdp_entry_t *pdpe;
563	pt_entry_t PG_V;
564
565	PG_V = pmap_valid_bit(pmap);
566	pdpe = pmap_pdpe(pmap, va);
567	if (pdpe == NULL || (*pdpe & PG_V) == 0)
568		return (NULL);
569	return (pmap_pdpe_to_pde(pdpe, va));
570}
571
572/* Return a pointer to the PT slot that corresponds to a VA */
573static __inline pt_entry_t *
574pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
575{
576	pt_entry_t *pte;
577
578	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
579	return (&pte[pmap_pte_index(va)]);
580}
581
582/* Return a pointer to the PT slot that corresponds to a VA */
583static __inline pt_entry_t *
584pmap_pte(pmap_t pmap, vm_offset_t va)
585{
586	pd_entry_t *pde;
587	pt_entry_t PG_V;
588
589	PG_V = pmap_valid_bit(pmap);
590	pde = pmap_pde(pmap, va);
591	if (pde == NULL || (*pde & PG_V) == 0)
592		return (NULL);
593	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
594		return ((pt_entry_t *)pde);
595	return (pmap_pde_to_pte(pde, va));
596}
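
/*
 * Illustrative sketch (hypothetical helper, compiled out; not part of the
 * original file): walking the tree with the helpers above to translate a
 * virtual address, much as pmap_extract() does later in this file.  Note
 * that pmap_pte() returns a pointer to the PDE itself for a 2MB mapping, so
 * PG_PS must be checked before PG_FRAME is used.  The pmap lock is assumed
 * to be held.
 */
#if 0
static vm_paddr_t
pmap_va_to_pa_sketch(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *pte, PG_V;

	PG_V = pmap_valid_bit(pmap);
	pte = pmap_pte(pmap, va);
	if (pte == NULL || (*pte & PG_V) == 0)
		return (0);
	if ((*pte & PG_PS) != 0)	/* 2MB page: "pte" is really the PDE */
		return ((*pte & PG_PS_FRAME) | (va & PDRMASK));
	return ((*pte & PG_FRAME) | (va & PAGE_MASK));
}
#endif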
597
598static __inline void
599pmap_resident_count_inc(pmap_t pmap, int count)
600{
601
602	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
603	pmap->pm_stats.resident_count += count;
604}
605
606static __inline void
607pmap_resident_count_dec(pmap_t pmap, int count)
608{
609
610	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
611	KASSERT(pmap->pm_stats.resident_count >= count,
612	    ("pmap %p resident count underflow %ld %d", pmap,
613	    pmap->pm_stats.resident_count, count));
614	pmap->pm_stats.resident_count -= count;
615}
616
617PMAP_INLINE pt_entry_t *
618vtopte(vm_offset_t va)
619{
620	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
621
622	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
623
624	return (PTmap + ((va >> PAGE_SHIFT) & mask));
625}
626
627static __inline pd_entry_t *
628vtopde(vm_offset_t va)
629{
630	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
631
632	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
633
634	return (PDmap + ((va >> PDRSHIFT) & mask));
635}
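
/*
 * Note (illustrative): vtopte() and vtopde() depend on the recursive PML4
 * slot installed by create_pagetables().  PTmap is the base of the 512GB
 * window through which the page table pages themselves are addressable, so
 * the PTE for a kernel va is simply PTmap[(va >> PAGE_SHIFT) & mask], where
 * the mask keeps 9 + 9 + 9 + 9 == 36 index bits; vtopde() does the same one
 * level up with a 27-bit mask and PDmap.
 */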
636
637static u_int64_t
638allocpages(vm_paddr_t *firstaddr, int n)
639{
640	u_int64_t ret;
641
642	ret = *firstaddr;
643	bzero((void *)ret, n * PAGE_SIZE);
644	*firstaddr += n * PAGE_SIZE;
645	return (ret);
646}
647
648CTASSERT(powerof2(NDMPML4E));
649
650/* number of kernel PDP slots */
651#define	NKPDPE(ptpgs)		howmany((ptpgs), NPDEPG)
652
653static void
654nkpt_init(vm_paddr_t addr)
655{
656	int pt_pages;
657
658#ifdef NKPT
659	pt_pages = NKPT;
660#else
661	pt_pages = howmany(addr, 1 << PDRSHIFT);
662	pt_pages += NKPDPE(pt_pages);
663
664	/*
665	 * Add some slop beyond the bare minimum required for bootstrapping
666	 * the kernel.
667	 *
668	 * This is quite important when allocating KVA for kernel modules.
669	 * The modules are required to be linked in the negative 2GB of
670	 * the address space.  If we run out of KVA in this region then
671	 * pmap_growkernel() will need to allocate page table pages to map
672	 * the entire 512GB of KVA space which is an unnecessary tax on
673	 * physical memory.
674	 */
675	pt_pages += 8;		/* 16MB additional slop for kernel modules */
676#endif
677	nkpt = pt_pages;
678}
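
/*
 * Worked example (illustrative, non-NKPT case): if the bootstrap allocations
 * end at addr == 64MB, then
 *
 *	pt_pages  = howmany(64MB, 2MB)             == 32
 *	pt_pages += NKPDPE(32) == howmany(32, 512) ==  1
 *	pt_pages += 8 (slop for kernel modules)
 *
 * so nkpt ends up as 41 preallocated kernel page table pages.
 */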
679
680static void
681create_pagetables(vm_paddr_t *firstaddr)
682{
683	int i, j, ndm1g, nkpdpe;
684	pt_entry_t *pt_p;
685	pd_entry_t *pd_p;
686	pdp_entry_t *pdp_p;
687	pml4_entry_t *p4_p;
688
689	/* Allocate page table pages for the direct map */
690	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
691	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
692		ndmpdp = 4;
693	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
694	if (ndmpdpphys > NDMPML4E) {
695		/*
696		 * Each NDMPML4E allows 512 GB, so limit to that,
697		 * and then readjust ndmpdp and ndmpdpphys.
698		 */
699		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
700		Maxmem = atop(NDMPML4E * NBPML4);
701		ndmpdpphys = NDMPML4E;
702		ndmpdp = NDMPML4E * NPDEPG;
703	}
704	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
705	ndm1g = 0;
706	if ((amd_feature & AMDID_PAGE1GB) != 0)
707		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
708	if (ndm1g < ndmpdp)
709		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
710	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
711
712	/* Allocate pages */
713	KPML4phys = allocpages(firstaddr, 1);
714	KPDPphys = allocpages(firstaddr, NKPML4E);
715
716	/*
717	 * Allocate the initial number of kernel page table pages required to
718	 * bootstrap.  We defer this until after all memory-size dependent
719	 * allocations are done (e.g. direct map), so that we don't have to
720	 * build in too much slop in our estimate.
721	 *
722	 * Note that when NKPML4E > 1, we have an empty page underneath
723	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
724	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
725	 */
726	nkpt_init(*firstaddr);
727	nkpdpe = NKPDPE(nkpt);
728
729	KPTphys = allocpages(firstaddr, nkpt);
730	KPDphys = allocpages(firstaddr, nkpdpe);
731
732	/* Fill in the underlying page table pages */
733	/* Nominally read-only (but really R/W) from zero to physfree */
734	/* XXX not fully used, underneath 2M pages */
735	pt_p = (pt_entry_t *)KPTphys;
736	for (i = 0; ptoa(i) < *firstaddr; i++)
737		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
738
739	/* Now map the page tables at their location within PTmap */
740	pd_p = (pd_entry_t *)KPDphys;
741	for (i = 0; i < nkpt; i++)
742		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
743
744	/* Map from zero to end of allocations under 2M pages */
745	/* This replaces some of the KPTphys entries above */
746	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
747		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
748		    X86_PG_G;
749
750	/* And connect up the PD to the PDP (leaving room for L4 pages) */
751	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
752	for (i = 0; i < nkpdpe; i++)
753		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
754		    PG_U;
755
756	/*
757	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
758	 * the end of physical memory is not aligned to a 1GB page boundary,
759	 * then the residual physical memory is mapped with 2MB pages.  Later,
760	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
761	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
762	 * that are partially used.
763	 */
764	pd_p = (pd_entry_t *)DMPDphys;
765	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
766		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
767		/* Preset PG_M and PG_A because demotion expects them. */
768		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
769		    X86_PG_M | X86_PG_A;
770	}
771	pdp_p = (pdp_entry_t *)DMPDPphys;
772	for (i = 0; i < ndm1g; i++) {
773		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
774		/* Preset PG_M and PG_A because demotion expects them. */
775		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
776		    X86_PG_M | X86_PG_A;
777	}
778	for (j = 0; i < ndmpdp; i++, j++) {
779		pdp_p[i] = DMPDphys + ptoa(j);
780		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
781	}
782
783	/* And recursively map PML4 to itself in order to get PTmap */
784	p4_p = (pml4_entry_t *)KPML4phys;
785	p4_p[PML4PML4I] = KPML4phys;
786	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;
787
788	/* Connect the Direct Map slot(s) up to the PML4. */
789	for (i = 0; i < ndmpdpphys; i++) {
790		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
791		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
792	}
793
794	/* Connect the KVA slots up to the PML4 */
795	for (i = 0; i < NKPML4E; i++) {
796		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
797		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
798	}
799}
800
801/*
802 *	Bootstrap the system enough to run with virtual memory.
803 *
804 *	On amd64 this is called after mapping has already been enabled
805 *	and just syncs the pmap module with what has already been done.
806 *	[We can't call it easily with mapping off since the kernel is not
807 *	mapped with PA == VA, hence we would have to relocate every address
808 *	from the linked base (virtual) address "KERNBASE" to the actual
809 *	(physical) address starting relative to 0]
810 */
811void
812pmap_bootstrap(vm_paddr_t *firstaddr)
813{
814	vm_offset_t va;
815	pt_entry_t *pte;
816
817	/*
818	 * Create an initial set of page tables to run the kernel in.
819	 */
820	create_pagetables(firstaddr);
821
822	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
823	virtual_avail = pmap_kmem_choose(virtual_avail);
824
825	virtual_end = VM_MAX_KERNEL_ADDRESS;
826
827
828	/* XXX do %cr0 as well */
829	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
830	load_cr3(KPML4phys);
831	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
832		load_cr4(rcr4() | CR4_SMEP);
833
834	/*
835	 * Initialize the kernel pmap (which is statically allocated).
836	 */
837	PMAP_LOCK_INIT(kernel_pmap);
838	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
839	kernel_pmap->pm_cr3 = KPML4phys;
840	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
841	CPU_FILL(&kernel_pmap->pm_save);	/* always superset of pm_active */
842	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
843	kernel_pmap->pm_flags = pmap_flags;
844
845 	/*
846	 * Initialize the global pv list lock.
847	 */
848	rw_init(&pvh_global_lock, "pmap pv global");
849
850	/*
851	 * Reserve some special page table entries/VA space for temporary
852	 * mapping of pages.
853	 */
854#define	SYSMAP(c, p, v, n)	\
855	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
856
857	va = virtual_avail;
858	pte = vtopte(va);
859
860	/*
861	 * Crashdump maps.  The first page is reused as CMAP1 for the
862	 * memory test.
863	 */
864	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
865	CADDR1 = crashdumpmap;
866
867	virtual_avail = va;
868
869	/* Initialize the PAT MSR. */
870	pmap_init_pat();
871
872	/* Initialize TLB Context Id. */
873	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
874	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
875		load_cr4(rcr4() | CR4_PCIDE);
876		mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF);
877		init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx);
878		/* Check for INVPCID support */
879		invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
880		    != 0;
881		kernel_pmap->pm_pcid = 0;
882#ifndef SMP
883		pmap_pcid_enabled = 0;
884#endif
885	} else
886		pmap_pcid_enabled = 0;
887}
888
889/*
890 * Setup the PAT MSR.
891 */
892void
893pmap_init_pat(void)
894{
895	int pat_table[PAT_INDEX_SIZE];
896	uint64_t pat_msr;
897	u_long cr0, cr4;
898	int i;
899
900	/* Bail if this CPU doesn't implement PAT. */
901	if ((cpu_feature & CPUID_PAT) == 0)
902		panic("no PAT??");
903
904	/* Set default PAT index table. */
905	for (i = 0; i < PAT_INDEX_SIZE; i++)
906		pat_table[i] = -1;
907	pat_table[PAT_WRITE_BACK] = 0;
908	pat_table[PAT_WRITE_THROUGH] = 1;
909	pat_table[PAT_UNCACHEABLE] = 3;
910	pat_table[PAT_WRITE_COMBINING] = 3;
911	pat_table[PAT_WRITE_PROTECTED] = 3;
912	pat_table[PAT_UNCACHED] = 3;
913
914	/* Initialize default PAT entries. */
915	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
916	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
917	    PAT_VALUE(2, PAT_UNCACHED) |
918	    PAT_VALUE(3, PAT_UNCACHEABLE) |
919	    PAT_VALUE(4, PAT_WRITE_BACK) |
920	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
921	    PAT_VALUE(6, PAT_UNCACHED) |
922	    PAT_VALUE(7, PAT_UNCACHEABLE);
923
924	if (pat_works) {
925		/*
926		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
927		 * Program 5 and 6 as WP and WC.
928		 * Leave 4 and 7 as WB and UC.
929		 */
930		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
931		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
932		    PAT_VALUE(6, PAT_WRITE_COMBINING);
933		pat_table[PAT_UNCACHED] = 2;
934		pat_table[PAT_WRITE_PROTECTED] = 5;
935		pat_table[PAT_WRITE_COMBINING] = 6;
936	} else {
937		/*
938		 * Just replace PAT Index 2 with WC instead of UC-.
939		 */
940		pat_msr &= ~PAT_MASK(2);
941		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
942		pat_table[PAT_WRITE_COMBINING] = 2;
943	}
944
945	/* Disable PGE. */
946	cr4 = rcr4();
947	load_cr4(cr4 & ~CR4_PGE);
948
949	/* Disable caches (CD = 1, NW = 0). */
950	cr0 = rcr0();
951	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
952
953	/* Flushes caches and TLBs. */
954	wbinvd();
955	invltlb();
956
957	/* Update PAT and index table. */
958	wrmsr(MSR_PAT, pat_msr);
959	for (i = 0; i < PAT_INDEX_SIZE; i++)
960		pat_index[i] = pat_table[i];
961
962	/* Flush caches and TLBs again. */
963	wbinvd();
964	invltlb();
965
966	/* Restore caches and PGE. */
967	load_cr0(cr0);
968	load_cr4(cr4);
969}
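
/*
 * For reference (derived from the code above): with pat_works != 0 the PAT
 * MSR ends up programmed as
 *
 *	entry:	0   1   2    3   4   5   6   7
 *	type:	WB  WT  UC-  UC  WB  WP  WC  UC
 *
 * and pat_index[] maps WB->0, WT->1, UC- ->2, UC->3, WP->5 and WC->6.  In
 * the !pat_works fallback only entry 2 is reprogrammed, to WC.
 */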
970
971/*
972 *	Initialize a vm_page's machine-dependent fields.
973 */
974void
975pmap_page_init(vm_page_t m)
976{
977
978	TAILQ_INIT(&m->md.pv_list);
979	m->md.pat_mode = PAT_WRITE_BACK;
980}
981
982/*
983 *	Initialize the pmap module.
984 *	Called by vm_init, to initialize any structures that the pmap
985 *	system needs to map virtual memory.
986 */
987void
988pmap_init(void)
989{
990	vm_page_t mpte;
991	vm_size_t s;
992	int i, pv_npg;
993
994	/*
995	 * Initialize the vm page array entries for the kernel pmap's
996	 * page table pages.
997	 */
998	for (i = 0; i < nkpt; i++) {
999		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
1000		KASSERT(mpte >= vm_page_array &&
1001		    mpte < &vm_page_array[vm_page_array_size],
1002		    ("pmap_init: page table page is out of range"));
1003		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
1004		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
1005	}
1006
1007	/*
1008	 * If the kernel is running on a virtual machine, then it must assume
1009	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
1010	 * be prepared for the hypervisor changing the vendor and family that
1011	 * are reported by CPUID.  Consequently, the workaround for AMD Family
1012	 * 10h Erratum 383 is enabled if the processor's feature set does not
1013	 * include at least one feature that is only supported by older Intel
1014	 * or newer AMD processors.
1015	 */
1016	if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
1017	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
1018	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
1019	    AMDID2_FMA4)) == 0)
1020		workaround_erratum383 = 1;
1021
1022	/*
1023	 * Are large page mappings enabled?
1024	 */
1025	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
1026	if (pg_ps_enabled) {
1027		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1028		    ("pmap_init: can't assign to pagesizes[1]"));
1029		pagesizes[1] = NBPDR;
1030	}
1031
1032	/*
1033	 * Initialize the pv chunk list mutex.
1034	 */
1035	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1036
1037	/*
1038	 * Initialize the pool of pv list locks.
1039	 */
1040	for (i = 0; i < NPV_LIST_LOCKS; i++)
1041		rw_init(&pv_list_locks[i], "pmap pv list");
1042
1043	/*
1044	 * Calculate the size of the pv head table for superpages.
1045	 */
1046	for (i = 0; phys_avail[i + 1]; i += 2);
1047	pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
1048
1049	/*
1050	 * Allocate memory for the pv head table for superpages.
1051	 */
1052	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1053	s = round_page(s);
1054	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
1055	    M_WAITOK | M_ZERO);
1056	for (i = 0; i < pv_npg; i++)
1057		TAILQ_INIT(&pv_table[i].pv_list);
1058}
1059
1060static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
1061    "2MB page mapping counters");
1062
1063static u_long pmap_pde_demotions;
1064SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
1065    &pmap_pde_demotions, 0, "2MB page demotions");
1066
1067static u_long pmap_pde_mappings;
1068SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
1069    &pmap_pde_mappings, 0, "2MB page mappings");
1070
1071static u_long pmap_pde_p_failures;
1072SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
1073    &pmap_pde_p_failures, 0, "2MB page promotion failures");
1074
1075static u_long pmap_pde_promotions;
1076SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
1077    &pmap_pde_promotions, 0, "2MB page promotions");
1078
1079static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
1080    "1GB page mapping counters");
1081
1082static u_long pmap_pdpe_demotions;
1083SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
1084    &pmap_pdpe_demotions, 0, "1GB page demotions");
1085
1086/***************************************************
1087 * Low level helper routines.....
1088 ***************************************************/
1089
1090static pt_entry_t
1091pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
1092{
1093	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
1094
1095	switch (pmap->pm_type) {
1096	case PT_X86:
1097		/* Verify that both PAT bits are not set at the same time */
1098		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
1099		    ("Invalid PAT bits in entry %#lx", entry));
1100
1101		/* Swap the PAT bits if one of them is set */
1102		if ((entry & x86_pat_bits) != 0)
1103			entry ^= x86_pat_bits;
1104		break;
1105	case PT_EPT:
1106		/*
1107		 * Nothing to do - the memory attributes are represented
1108		 * the same way for regular pages and superpages.
1109		 */
1110		break;
1111	default:
1112		panic("pmap_swap_pat: bad pm_type %d", pmap->pm_type);
1113	}
1114
1115	return (entry);
1116}
1117
1118/*
1119 * Determine the appropriate bits to set in a PTE or PDE for a specified
1120 * caching mode.
1121 */
1122static int
1123pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
1124{
1125	int cache_bits, pat_flag, pat_idx;
1126
1127	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
1128		panic("Unknown caching mode %d\n", mode);
1129
1130	switch (pmap->pm_type) {
1131	case PT_X86:
1132		/* The PAT bit is different for PTE's and PDE's. */
1133		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
1134
1135		/* Map the caching mode to a PAT index. */
1136		pat_idx = pat_index[mode];
1137
1138		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
1139		cache_bits = 0;
1140		if (pat_idx & 0x4)
1141			cache_bits |= pat_flag;
1142		if (pat_idx & 0x2)
1143			cache_bits |= PG_NC_PCD;
1144		if (pat_idx & 0x1)
1145			cache_bits |= PG_NC_PWT;
1146		break;
1147
1148	case PT_EPT:
1149		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
1150		break;
1151
1152	default:
1153		panic("unsupported pmap type %d", pmap->pm_type);
1154	}
1155
1156	return (cache_bits);
1157}
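
/*
 * Worked example (illustrative, PT_X86): with the pat_works PAT layout set
 * up in pmap_init_pat(), PAT_WRITE_COMBINING maps to pat_index[] value 6
 * (binary 110), so a 4KB write-combining mapping gets X86_PG_PTE_PAT |
 * PG_NC_PCD (a 2MB mapping gets X86_PG_PDE_PAT | PG_NC_PCD), while
 * PAT_WRITE_BACK (index 0) adds no cache bits at all.
 */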
1158
1159static int
1160pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
1161{
1162	int mask;
1163
1164	switch (pmap->pm_type) {
1165	case PT_X86:
1166		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
1167		break;
1168	case PT_EPT:
1169		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
1170		break;
1171	default:
1172		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
1173	}
1174
1175	return (mask);
1176}
1177
1178static __inline boolean_t
1179pmap_ps_enabled(pmap_t pmap)
1180{
1181
1182	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
1183}
1184
1185static void
1186pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
1187{
1188
1189	switch (pmap->pm_type) {
1190	case PT_X86:
1191		break;
1192	case PT_EPT:
1193		/*
1194		 * XXX
1195		 * This is a little bogus since the generation number is
1196		 * supposed to be bumped up when a region of the address
1197		 * space is invalidated in the page tables.
1198		 *
1199		 * In this case the old PDE entry is valid but yet we want
1200		 * to make sure that any mappings using the old entry are
1201		 * invalidated in the TLB.
1202		 *
1203		 * The reason this works as expected is because we rendezvous
1204		 * "all" host cpus and force any vcpu context to exit as a
1205		 * side-effect.
1206		 */
1207		atomic_add_acq_long(&pmap->pm_eptgen, 1);
1208		break;
1209	default:
1210		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
1211	}
1212	pde_store(pde, newpde);
1213}
1214
1215/*
1216 * After changing the page size for the specified virtual address in the page
1217 * table, flush the corresponding entries from the processor's TLB.  Only the
1218 * calling processor's TLB is affected.
1219 *
1220 * The calling thread must be pinned to a processor.
1221 */
1222static void
1223pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
1224{
1225	pt_entry_t PG_G;
1226
1227	if (pmap->pm_type == PT_EPT)
1228		return;
1229
1230	KASSERT(pmap->pm_type == PT_X86,
1231	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
1232
1233	PG_G = pmap_global_bit(pmap);
1234
1235	if ((newpde & PG_PS) == 0)
1236		/* Demotion: flush a specific 2MB page mapping. */
1237		invlpg(va);
1238	else if ((newpde & PG_G) == 0)
1239		/*
1240		 * Promotion: flush every 4KB page mapping from the TLB
1241		 * because there are too many to flush individually.
1242		 */
1243		invltlb();
1244	else {
1245		/*
1246		 * Promotion: flush every 4KB page mapping from the TLB,
1247		 * including any global (PG_G) mappings.
1248		 */
1249		invltlb_globpcid();
1250	}
1251}
1252#ifdef SMP
1253
1254static void
1255pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
1256{
1257	struct invpcid_descr d;
1258	uint64_t cr3;
1259
1260	if (invpcid_works) {
1261		d.pcid = pmap->pm_pcid;
1262		d.pad = 0;
1263		d.addr = va;
1264		invpcid(&d, INVPCID_ADDR);
1265		return;
1266	}
1267
1268	cr3 = rcr3();
1269	critical_enter();
1270	load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
1271	invlpg(va);
1272	load_cr3(cr3 | CR3_PCID_SAVE);
1273	critical_exit();
1274}
1275
1276/*
1277 * For SMP, these functions have to use the IPI mechanism for coherence.
1278 *
1279 * N.B.: Before calling any of the following TLB invalidation functions,
1280 * the calling processor must ensure that all stores updating a non-
1281 * kernel page table are globally performed.  Otherwise, another
1282 * processor could cache an old, pre-update entry without being
1283 * invalidated.  This can happen one of two ways: (1) The pmap becomes
1284 * active on another processor after its pm_active field is checked by
1285 * one of the following functions but before a store updating the page
1286 * table is globally performed. (2) The pmap becomes active on another
1287 * processor before its pm_active field is checked but due to
1288 * speculative loads one of the following functions still reads the
1289 * pmap as inactive on the other processor.
1290 *
1291 * The kernel page table is exempt because its pm_active field is
1292 * immutable.  The kernel page table is always active on every
1293 * processor.
1294 */
1295
1296/*
1297 * Interrupt the cpus that are executing in the guest context.
1298 * This will force the vcpu to exit and the cached EPT mappings
1299 * will be invalidated by the host before the next vmresume.
1300 */
1301static __inline void
1302pmap_invalidate_ept(pmap_t pmap)
1303{
1304	int ipinum;
1305
1306	sched_pin();
1307	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1308	    ("pmap_invalidate_ept: absurd pm_active"));
1309
1310	/*
1311	 * The TLB mappings associated with a vcpu context are not
1312	 * flushed each time a different vcpu is chosen to execute.
1313	 *
1314	 * This is in contrast with a process's vtop mappings that
1315	 * are flushed from the TLB on each context switch.
1316	 *
1317	 * Therefore we need to do more than just a TLB shootdown on
1318	 * the active cpus in 'pmap->pm_active'. To do this we keep
1319	 * track of the number of invalidations performed on this pmap.
1320	 *
1321	 * Each vcpu keeps a cache of this counter and compares it
1322	 * just before a vmresume. If the counter is out-of-date an
1323	 * invept will be done to flush stale mappings from the TLB.
1324	 */
1325	atomic_add_acq_long(&pmap->pm_eptgen, 1);
1326
1327	/*
1328	 * Force the vcpu to exit and trap back into the hypervisor.
1329	 */
1330	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
1331	ipi_selected(pmap->pm_active, ipinum);
1332	sched_unpin();
1333}
1334
1335void
1336pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1337{
1338	cpuset_t other_cpus;
1339	u_int cpuid;
1340
1341	if (pmap->pm_type == PT_EPT) {
1342		pmap_invalidate_ept(pmap);
1343		return;
1344	}
1345
1346	KASSERT(pmap->pm_type == PT_X86,
1347	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
1348
1349	sched_pin();
1350	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1351		if (!pmap_pcid_enabled) {
1352			invlpg(va);
1353		} else {
1354			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1355				if (pmap == PCPU_GET(curpmap))
1356					invlpg(va);
1357				else
1358					pmap_invalidate_page_pcid(pmap, va);
1359			} else {
1360				invltlb_globpcid();
1361			}
1362		}
1363		smp_invlpg(pmap, va);
1364	} else {
1365		cpuid = PCPU_GET(cpuid);
1366		other_cpus = all_cpus;
1367		CPU_CLR(cpuid, &other_cpus);
1368		if (CPU_ISSET(cpuid, &pmap->pm_active))
1369			invlpg(va);
1370		else if (pmap_pcid_enabled) {
1371			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
1372				pmap_invalidate_page_pcid(pmap, va);
1373			else
1374				invltlb_globpcid();
1375		}
1376		if (pmap_pcid_enabled)
1377			CPU_AND(&other_cpus, &pmap->pm_save);
1378		else
1379			CPU_AND(&other_cpus, &pmap->pm_active);
1380		if (!CPU_EMPTY(&other_cpus))
1381			smp_masked_invlpg(other_cpus, pmap, va);
1382	}
1383	sched_unpin();
1384}
1385
1386static void
1387pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1388{
1389	struct invpcid_descr d;
1390	uint64_t cr3;
1391	vm_offset_t addr;
1392
1393	if (invpcid_works) {
1394		d.pcid = pmap->pm_pcid;
1395		d.pad = 0;
1396		for (addr = sva; addr < eva; addr += PAGE_SIZE) {
1397			d.addr = addr;
1398			invpcid(&d, INVPCID_ADDR);
1399		}
1400		return;
1401	}
1402
1403	cr3 = rcr3();
1404	critical_enter();
1405	load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
1406	for (addr = sva; addr < eva; addr += PAGE_SIZE)
1407		invlpg(addr);
1408	load_cr3(cr3 | CR3_PCID_SAVE);
1409	critical_exit();
1410}
1411
1412void
1413pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1414{
1415	cpuset_t other_cpus;
1416	vm_offset_t addr;
1417	u_int cpuid;
1418
1419	if (pmap->pm_type == PT_EPT) {
1420		pmap_invalidate_ept(pmap);
1421		return;
1422	}
1423
1424	KASSERT(pmap->pm_type == PT_X86,
1425	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
1426
1427	sched_pin();
1428	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1429		if (!pmap_pcid_enabled) {
1430			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1431				invlpg(addr);
1432		} else {
1433			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1434				if (pmap == PCPU_GET(curpmap)) {
1435					for (addr = sva; addr < eva;
1436					    addr += PAGE_SIZE)
1437						invlpg(addr);
1438				} else {
1439					pmap_invalidate_range_pcid(pmap,
1440					    sva, eva);
1441				}
1442			} else {
1443				invltlb_globpcid();
1444			}
1445		}
1446		smp_invlpg_range(pmap, sva, eva);
1447	} else {
1448		cpuid = PCPU_GET(cpuid);
1449		other_cpus = all_cpus;
1450		CPU_CLR(cpuid, &other_cpus);
1451		if (CPU_ISSET(cpuid, &pmap->pm_active)) {
1452			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1453				invlpg(addr);
1454		} else if (pmap_pcid_enabled) {
1455			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
1456				pmap_invalidate_range_pcid(pmap, sva, eva);
1457			else
1458				invltlb_globpcid();
1459		}
1460		if (pmap_pcid_enabled)
1461			CPU_AND(&other_cpus, &pmap->pm_save);
1462		else
1463			CPU_AND(&other_cpus, &pmap->pm_active);
1464		if (!CPU_EMPTY(&other_cpus))
1465			smp_masked_invlpg_range(other_cpus, pmap, sva, eva);
1466	}
1467	sched_unpin();
1468}
1469
1470void
1471pmap_invalidate_all(pmap_t pmap)
1472{
1473	cpuset_t other_cpus;
1474	struct invpcid_descr d;
1475	uint64_t cr3;
1476	u_int cpuid;
1477
1478	if (pmap->pm_type == PT_EPT) {
1479		pmap_invalidate_ept(pmap);
1480		return;
1481	}
1482
1483	KASSERT(pmap->pm_type == PT_X86,
1484	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
1485
1486	sched_pin();
1487	cpuid = PCPU_GET(cpuid);
1488	if (pmap == kernel_pmap ||
1489	    (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) ||
1490	    !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1491		if (invpcid_works) {
1492			bzero(&d, sizeof(d));
1493			invpcid(&d, INVPCID_CTXGLOB);
1494		} else {
1495			invltlb_globpcid();
1496		}
1497		if (!CPU_ISSET(cpuid, &pmap->pm_active))
1498			CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
1499		smp_invltlb(pmap);
1500	} else {
1501		other_cpus = all_cpus;
1502		CPU_CLR(cpuid, &other_cpus);
1503
1504		/*
1505		 * This logic is duplicated in the Xinvltlb shootdown
1506		 * IPI handler.
1507		 */
1508		if (pmap_pcid_enabled) {
1509			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1510				if (invpcid_works) {
1511					d.pcid = pmap->pm_pcid;
1512					d.pad = 0;
1513					d.addr = 0;
1514					invpcid(&d, INVPCID_CTX);
1515				} else {
1516					cr3 = rcr3();
1517					critical_enter();
1518
1519					/*
1520					 * Bit 63 is clear, so the PCID's TLB
1521					 * entries are invalidated.
1522					 */
1523					load_cr3(pmap->pm_cr3);
1524					load_cr3(cr3 | CR3_PCID_SAVE);
1525					critical_exit();
1526				}
1527			} else {
1528				invltlb_globpcid();
1529			}
1530		} else if (CPU_ISSET(cpuid, &pmap->pm_active))
1531			invltlb();
1532		if (!CPU_ISSET(cpuid, &pmap->pm_active))
1533			CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
1534		if (pmap_pcid_enabled)
1535			CPU_AND(&other_cpus, &pmap->pm_save);
1536		else
1537			CPU_AND(&other_cpus, &pmap->pm_active);
1538		if (!CPU_EMPTY(&other_cpus))
1539			smp_masked_invltlb(other_cpus, pmap);
1540	}
1541	sched_unpin();
1542}
1543
1544void
1545pmap_invalidate_cache(void)
1546{
1547
1548	sched_pin();
1549	wbinvd();
1550	smp_cache_flush();
1551	sched_unpin();
1552}
1553
1554struct pde_action {
1555	cpuset_t invalidate;	/* processors that invalidate their TLB */
1556	pmap_t pmap;
1557	vm_offset_t va;
1558	pd_entry_t *pde;
1559	pd_entry_t newpde;
1560	u_int store;		/* processor that updates the PDE */
1561};
1562
1563static void
1564pmap_update_pde_action(void *arg)
1565{
1566	struct pde_action *act = arg;
1567
1568	if (act->store == PCPU_GET(cpuid))
1569		pmap_update_pde_store(act->pmap, act->pde, act->newpde);
1570}
1571
1572static void
1573pmap_update_pde_teardown(void *arg)
1574{
1575	struct pde_action *act = arg;
1576
1577	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1578		pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
1579}
1580
1581/*
1582 * Change the page size for the specified virtual address in a way that
1583 * prevents any possibility of the TLB ever having two entries that map the
1584 * same virtual address using different page sizes.  This is the recommended
1585 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1586 * machine check exception for a TLB state that is improperly diagnosed as a
1587 * hardware error.
1588 */
1589static void
1590pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1591{
1592	struct pde_action act;
1593	cpuset_t active, other_cpus;
1594	u_int cpuid;
1595
1596	sched_pin();
1597	cpuid = PCPU_GET(cpuid);
1598	other_cpus = all_cpus;
1599	CPU_CLR(cpuid, &other_cpus);
1600	if (pmap == kernel_pmap || pmap->pm_type == PT_EPT)
1601		active = all_cpus;
1602	else {
1603		active = pmap->pm_active;
1604		CPU_AND_ATOMIC(&pmap->pm_save, &active);
1605	}
1606	if (CPU_OVERLAP(&active, &other_cpus)) {
1607		act.store = cpuid;
1608		act.invalidate = active;
1609		act.va = va;
1610		act.pmap = pmap;
1611		act.pde = pde;
1612		act.newpde = newpde;
1613		CPU_SET(cpuid, &active);
1614		smp_rendezvous_cpus(active,
1615		    smp_no_rendevous_barrier, pmap_update_pde_action,
1616		    pmap_update_pde_teardown, &act);
1617	} else {
1618		pmap_update_pde_store(pmap, pde, newpde);
1619		if (CPU_ISSET(cpuid, &active))
1620			pmap_update_pde_invalidate(pmap, va, newpde);
1621	}
1622	sched_unpin();
1623}
1624#else /* !SMP */
1625/*
1626 * Normal, non-SMP, invalidation functions.
1627 * We inline these within pmap.c for speed.
1628 */
1629PMAP_INLINE void
1630pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1631{
1632
1633	switch (pmap->pm_type) {
1634	case PT_X86:
1635		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1636			invlpg(va);
1637		break;
1638	case PT_EPT:
1639		pmap->pm_eptgen++;
1640		break;
1641	default:
1642		panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type);
1643	}
1644}
1645
1646PMAP_INLINE void
1647pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1648{
1649	vm_offset_t addr;
1650
1651	switch (pmap->pm_type) {
1652	case PT_X86:
1653		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1654			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1655				invlpg(addr);
1656		break;
1657	case PT_EPT:
1658		pmap->pm_eptgen++;
1659		break;
1660	default:
1661		panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
1662	}
1663}
1664
1665PMAP_INLINE void
1666pmap_invalidate_all(pmap_t pmap)
1667{
1668
1669	switch (pmap->pm_type) {
1670	case PT_X86:
1671		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1672			invltlb();
1673		break;
1674	case PT_EPT:
1675		pmap->pm_eptgen++;
1676		break;
1677	default:
1678		panic("pmap_invalidate_all: unknown type %d", pmap->pm_type);
1679	}
1680}
1681
1682PMAP_INLINE void
1683pmap_invalidate_cache(void)
1684{
1685
1686	wbinvd();
1687}
1688
1689static void
1690pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1691{
1692
1693	pmap_update_pde_store(pmap, pde, newpde);
1694	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1695		pmap_update_pde_invalidate(pmap, va, newpde);
1696	else
1697		CPU_ZERO(&pmap->pm_save);
1698}
1699#endif /* !SMP */
1700
1701#define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
1702
1703void
1704pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1705{
1706
1707	KASSERT((sva & PAGE_MASK) == 0,
1708	    ("pmap_invalidate_cache_range: sva not page-aligned"));
1709	KASSERT((eva & PAGE_MASK) == 0,
1710	    ("pmap_invalidate_cache_range: eva not page-aligned"));
1711
1712	if (cpu_feature & CPUID_SS)
1713		; /* If "Self Snoop" is supported, do nothing. */
1714	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1715	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1716
1717		/*
1718		 * XXX: Some CPUs fault, hang, or trash the local APIC
1719		 * registers if we use CLFLUSH on the local APIC
1720		 * range.  The local APIC is always uncached, so we
1721		 * don't need to flush for that range anyway.
1722		 */
1723		if (pmap_kextract(sva) == lapic_paddr)
1724			return;
1725
1726		/*
1727		 * Otherwise, do per-cache line flush.  Use the mfence
1728		 * instruction to ensure that previous stores are
1729		 * included in the write-back.  The processor
1730		 * propagates flush to other processors in the cache
1731		 * coherence domain.
1732		 */
1733		mfence();
1734		for (; sva < eva; sva += cpu_clflush_line_size)
1735			clflush(sva);
1736		mfence();
1737	} else {
1738
1739		/*
1740		 * No targeted cache flush methods are supported by the CPU,
1741		 * or the supplied range is bigger than 2MB.
1742		 * Globally invalidate cache.
1743		 */
1744		pmap_invalidate_cache();
1745	}
1746}
1747
1748/*
1749 * Remove the specified set of pages from the data and instruction caches.
1750 *
1751 * In contrast to pmap_invalidate_cache_range(), this function does not
1752 * rely on the CPU's self-snoop feature, because it is intended for use
1753 * when moving pages into a different cache domain.
1754 */
1755void
1756pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1757{
1758	vm_offset_t daddr, eva;
1759	int i;
1760
1761	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1762	    (cpu_feature & CPUID_CLFSH) == 0)
1763		pmap_invalidate_cache();
1764	else {
1765		mfence();
1766		for (i = 0; i < count; i++) {
1767			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
1768			eva = daddr + PAGE_SIZE;
1769			for (; daddr < eva; daddr += cpu_clflush_line_size)
1770				clflush(daddr);
1771		}
1772		mfence();
1773	}
1774}
1775
1776/*
1777 *	Routine:	pmap_extract
1778 *	Function:
1779 *		Extract the physical page address associated
1780 *		with the given map/virtual_address pair.
1781 */
1782vm_paddr_t
1783pmap_extract(pmap_t pmap, vm_offset_t va)
1784{
1785	pdp_entry_t *pdpe;
1786	pd_entry_t *pde;
1787	pt_entry_t *pte, PG_V;
1788	vm_paddr_t pa;
1789
1790	pa = 0;
1791	PG_V = pmap_valid_bit(pmap);
1792	PMAP_LOCK(pmap);
1793	pdpe = pmap_pdpe(pmap, va);
1794	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1795		if ((*pdpe & PG_PS) != 0)
1796			pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
1797		else {
1798			pde = pmap_pdpe_to_pde(pdpe, va);
1799			if ((*pde & PG_V) != 0) {
1800				if ((*pde & PG_PS) != 0) {
1801					pa = (*pde & PG_PS_FRAME) |
1802					    (va & PDRMASK);
1803				} else {
1804					pte = pmap_pde_to_pte(pde, va);
1805					pa = (*pte & PG_FRAME) |
1806					    (va & PAGE_MASK);
1807				}
1808			}
1809		}
1810	}
1811	PMAP_UNLOCK(pmap);
1812	return (pa);
1813}
1814
1815/*
1816 *	Routine:	pmap_extract_and_hold
1817 *	Function:
1818 *		Atomically extract and hold the physical page
1819 *		with the given pmap and virtual address pair
1820 *		if that mapping permits the given protection.
1821 */
1822vm_page_t
1823pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1824{
1825	pd_entry_t pde, *pdep;
1826	pt_entry_t pte, PG_RW, PG_V;
1827	vm_paddr_t pa;
1828	vm_page_t m;
1829
1830	pa = 0;
1831	m = NULL;
1832	PG_RW = pmap_rw_bit(pmap);
1833	PG_V = pmap_valid_bit(pmap);
1834	PMAP_LOCK(pmap);
1835retry:
1836	pdep = pmap_pde(pmap, va);
1837	if (pdep != NULL && (pde = *pdep)) {
1838		if (pde & PG_PS) {
1839			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1840				if (vm_page_pa_tryrelock(pmap, (pde &
1841				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1842					goto retry;
1843				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1844				    (va & PDRMASK));
1845				vm_page_hold(m);
1846			}
1847		} else {
1848			pte = *pmap_pde_to_pte(pdep, va);
1849			if ((pte & PG_V) &&
1850			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1851				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1852				    &pa))
1853					goto retry;
1854				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1855				vm_page_hold(m);
1856			}
1857		}
1858	}
1859	PA_UNLOCK_COND(pa);
1860	PMAP_UNLOCK(pmap);
1861	return (m);
1862}
1863
1864vm_paddr_t
1865pmap_kextract(vm_offset_t va)
1866{
1867	pd_entry_t pde;
1868	vm_paddr_t pa;
1869
1870	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1871		pa = DMAP_TO_PHYS(va);
1872	} else {
1873		pde = *vtopde(va);
1874		if (pde & PG_PS) {
1875			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
1876		} else {
1877			/*
1878			 * Beware of a concurrent promotion that changes the
1879			 * PDE at this point!  For example, vtopte() must not
1880			 * be used to access the PTE because it would use the
1881			 * new PDE.  It is, however, safe to use the old PDE
1882			 * because the page table page is preserved by the
1883			 * promotion.
1884			 */
1885			pa = *pmap_pde_to_pte(&pde, va);
1886			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
1887		}
1888	}
1889	return (pa);
1890}
1891
1892/***************************************************
1893 * Low level mapping routines.....
1894 ***************************************************/
1895
1896/*
1897 * Add a wired page to the kva.
1898 * Note: not SMP coherent.
1899 */
1900PMAP_INLINE void
1901pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1902{
1903	pt_entry_t *pte;
1904
1905	pte = vtopte(va);
1906	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
1907}
1908
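/*
 * Add a wired page to the kva with the specified memory attribute
 * (caching mode).
 * Note: not SMP coherent.
 */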
1909static __inline void
1910pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1911{
1912	pt_entry_t *pte;
1913	int cache_bits;
1914
1915	pte = vtopte(va);
1916	cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
1917	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
1918}
1919
1920/*
1921 * Remove a page from the kernel pagetables.
1922 * Note: not SMP coherent.
1923 */
1924PMAP_INLINE void
1925pmap_kremove(vm_offset_t va)
1926{
1927	pt_entry_t *pte;
1928
1929	pte = vtopte(va);
1930	pte_clear(pte);
1931}
1932
1933/*
1934 *	Used to map a range of physical addresses into kernel
1935 *	virtual address space.
1936 *
1937 *	The value passed in '*virt' is a suggested virtual address for
1938 *	the mapping. Architectures which can support a direct-mapped
1939 *	physical to virtual region can return the appropriate address
1940 *	within that region, leaving '*virt' unchanged. Other
1941 *	architectures should map the pages starting at '*virt' and
1942 *	update '*virt' with the first usable address after the mapped
1943 *	region.
1944 */
1945vm_offset_t
1946pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1947{
1948	return PHYS_TO_DMAP(start);
1949}
1950
1951
1952/*
1953 * Add a list of wired pages to the kva.  This routine is only
1954 * used for temporary kernel mappings that do not need to have
1955 * page modification or references recorded.  Note that old
1956 * mappings are simply written over.  The pages *must* be
1957 * wired.
1958 *
1959 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1960 */
1961void
1962pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1963{
1964	pt_entry_t *endpte, oldpte, pa, *pte;
1965	vm_page_t m;
1966	int cache_bits;
1967
1968	oldpte = 0;
1969	pte = vtopte(sva);
1970	endpte = pte + count;
1971	while (pte < endpte) {
1972		m = *ma++;
1973		cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
1974		pa = VM_PAGE_TO_PHYS(m) | cache_bits;
1975		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
1976			oldpte |= *pte;
1977			pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
1978		}
1979		pte++;
1980	}
1981	if (__predict_false((oldpte & X86_PG_V) != 0))
1982		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1983		    PAGE_SIZE);
1984}
1985
1986/*
1987 * This routine tears out page mappings from the
1988 * kernel -- it is meant only for temporary mappings.
1989 * Note: SMP coherent.  Uses a ranged shootdown IPI.
1990 */
1991void
1992pmap_qremove(vm_offset_t sva, int count)
1993{
1994	vm_offset_t va;
1995
1996	va = sva;
1997	while (count-- > 0) {
1998		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
1999		pmap_kremove(va);
2000		va += PAGE_SIZE;
2001	}
2002	pmap_invalidate_range(kernel_pmap, sva, va);
2003}
2004
2005/***************************************************
2006 * Page table page management routines.....
2007 ***************************************************/
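/*
 * Release the pages on the specified list to the physical memory
 * allocator, preserving each page's PG_ZERO setting.
 */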
2008static __inline void
2009pmap_free_zero_pages(struct spglist *free)
2010{
2011	vm_page_t m;
2012
2013	while ((m = SLIST_FIRST(free)) != NULL) {
2014		SLIST_REMOVE_HEAD(free, plinks.s.ss);
2015		/* Preserve the page's PG_ZERO setting. */
2016		vm_page_free_toq(m);
2017	}
2018}
2019
2020/*
2021 * Schedule the specified unused page table page to be freed.  Specifically,
2022 * add the page to the specified list of pages that will be released to the
2023 * physical memory manager after the TLB has been updated.
2024 */
2025static __inline void
2026pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
2027    boolean_t set_PG_ZERO)
2028{
2029
2030	if (set_PG_ZERO)
2031		m->flags |= PG_ZERO;
2032	else
2033		m->flags &= ~PG_ZERO;
2034	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2035}
2036
2037/*
2038 * Inserts the specified page table page into the specified pmap's collection
2039 * of idle page table pages.  Each of a pmap's page table pages is responsible
2040 * for mapping a distinct range of virtual addresses.  The pmap's collection is
2041 * ordered by this virtual address range.
2042 */
2043static __inline int
2044pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
2045{
2046
2047	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2048	return (vm_radix_insert(&pmap->pm_root, mpte));
2049}
2050
2051/*
2052 * Looks for a page table page mapping the specified virtual address in the
2053 * specified pmap's collection of idle page table pages.  Returns NULL if there
2054 * is no page table page corresponding to the specified virtual address.
2055 */
2056static __inline vm_page_t
2057pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
2058{
2059
2060	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2061	return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
2062}
2063
2064/*
2065 * Removes the specified page table page from the specified pmap's collection
2066 * of idle page table pages.  The specified page table page must be a member of
2067 * the pmap's collection.
2068 */
2069static __inline void
2070pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
2071{
2072
2073	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2074	vm_radix_remove(&pmap->pm_root, mpte->pindex);
2075}
2076
2077/*
2078 * Decrements a page table page's wire count, which is used to record the
2079 * number of valid page table entries within the page.  If the wire count
2080 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
2081 * page table page was unmapped and FALSE otherwise.
2082 */
2083static inline boolean_t
2084pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2085{
2086
2087	--m->wire_count;
2088	if (m->wire_count == 0) {
2089		_pmap_unwire_ptp(pmap, va, m, free);
2090		return (TRUE);
2091	} else
2092		return (FALSE);
2093}
2094
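/*
 * Unmaps the specified page table page by clearing the paging-structure
 * entry that maps it, drops the reference that it held on its parent
 * page table page (if any), and adds the page to the specified list of
 * pages to be freed once the TLB shootdown is complete.
 */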
2095static void
2096_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2097{
2098
2099	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2100	/*
2101	 * unmap the page table page
2102	 */
2103	if (m->pindex >= (NUPDE + NUPDPE)) {
2104		/* PDP page */
2105		pml4_entry_t *pml4;
2106		pml4 = pmap_pml4e(pmap, va);
2107		*pml4 = 0;
2108	} else if (m->pindex >= NUPDE) {
2109		/* PD page */
2110		pdp_entry_t *pdp;
2111		pdp = pmap_pdpe(pmap, va);
2112		*pdp = 0;
2113	} else {
2114		/* PTE page */
2115		pd_entry_t *pd;
2116		pd = pmap_pde(pmap, va);
2117		*pd = 0;
2118	}
2119	pmap_resident_count_dec(pmap, 1);
2120	if (m->pindex < NUPDE) {
2121		/* We just released a PT, unhold the matching PD */
2122		vm_page_t pdpg;
2123
2124		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
2125		pmap_unwire_ptp(pmap, va, pdpg, free);
2126	}
2127	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
2128		/* We just released a PD, unhold the matching PDP */
2129		vm_page_t pdppg;
2130
2131		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
2132		pmap_unwire_ptp(pmap, va, pdppg, free);
2133	}
2134
2135	/*
2136	 * This is a release store so that the ordinary store unmapping
2137	 * the page table page is globally performed before TLB shoot-
2138	 * down is begun.
2139	 */
2140	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
2141
2142	/*
2143	 * Put the page on a list so that it is released only after
2144	 * *all* TLB shootdowns have completed.
2145	 */
2146	pmap_add_delayed_free_list(m, free, TRUE);
2147}
2148
2149/*
2150 * After removing a page table entry, this routine is used to
2151 * conditionally free the page table page and manage its wire count.
2152 */
2153static int
2154pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2155    struct spglist *free)
2156{
2157	vm_page_t mpte;
2158
2159	if (va >= VM_MAXUSER_ADDRESS)
2160		return (0);
2161	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2162	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
2163	return (pmap_unwire_ptp(pmap, va, mpte, free));
2164}
2165
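/*
 * Initialize the pmap of process 0.  This pmap shares the kernel's page
 * table (KPML4phys), so no page table pages are allocated here.
 */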
2166void
2167pmap_pinit0(pmap_t pmap)
2168{
2169
2170	PMAP_LOCK_INIT(pmap);
2171	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
2172	pmap->pm_cr3 = KPML4phys;
2173	pmap->pm_root.rt_root = 0;
2174	CPU_ZERO(&pmap->pm_active);
2175	CPU_ZERO(&pmap->pm_save);
2176	PCPU_SET(curpmap, pmap);
2177	TAILQ_INIT(&pmap->pm_pvchunk);
2178	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2179	pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
2180	pmap->pm_flags = pmap_flags;
2181}
2182
2183/*
2184 * Initialize a preallocated and zeroed pmap structure,
2185 * such as one in a vmspace structure.
2186 */
2187int
2188pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
2189{
2190	vm_page_t pml4pg;
2191	vm_paddr_t pml4phys;
2192	int i;
2193
2194	/*
2195	 * Allocate the page map level 4 (PML4) page.
2196	 */
2197	while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2198	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
2199		VM_WAIT;
2200
2201	pml4phys = VM_PAGE_TO_PHYS(pml4pg);
2202	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
2203	pmap->pm_pcid = -1;
2204	pmap->pm_cr3 = ~0;	/* initialize to an invalid value */
2205
2206	if ((pml4pg->flags & PG_ZERO) == 0)
2207		pagezero(pmap->pm_pml4);
2208
2209	/*
2210	 * Do not install the host kernel mappings in the nested page
2211	 * tables. These mappings are meaningless in the guest physical
2212	 * address space.
2213	 */
2214	if ((pmap->pm_type = pm_type) == PT_X86) {
2215		pmap->pm_cr3 = pml4phys;
2216
2217		/* Wire in kernel global address entries. */
2218		for (i = 0; i < NKPML4E; i++) {
2219			pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) |
2220			    X86_PG_RW | X86_PG_V | PG_U;
2221		}
2222		for (i = 0; i < ndmpdpphys; i++) {
2223			pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) |
2224			    X86_PG_RW | X86_PG_V | PG_U;
2225		}
2226
2227		/* Install the self-referential address mapping entry. */
2228		pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
2229		    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2230
2231		if (pmap_pcid_enabled) {
2232			pmap->pm_pcid = alloc_unr(&pcid_unr);
2233			if (pmap->pm_pcid != -1)
2234				pmap->pm_cr3 |= pmap->pm_pcid;
2235		}
2236	}
2237
2238	pmap->pm_root.rt_root = 0;
2239	CPU_ZERO(&pmap->pm_active);
2240	TAILQ_INIT(&pmap->pm_pvchunk);
2241	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2242	pmap->pm_flags = flags;
2243	pmap->pm_eptgen = 0;
2244	CPU_ZERO(&pmap->pm_save);
2245
2246	return (1);
2247}
2248
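/*
 * Initialize a preallocated and zeroed pmap structure as an ordinary
 * (PT_X86) pmap.
 */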
2249int
2250pmap_pinit(pmap_t pmap)
2251{
2252
2253	return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
2254}
2255
2256/*
2257 * This routine is called if the desired page table page does not exist.
2258 *
2259 * If page table page allocation fails, this routine may sleep before
2260 * returning NULL.  It sleeps only if a lock pointer was given.
2261 *
2262 * Note: If a page allocation fails at page table level two or three,
2263 * one or two pages may be held during the wait, only to be released
2264 * afterwards.  This conservative approach is easily argued to avoid
2265 * race conditions.
2266 */
2267static vm_page_t
2268_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2269{
2270	vm_page_t m, pdppg, pdpg;
2271	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
2272
2273	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2274
2275	PG_A = pmap_accessed_bit(pmap);
2276	PG_M = pmap_modified_bit(pmap);
2277	PG_V = pmap_valid_bit(pmap);
2278	PG_RW = pmap_rw_bit(pmap);
2279
2280	/*
2281	 * Allocate a page table page.
2282	 */
2283	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
2284	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2285		if (lockp != NULL) {
2286			RELEASE_PV_LIST_LOCK(lockp);
2287			PMAP_UNLOCK(pmap);
2288			rw_runlock(&pvh_global_lock);
2289			VM_WAIT;
2290			rw_rlock(&pvh_global_lock);
2291			PMAP_LOCK(pmap);
2292		}
2293
2294		/*
2295		 * Indicate the need to retry.  While waiting, the page table
2296		 * page may have been allocated.
2297		 */
2298		return (NULL);
2299	}
2300	if ((m->flags & PG_ZERO) == 0)
2301		pmap_zero_page(m);
2302
2303	/*
2304	 * Map the pagetable page into the process address space, if
2305	 * it isn't already there.
2306	 */
2307
2308	if (ptepindex >= (NUPDE + NUPDPE)) {
2309		pml4_entry_t *pml4;
2310		vm_pindex_t pml4index;
2311
2312		/* Wire up a new PDPE page */
2313		pml4index = ptepindex - (NUPDE + NUPDPE);
2314		pml4 = &pmap->pm_pml4[pml4index];
2315		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2316
2317	} else if (ptepindex >= NUPDE) {
2318		vm_pindex_t pml4index;
2319		vm_pindex_t pdpindex;
2320		pml4_entry_t *pml4;
2321		pdp_entry_t *pdp;
2322
2323		/* Wire up a new PDE page */
2324		pdpindex = ptepindex - NUPDE;
2325		pml4index = pdpindex >> NPML4EPGSHIFT;
2326
2327		pml4 = &pmap->pm_pml4[pml4index];
2328		if ((*pml4 & PG_V) == 0) {
2329			/* Have to allocate a new pdp, recurse */
2330			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
2331			    lockp) == NULL) {
2332				--m->wire_count;
2333				atomic_subtract_int(&cnt.v_wire_count, 1);
2334				vm_page_free_zero(m);
2335				return (NULL);
2336			}
2337		} else {
2338			/* Add reference to pdp page */
2339			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
2340			pdppg->wire_count++;
2341		}
2342		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2343
2344		/* Now find the pdp page */
2345		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2346		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2347
2348	} else {
2349		vm_pindex_t pml4index;
2350		vm_pindex_t pdpindex;
2351		pml4_entry_t *pml4;
2352		pdp_entry_t *pdp;
2353		pd_entry_t *pd;
2354
2355		/* Wire up a new PTE page */
2356		pdpindex = ptepindex >> NPDPEPGSHIFT;
2357		pml4index = pdpindex >> NPML4EPGSHIFT;
2358
2359		/* First, find the pdp and check that it is valid. */
2360		pml4 = &pmap->pm_pml4[pml4index];
2361		if ((*pml4 & PG_V) == 0) {
2362			/* Have to allocate a new pd, recurse */
2363			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2364			    lockp) == NULL) {
2365				--m->wire_count;
2366				atomic_subtract_int(&cnt.v_wire_count, 1);
2367				vm_page_free_zero(m);
2368				return (NULL);
2369			}
2370			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2371			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2372		} else {
2373			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2374			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2375			if ((*pdp & PG_V) == 0) {
2376				/* Have to allocate a new pd, recurse */
2377				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2378				    lockp) == NULL) {
2379					--m->wire_count;
2380					atomic_subtract_int(&cnt.v_wire_count,
2381					    1);
2382					vm_page_free_zero(m);
2383					return (NULL);
2384				}
2385			} else {
2386				/* Add reference to the pd page */
2387				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
2388				pdpg->wire_count++;
2389			}
2390		}
2391		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
2392
2393		/* Now we know where the page directory page is */
2394		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
2395		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2396	}
2397
2398	pmap_resident_count_inc(pmap, 1);
2399
2400	return (m);
2401}
2402
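/*
 * Returns, with a reference added, the page directory page that contains
 * the page directory entry for the specified virtual address.  If that
 * page does not exist, it is allocated, along with any missing
 * higher-level page table pages.  Returns NULL only if the allocation
 * fails and "lockp" is NULL.
 */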
2403static vm_page_t
2404pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2405{
2406	vm_pindex_t pdpindex, ptepindex;
2407	pdp_entry_t *pdpe, PG_V;
2408	vm_page_t pdpg;
2409
2410	PG_V = pmap_valid_bit(pmap);
2411
2412retry:
2413	pdpe = pmap_pdpe(pmap, va);
2414	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
2415		/* Add a reference to the pd page. */
2416		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
2417		pdpg->wire_count++;
2418	} else {
2419		/* Allocate a pd page. */
2420		ptepindex = pmap_pde_pindex(va);
2421		pdpindex = ptepindex >> NPDPEPGSHIFT;
2422		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
2423		if (pdpg == NULL && lockp != NULL)
2424			goto retry;
2425	}
2426	return (pdpg);
2427}
2428
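/*
 * Returns, with a reference added, the page table page that contains the
 * page table entry for the specified virtual address, allocating the page
 * (and any missing higher-level page table pages) if necessary.  If the
 * address is within a 2MB page mapping, that mapping is first demoted.
 * Returns NULL only if the allocation fails and "lockp" is NULL.
 */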
2429static vm_page_t
2430pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2431{
2432	vm_pindex_t ptepindex;
2433	pd_entry_t *pd, PG_V;
2434	vm_page_t m;
2435
2436	PG_V = pmap_valid_bit(pmap);
2437
2438	/*
2439	 * Calculate pagetable page index
2440	 */
2441	ptepindex = pmap_pde_pindex(va);
2442retry:
2443	/*
2444	 * Get the page directory entry
2445	 */
2446	pd = pmap_pde(pmap, va);
2447
2448	/*
2449	 * This supports switching from a 2MB page to a
2450	 * normal 4K page.
2451	 */
2452	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
2453		if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
2454			/*
2455			 * Invalidation of the 2MB page mapping may have caused
2456			 * the deallocation of the underlying PD page.
2457			 */
2458			pd = NULL;
2459		}
2460	}
2461
2462	/*
2463	 * If the page table page is mapped, we just increment the
2464	 * hold count, and activate it.
2465	 */
2466	if (pd != NULL && (*pd & PG_V) != 0) {
2467		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
2468		m->wire_count++;
2469	} else {
2470		/*
2471		 * Here if the pte page isn't mapped, or if it has been
2472		 * deallocated.
2473		 */
2474		m = _pmap_allocpte(pmap, ptepindex, lockp);
2475		if (m == NULL && lockp != NULL)
2476			goto retry;
2477	}
2478	return (m);
2479}
2480
2481
2482/***************************************************
2483 * Pmap allocation/deallocation routines.
2484 ***************************************************/
2485
2486/*
2487 * Release any resources held by the given physical map.
2488 * Called when a pmap initialized by pmap_pinit is being released.
2489 * Should only be called if the map contains no valid mappings.
2490 */
2491void
2492pmap_release(pmap_t pmap)
2493{
2494	vm_page_t m;
2495	int i;
2496
2497	KASSERT(pmap->pm_stats.resident_count == 0,
2498	    ("pmap_release: pmap resident count %ld != 0",
2499	    pmap->pm_stats.resident_count));
2500	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2501	    ("pmap_release: pmap has reserved page table page(s)"));
2502
2503	if (pmap_pcid_enabled) {
2504		/*
2505		 * Invalidate any remaining TLB entries, to allow the
2506		 * reuse of the PCID.
2507		 */
2508		pmap_invalidate_all(pmap);
2509	}
2510
2511	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
2512
2513	for (i = 0; i < NKPML4E; i++)	/* KVA */
2514		pmap->pm_pml4[KPML4BASE + i] = 0;
2515	for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
2516		pmap->pm_pml4[DMPML4I + i] = 0;
2517	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
2518
2519	m->wire_count--;
2520	atomic_subtract_int(&cnt.v_wire_count, 1);
2521	vm_page_free_zero(m);
2522	if (pmap->pm_pcid != -1)
2523		free_unr(&pcid_unr, pmap->pm_pcid);
2524}
2525
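/*
 * Sysctl handlers reporting the total size of the kernel virtual address
 * space and the amount of it that remains free beyond "kernel_vm_end".
 */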
2526static int
2527kvm_size(SYSCTL_HANDLER_ARGS)
2528{
2529	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2530
2531	return sysctl_handle_long(oidp, &ksize, 0, req);
2532}
2533SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2534    0, 0, kvm_size, "LU", "Size of KVM");
2535
2536static int
2537kvm_free(SYSCTL_HANDLER_ARGS)
2538{
2539	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2540
2541	return sysctl_handle_long(oidp, &kfree, 0, req);
2542}
2543SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2544    0, 0, kvm_free, "LU", "Amount of KVM free");
2545
2546/*
2547 * Grow the number of kernel page table entries, if needed.
2548 */
2549void
2550pmap_growkernel(vm_offset_t addr)
2551{
2552	vm_paddr_t paddr;
2553	vm_page_t nkpg;
2554	pd_entry_t *pde, newpdir;
2555	pdp_entry_t *pdpe;
2556
2557	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2558
2559	/*
2560	 * Return if "addr" is within the range of kernel page table pages
2561	 * that were preallocated during pmap bootstrap.  Moreover, leave
2562	 * "kernel_vm_end" and the kernel page table as they were.
2563	 *
2564	 * The correctness of this action is based on the following
2565	 * argument: vm_map_findspace() allocates contiguous ranges of the
2566	 * kernel virtual address space.  It calls this function if a range
2567	 * ends after "kernel_vm_end".  If the kernel is mapped between
2568	 * "kernel_vm_end" and "addr", then the range cannot begin at
2569	 * "kernel_vm_end".  In fact, its beginning address cannot be less
2570	 * than the kernel.  Thus, there is no immediate need to allocate
2571	 * any new kernel page table pages between "kernel_vm_end" and
2572	 * "KERNBASE".
2573	 */
2574	if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
2575		return;
2576
2577	addr = roundup2(addr, NBPDR);
2578	if (addr - 1 >= kernel_map->max_offset)
2579		addr = kernel_map->max_offset;
2580	while (kernel_vm_end < addr) {
2581		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
2582		if ((*pdpe & X86_PG_V) == 0) {
2583			/* We need a new PDP entry */
2584			nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
2585			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
2586			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2587			if (nkpg == NULL)
2588				panic("pmap_growkernel: no memory to grow kernel");
2589			if ((nkpg->flags & PG_ZERO) == 0)
2590				pmap_zero_page(nkpg);
2591			paddr = VM_PAGE_TO_PHYS(nkpg);
2592			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
2593			    X86_PG_A | X86_PG_M);
2594			continue; /* try again */
2595		}
2596		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
2597		if ((*pde & X86_PG_V) != 0) {
2598			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2599			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2600				kernel_vm_end = kernel_map->max_offset;
2601				break;
2602			}
2603			continue;
2604		}
2605
2606		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
2607		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2608		    VM_ALLOC_ZERO);
2609		if (nkpg == NULL)
2610			panic("pmap_growkernel: no memory to grow kernel");
2611		if ((nkpg->flags & PG_ZERO) == 0)
2612			pmap_zero_page(nkpg);
2613		paddr = VM_PAGE_TO_PHYS(nkpg);
2614		newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2615		pde_store(pde, newpdir);
2616
2617		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2618		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2619			kernel_vm_end = kernel_map->max_offset;
2620			break;
2621		}
2622	}
2623}
2624
2625
2626/***************************************************
2627 * page management routines.
2628 ***************************************************/
2629
2630CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2631CTASSERT(_NPCM == 3);
2632CTASSERT(_NPCPV == 168);
2633
2634static __inline struct pv_chunk *
2635pv_to_chunk(pv_entry_t pv)
2636{
2637
2638	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2639}
2640
2641#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2642
2643#define	PC_FREE0	0xfffffffffffffffful
2644#define	PC_FREE1	0xfffffffffffffffful
2645#define	PC_FREE2	0x000000fffffffffful
2646
2647static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
2648
2649#ifdef PV_STATS
2650static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2651
2652SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2653	"Current number of pv entry chunks");
2654SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2655	"Total number of pv entry chunks allocated");
2656SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2657	"Total number of pv entry chunks freed");
2658SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2659	"Number of failed attempts to allocate a pv entry chunk page");
2660
2661static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2662static int pv_entry_spare;
2663
2664SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2665	"Total number of pv entries freed");
2666SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2667	"Total number of pv entries allocated");
2668SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2669	"Current number of pv entries");
2670SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2671	"Current number of spare pv entries");
2672#endif
2673
2674/*
2675 * We are in a serious low memory condition.  Resort to
2676 * drastic measures to free some pages so we can allocate
2677 * another pv entry chunk.
2678 *
2679 * Returns NULL if PV entries were reclaimed from the specified pmap.
2680 *
2681 * We do not, however, unmap 2mpages because subsequent accesses will
2682 * allocate per-page pv entries until repromotion occurs, thereby
2683 * exacerbating the shortage of free pv entries.
2684 */
2685static vm_page_t
2686reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2687{
2688	struct pch new_tail;
2689	struct pv_chunk *pc;
2690	struct md_page *pvh;
2691	pd_entry_t *pde;
2692	pmap_t pmap;
2693	pt_entry_t *pte, tpte;
2694	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
2695	pv_entry_t pv;
2696	vm_offset_t va;
2697	vm_page_t m, m_pc;
2698	struct spglist free;
2699	uint64_t inuse;
2700	int bit, field, freed;
2701
2702	rw_assert(&pvh_global_lock, RA_LOCKED);
2703	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2704	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
2705	pmap = NULL;
2706	m_pc = NULL;
2707	PG_G = PG_A = PG_M = PG_RW = 0;
2708	SLIST_INIT(&free);
2709	TAILQ_INIT(&new_tail);
2710	mtx_lock(&pv_chunks_mutex);
2711	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
2712		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2713		mtx_unlock(&pv_chunks_mutex);
2714		if (pmap != pc->pc_pmap) {
2715			if (pmap != NULL) {
2716				pmap_invalidate_all(pmap);
2717				if (pmap != locked_pmap)
2718					PMAP_UNLOCK(pmap);
2719			}
2720			pmap = pc->pc_pmap;
2721			/* Avoid deadlock and lock recursion. */
2722			if (pmap > locked_pmap) {
2723				RELEASE_PV_LIST_LOCK(lockp);
2724				PMAP_LOCK(pmap);
2725			} else if (pmap != locked_pmap &&
2726			    !PMAP_TRYLOCK(pmap)) {
2727				pmap = NULL;
2728				TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2729				mtx_lock(&pv_chunks_mutex);
2730				continue;
2731			}
2732			PG_G = pmap_global_bit(pmap);
2733			PG_A = pmap_accessed_bit(pmap);
2734			PG_M = pmap_modified_bit(pmap);
2735			PG_RW = pmap_rw_bit(pmap);
2736		}
2737
2738		/*
2739		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2740		 */
2741		freed = 0;
2742		for (field = 0; field < _NPCM; field++) {
2743			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2744			    inuse != 0; inuse &= ~(1UL << bit)) {
2745				bit = bsfq(inuse);
2746				pv = &pc->pc_pventry[field * 64 + bit];
2747				va = pv->pv_va;
2748				pde = pmap_pde(pmap, va);
2749				if ((*pde & PG_PS) != 0)
2750					continue;
2751				pte = pmap_pde_to_pte(pde, va);
2752				if ((*pte & PG_W) != 0)
2753					continue;
2754				tpte = pte_load_clear(pte);
2755				if ((tpte & PG_G) != 0)
2756					pmap_invalidate_page(pmap, va);
2757				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2758				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2759					vm_page_dirty(m);
2760				if ((tpte & PG_A) != 0)
2761					vm_page_aflag_set(m, PGA_REFERENCED);
2762				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2763				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2764				m->md.pv_gen++;
2765				if (TAILQ_EMPTY(&m->md.pv_list) &&
2766				    (m->flags & PG_FICTITIOUS) == 0) {
2767					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2768					if (TAILQ_EMPTY(&pvh->pv_list)) {
2769						vm_page_aflag_clear(m,
2770						    PGA_WRITEABLE);
2771					}
2772				}
2773				pc->pc_map[field] |= 1UL << bit;
2774				pmap_unuse_pt(pmap, va, *pde, &free);
2775				freed++;
2776			}
2777		}
2778		if (freed == 0) {
2779			TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2780			mtx_lock(&pv_chunks_mutex);
2781			continue;
2782		}
2783		/* Every freed mapping is for a 4 KB page. */
2784		pmap_resident_count_dec(pmap, freed);
2785		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
2786		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
2787		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
2788		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2789		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
2790		    pc->pc_map[2] == PC_FREE2) {
2791			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2792			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2793			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2794			/* Entire chunk is free; return it. */
2795			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2796			dump_drop_page(m_pc->phys_addr);
2797			mtx_lock(&pv_chunks_mutex);
2798			break;
2799		}
2800		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2801		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2802		mtx_lock(&pv_chunks_mutex);
2803		/* One freed pv entry in locked_pmap is sufficient. */
2804		if (pmap == locked_pmap)
2805			break;
2806	}
2807	TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2808	mtx_unlock(&pv_chunks_mutex);
2809	if (pmap != NULL) {
2810		pmap_invalidate_all(pmap);
2811		if (pmap != locked_pmap)
2812			PMAP_UNLOCK(pmap);
2813	}
2814	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
2815		m_pc = SLIST_FIRST(&free);
2816		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2817		/* Recycle a freed page table page. */
2818		m_pc->wire_count = 1;
2819		atomic_add_int(&cnt.v_wire_count, 1);
2820	}
2821	pmap_free_zero_pages(&free);
2822	return (m_pc);
2823}
2824
2825/*
2826 * free the pv_entry back to the free list
2827 */
2828static void
2829free_pv_entry(pmap_t pmap, pv_entry_t pv)
2830{
2831	struct pv_chunk *pc;
2832	int idx, field, bit;
2833
2834	rw_assert(&pvh_global_lock, RA_LOCKED);
2835	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2836	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2837	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2838	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2839	pc = pv_to_chunk(pv);
2840	idx = pv - &pc->pc_pventry[0];
2841	field = idx / 64;
2842	bit = idx % 64;
2843	pc->pc_map[field] |= 1ul << bit;
2844	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
2845	    pc->pc_map[2] != PC_FREE2) {
2846		/* 98% of the time, pc is already at the head of the list. */
2847		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2848			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2849			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2850		}
2851		return;
2852	}
2853	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2854	free_pv_chunk(pc);
2855}
2856
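/*
 * Returns an entirely free PV chunk's page to the physical memory
 * allocator, removing the chunk from the global chunk list and updating
 * the PV statistics.
 */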
2857static void
2858free_pv_chunk(struct pv_chunk *pc)
2859{
2860	vm_page_t m;
2861
2862	mtx_lock(&pv_chunks_mutex);
2863 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2864	mtx_unlock(&pv_chunks_mutex);
2865	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2866	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2867	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2868	/* Entire chunk is free; return it. */
2869	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2870	dump_drop_page(m->phys_addr);
2871	vm_page_unwire(m, 0);
2872	vm_page_free(m);
2873}
2874
2875/*
2876 * Returns a new PV entry, allocating a new PV chunk from the system when
2877 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
2878 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
2879 * returned.
2880 *
2881 * The given PV list lock may be released.
2882 */
2883static pv_entry_t
2884get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2885{
2886	int bit, field;
2887	pv_entry_t pv;
2888	struct pv_chunk *pc;
2889	vm_page_t m;
2890
2891	rw_assert(&pvh_global_lock, RA_LOCKED);
2892	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2893	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2894retry:
2895	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2896	if (pc != NULL) {
2897		for (field = 0; field < _NPCM; field++) {
2898			if (pc->pc_map[field]) {
2899				bit = bsfq(pc->pc_map[field]);
2900				break;
2901			}
2902		}
2903		if (field < _NPCM) {
2904			pv = &pc->pc_pventry[field * 64 + bit];
2905			pc->pc_map[field] &= ~(1ul << bit);
2906			/* If this was the last item, move it to tail */
2907			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
2908			    pc->pc_map[2] == 0) {
2909				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2910				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2911				    pc_list);
2912			}
2913			PV_STAT(atomic_add_long(&pv_entry_count, 1));
2914			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2915			return (pv);
2916		}
2917	}
2918	/* No free items, allocate another chunk */
2919	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2920	    VM_ALLOC_WIRED);
2921	if (m == NULL) {
2922		if (lockp == NULL) {
2923			PV_STAT(pc_chunk_tryfail++);
2924			return (NULL);
2925		}
2926		m = reclaim_pv_chunk(pmap, lockp);
2927		if (m == NULL)
2928			goto retry;
2929	}
2930	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2931	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2932	dump_add_page(m->phys_addr);
2933	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2934	pc->pc_pmap = pmap;
2935	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
2936	pc->pc_map[1] = PC_FREE1;
2937	pc->pc_map[2] = PC_FREE2;
2938	mtx_lock(&pv_chunks_mutex);
2939	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2940	mtx_unlock(&pv_chunks_mutex);
2941	pv = &pc->pc_pventry[0];
2942	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2943	PV_STAT(atomic_add_long(&pv_entry_count, 1));
2944	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2945	return (pv);
2946}
2947
2948/*
2949 * Returns the number of one bits within the given PV chunk map element.
2950 */
2951static int
2952popcnt_pc_map_elem(uint64_t elem)
2953{
2954	int count;
2955
2956	/*
2957	 * This simple method of counting the one bits performs well because
2958	 * the given element typically contains more zero bits than one bits.
2959	 */
2960	count = 0;
2961	for (; elem != 0; elem &= elem - 1)
2962		count++;
2963	return (count);
2964}
2965
2966/*
2967 * Ensure that the number of spare PV entries in the specified pmap meets or
2968 * exceeds the given count, "needed".
2969 *
2970 * The given PV list lock may be released.
2971 */
2972static void
2973reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2974{
2975	struct pch new_tail;
2976	struct pv_chunk *pc;
2977	int avail, free;
2978	vm_page_t m;
2979
2980	rw_assert(&pvh_global_lock, RA_LOCKED);
2981	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2982	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2983
2984	/*
2985	 * Newly allocated PV chunks must be kept on a private list until
2986	 * the required number of PV chunks have been allocated.  Otherwise,
2987	 * reclaim_pv_chunk() could recycle one of these chunks.  They must,
2988	 * however, be added to the pmap's chunk list immediately.
2989	 */
2990	TAILQ_INIT(&new_tail);
2991retry:
2992	avail = 0;
2993	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
2994		if ((cpu_feature2 & CPUID2_POPCNT) == 0) {
2995			free = popcnt_pc_map_elem(pc->pc_map[0]);
2996			free += popcnt_pc_map_elem(pc->pc_map[1]);
2997			free += popcnt_pc_map_elem(pc->pc_map[2]);
2998		} else {
2999			free = popcntq(pc->pc_map[0]);
3000			free += popcntq(pc->pc_map[1]);
3001			free += popcntq(pc->pc_map[2]);
3002		}
3003		if (free == 0)
3004			break;
3005		avail += free;
3006		if (avail >= needed)
3007			break;
3008	}
3009	for (; avail < needed; avail += _NPCPV) {
3010		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3011		    VM_ALLOC_WIRED);
3012		if (m == NULL) {
3013			m = reclaim_pv_chunk(pmap, lockp);
3014			if (m == NULL)
3015				goto retry;
3016		}
3017		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3018		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3019		dump_add_page(m->phys_addr);
3020		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3021		pc->pc_pmap = pmap;
3022		pc->pc_map[0] = PC_FREE0;
3023		pc->pc_map[1] = PC_FREE1;
3024		pc->pc_map[2] = PC_FREE2;
3025		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3026		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
3027		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3028	}
3029	if (!TAILQ_EMPTY(&new_tail)) {
3030		mtx_lock(&pv_chunks_mutex);
3031		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
3032		mtx_unlock(&pv_chunks_mutex);
3033	}
3034}
3035
3036/*
3037 * First find and then remove the pv entry for the specified pmap and virtual
3038 * address from the specified pv list.  Returns the pv entry if found and NULL
3039 * otherwise.  This operation can be performed on pv lists for either 4KB or
3040 * 2MB page mappings.
3041 */
3042static __inline pv_entry_t
3043pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3044{
3045	pv_entry_t pv;
3046
3047	rw_assert(&pvh_global_lock, RA_LOCKED);
3048	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3049		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3050			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3051			pvh->pv_gen++;
3052			break;
3053		}
3054	}
3055	return (pv);
3056}
3057
3058/*
3059 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3060 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3061 * entries for each of the 4KB page mappings.
3062 */
3063static void
3064pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3065    struct rwlock **lockp)
3066{
3067	struct md_page *pvh;
3068	struct pv_chunk *pc;
3069	pv_entry_t pv;
3070	vm_offset_t va_last;
3071	vm_page_t m;
3072	int bit, field;
3073
3074	rw_assert(&pvh_global_lock, RA_LOCKED);
3075	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3076	KASSERT((pa & PDRMASK) == 0,
3077	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
3078	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3079
3080	/*
3081	 * Transfer the 2mpage's pv entry for this mapping to the first
3082	 * page's pv list.  Once this transfer begins, the pv list lock
3083	 * must not be released until the last pv entry is reinstantiated.
3084	 */
3085	pvh = pa_to_pvh(pa);
3086	va = trunc_2mpage(va);
3087	pv = pmap_pvh_remove(pvh, pmap, va);
3088	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
3089	m = PHYS_TO_VM_PAGE(pa);
3090	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3091	m->md.pv_gen++;
3092	/* Instantiate the remaining NPTEPG - 1 pv entries. */
3093	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
3094	va_last = va + NBPDR - PAGE_SIZE;
3095	for (;;) {
3096		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3097		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
3098		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
3099		for (field = 0; field < _NPCM; field++) {
3100			while (pc->pc_map[field]) {
3101				bit = bsfq(pc->pc_map[field]);
3102				pc->pc_map[field] &= ~(1ul << bit);
3103				pv = &pc->pc_pventry[field * 64 + bit];
3104				va += PAGE_SIZE;
3105				pv->pv_va = va;
3106				m++;
3107				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3108			    ("pmap_pv_demote_pde: page %p is not managed", m));
3109				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3110				m->md.pv_gen++;
3111				if (va == va_last)
3112					goto out;
3113			}
3114		}
3115		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3116		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3117	}
3118out:
3119	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
3120		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3121		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3122	}
3123	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
3124	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
3125}
3126
3127/*
3128 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
3129 * replace the many pv entries for the 4KB page mappings by a single pv entry
3130 * for the 2MB page mapping.
3131 */
3132static void
3133pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3134    struct rwlock **lockp)
3135{
3136	struct md_page *pvh;
3137	pv_entry_t pv;
3138	vm_offset_t va_last;
3139	vm_page_t m;
3140
3141	rw_assert(&pvh_global_lock, RA_LOCKED);
3142	KASSERT((pa & PDRMASK) == 0,
3143	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
3144	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3145
3146	/*
3147	 * Transfer the first page's pv entry for this mapping to the 2mpage's
3148	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
3149	 * a transfer avoids the possibility that get_pv_entry() calls
3150	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
3151	 * mappings that is being promoted.
3152	 */
3153	m = PHYS_TO_VM_PAGE(pa);
3154	va = trunc_2mpage(va);
3155	pv = pmap_pvh_remove(&m->md, pmap, va);
3156	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
3157	pvh = pa_to_pvh(pa);
3158	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3159	pvh->pv_gen++;
3160	/* Free the remaining NPTEPG - 1 pv entries. */
3161	va_last = va + NBPDR - PAGE_SIZE;
3162	do {
3163		m++;
3164		va += PAGE_SIZE;
3165		pmap_pvh_free(&m->md, pmap, va);
3166	} while (va < va_last);
3167}
3168
3169/*
3170 * First find and then destroy the pv entry for the specified pmap and virtual
3171 * address.  This operation can be performed on pv lists for either 4KB or 2MB
3172 * page mappings.
3173 */
3174static void
3175pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3176{
3177	pv_entry_t pv;
3178
3179	pv = pmap_pvh_remove(pvh, pmap, va);
3180	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3181	free_pv_entry(pmap, pv);
3182}
3183
3184/*
3185 * Conditionally create the PV entry for a 4KB page mapping if the required
3186 * memory can be allocated without resorting to reclamation.
3187 */
3188static boolean_t
3189pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3190    struct rwlock **lockp)
3191{
3192	pv_entry_t pv;
3193
3194	rw_assert(&pvh_global_lock, RA_LOCKED);
3195	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3196	/* Pass NULL instead of the lock pointer to disable reclamation. */
3197	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3198		pv->pv_va = va;
3199		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3200		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3201		m->md.pv_gen++;
3202		return (TRUE);
3203	} else
3204		return (FALSE);
3205}
3206
3207/*
3208 * Conditionally create the PV entry for a 2MB page mapping if the required
3209 * memory can be allocated without resorting to reclamation.
3210 */
3211static boolean_t
3212pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3213    struct rwlock **lockp)
3214{
3215	struct md_page *pvh;
3216	pv_entry_t pv;
3217
3218	rw_assert(&pvh_global_lock, RA_LOCKED);
3219	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3220	/* Pass NULL instead of the lock pointer to disable reclamation. */
3221	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3222		pv->pv_va = va;
3223		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3224		pvh = pa_to_pvh(pa);
3225		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3226		pvh->pv_gen++;
3227		return (TRUE);
3228	} else
3229		return (FALSE);
3230}
3231
3232/*
3233 * Fills a page table page with mappings to consecutive physical pages.
3234 */
3235static void
3236pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
3237{
3238	pt_entry_t *pte;
3239
3240	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
3241		*pte = newpte;
3242		newpte += PAGE_SIZE;
3243	}
3244}
3245
3246/*
3247 * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
3248 * mapping is invalidated.
3249 */
3250static boolean_t
3251pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3252{
3253	struct rwlock *lock;
3254	boolean_t rv;
3255
3256	lock = NULL;
3257	rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
3258	if (lock != NULL)
3259		rw_wunlock(lock);
3260	return (rv);
3261}
3262
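/*
 * Internal form of pmap_demote_pde() that takes, and may change, the
 * current PV list lock through "lockp" instead of acquiring and releasing
 * one itself.
 */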
3263static boolean_t
3264pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3265    struct rwlock **lockp)
3266{
3267	pd_entry_t newpde, oldpde;
3268	pt_entry_t *firstpte, newpte;
3269	pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
3270	vm_paddr_t mptepa;
3271	vm_page_t mpte;
3272	struct spglist free;
3273	int PG_PTE_CACHE;
3274
3275	PG_G = pmap_global_bit(pmap);
3276	PG_A = pmap_accessed_bit(pmap);
3277	PG_M = pmap_modified_bit(pmap);
3278	PG_RW = pmap_rw_bit(pmap);
3279	PG_V = pmap_valid_bit(pmap);
3280	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
3281
3282	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3283	oldpde = *pde;
3284	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
3285	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
3286	if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
3287	    NULL)
3288		pmap_remove_pt_page(pmap, mpte);
3289	else {
3290		KASSERT((oldpde & PG_W) == 0,
3291		    ("pmap_demote_pde: page table page for a wired mapping"
3292		    " is missing"));
3293
3294		/*
3295		 * Invalidate the 2MB page mapping and return "failure" if the
3296		 * mapping was never accessed or the allocation of the new
3297		 * page table page fails.  If the 2MB page mapping belongs to
3298		 * the direct map region of the kernel's address space, then
3299		 * the page allocation request specifies the highest possible
3300		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
3301		 * normal.  Page table pages are preallocated for every other
3302		 * part of the kernel address space, so the direct map region
3303		 * is the only part of the kernel address space that must be
3304		 * handled here.
3305		 */
3306		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
3307		    pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
3308		    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
3309		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
3310			SLIST_INIT(&free);
3311			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
3312			    lockp);
3313			pmap_invalidate_page(pmap, trunc_2mpage(va));
3314			pmap_free_zero_pages(&free);
3315			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
3316			    " in pmap %p", va, pmap);
3317			return (FALSE);
3318		}
3319		if (va < VM_MAXUSER_ADDRESS)
3320			pmap_resident_count_inc(pmap, 1);
3321	}
3322	mptepa = VM_PAGE_TO_PHYS(mpte);
3323	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
3324	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
3325	KASSERT((oldpde & PG_A) != 0,
3326	    ("pmap_demote_pde: oldpde is missing PG_A"));
3327	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
3328	    ("pmap_demote_pde: oldpde is missing PG_M"));
3329	newpte = oldpde & ~PG_PS;
3330	newpte = pmap_swap_pat(pmap, newpte);
3331
3332	/*
3333	 * If the page table page is new, initialize it.
3334	 */
3335	if (mpte->wire_count == 1) {
3336		mpte->wire_count = NPTEPG;
3337		pmap_fill_ptp(firstpte, newpte);
3338	}
3339	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
3340	    ("pmap_demote_pde: firstpte and newpte map different physical"
3341	    " addresses"));
3342
3343	/*
3344	 * If the mapping has changed attributes, update the page table
3345	 * entries.
3346	 */
3347	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
3348		pmap_fill_ptp(firstpte, newpte);
3349
3350	/*
3351	 * The spare PV entries must be reserved prior to demoting the
3352	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
3353	 * of the PDE and the PV lists will be inconsistent, which can result
3354	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
3355	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
3356	 * PV entry for the 2MB page mapping that is being demoted.
3357	 */
3358	if ((oldpde & PG_MANAGED) != 0)
3359		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
3360
3361	/*
3362	 * Demote the mapping.  This pmap is locked.  The old PDE has
3363	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
3364	 * set.  Thus, there is no danger of a race with another
3365	 * processor changing the setting of PG_A and/or PG_M between
3366	 * the read above and the store below.
3367	 */
3368	if (workaround_erratum383)
3369		pmap_update_pde(pmap, va, pde, newpde);
3370	else
3371		pde_store(pde, newpde);
3372
3373	/*
3374	 * Invalidate a stale recursive mapping of the page table page.
3375	 */
3376	if (va >= VM_MAXUSER_ADDRESS)
3377		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3378
3379	/*
3380	 * Demote the PV entry.
3381	 */
3382	if ((oldpde & PG_MANAGED) != 0)
3383		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
3384
3385	atomic_add_long(&pmap_pde_demotions, 1);
3386	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
3387	    " in pmap %p", va, pmap);
3388	return (TRUE);
3389}
3390
3391/*
3392 * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
3393 */
3394static void
3395pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3396{
3397	pd_entry_t newpde;
3398	vm_paddr_t mptepa;
3399	vm_page_t mpte;
3400
3401	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3402	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3403	mpte = pmap_lookup_pt_page(pmap, va);
3404	if (mpte == NULL)
3405		panic("pmap_remove_kernel_pde: Missing pt page.");
3406
3407	pmap_remove_pt_page(pmap, mpte);
3408	mptepa = VM_PAGE_TO_PHYS(mpte);
3409	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
3410
3411	/*
3412	 * Initialize the page table page.
3413	 */
3414	pagezero((void *)PHYS_TO_DMAP(mptepa));
3415
3416	/*
3417	 * Demote the mapping.
3418	 */
3419	if (workaround_erratum383)
3420		pmap_update_pde(pmap, va, pde, newpde);
3421	else
3422		pde_store(pde, newpde);
3423
3424	/*
3425	 * Invalidate a stale recursive mapping of the page table page.
3426	 */
3427	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3428}
3429
3430/*
3431 * pmap_remove_pde: Unmap a 2MB superpage mapping from the given pmap.
3432 */
3433static int
3434pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
3435    struct spglist *free, struct rwlock **lockp)
3436{
3437	struct md_page *pvh;
3438	pd_entry_t oldpde;
3439	vm_offset_t eva, va;
3440	vm_page_t m, mpte;
3441	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
3442
3443	PG_G = pmap_global_bit(pmap);
3444	PG_A = pmap_accessed_bit(pmap);
3445	PG_M = pmap_modified_bit(pmap);
3446	PG_RW = pmap_rw_bit(pmap);
3447
3448	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3449	KASSERT((sva & PDRMASK) == 0,
3450	    ("pmap_remove_pde: sva is not 2mpage aligned"));
3451	oldpde = pte_load_clear(pdq);
3452	if (oldpde & PG_W)
3453		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
3454
3455	/*
3456	 * Machines that don't support invlpg also don't support
3457	 * PG_G.
3458	 */
3459	if (oldpde & PG_G)
3460		pmap_invalidate_page(kernel_pmap, sva);
3461	pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
3462	if (oldpde & PG_MANAGED) {
3463		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
3464		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
3465		pmap_pvh_free(pvh, pmap, sva);
3466		eva = sva + NBPDR;
3467		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3468		    va < eva; va += PAGE_SIZE, m++) {
3469			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3470				vm_page_dirty(m);
3471			if (oldpde & PG_A)
3472				vm_page_aflag_set(m, PGA_REFERENCED);
3473			if (TAILQ_EMPTY(&m->md.pv_list) &&
3474			    TAILQ_EMPTY(&pvh->pv_list))
3475				vm_page_aflag_clear(m, PGA_WRITEABLE);
3476		}
3477	}
3478	if (pmap == kernel_pmap) {
3479		pmap_remove_kernel_pde(pmap, pdq, sva);
3480	} else {
3481		mpte = pmap_lookup_pt_page(pmap, sva);
3482		if (mpte != NULL) {
3483			pmap_remove_pt_page(pmap, mpte);
3484			pmap_resident_count_dec(pmap, 1);
3485			KASSERT(mpte->wire_count == NPTEPG,
3486			    ("pmap_remove_pde: pte page wire count error"));
3487			mpte->wire_count = 0;
3488			pmap_add_delayed_free_list(mpte, free, FALSE);
3489			atomic_subtract_int(&cnt.v_wire_count, 1);
3490		}
3491	}
3492	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
3493}
3494
3495/*
3496 * pmap_remove_pte: Unmap a single 4KB page from the given pmap.
3497 */
3498static int
3499pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
3500    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
3501{
3502	struct md_page *pvh;
3503	pt_entry_t oldpte, PG_A, PG_M, PG_RW;
3504	vm_page_t m;
3505
3506	PG_A = pmap_accessed_bit(pmap);
3507	PG_M = pmap_modified_bit(pmap);
3508	PG_RW = pmap_rw_bit(pmap);
3509
3510	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3511	oldpte = pte_load_clear(ptq);
3512	if (oldpte & PG_W)
3513		pmap->pm_stats.wired_count -= 1;
3514	pmap_resident_count_dec(pmap, 1);
3515	if (oldpte & PG_MANAGED) {
3516		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
3517		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3518			vm_page_dirty(m);
3519		if (oldpte & PG_A)
3520			vm_page_aflag_set(m, PGA_REFERENCED);
3521		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3522		pmap_pvh_free(&m->md, pmap, va);
3523		if (TAILQ_EMPTY(&m->md.pv_list) &&
3524		    (m->flags & PG_FICTITIOUS) == 0) {
3525			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3526			if (TAILQ_EMPTY(&pvh->pv_list))
3527				vm_page_aflag_clear(m, PGA_WRITEABLE);
3528		}
3529	}
3530	return (pmap_unuse_pt(pmap, va, ptepde, free));
3531}
3532
3533/*
3534 * Remove a single page from a process address space
3535 */
3536static void
3537pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
3538    struct spglist *free)
3539{
3540	struct rwlock *lock;
3541	pt_entry_t *pte, PG_V;
3542
3543	PG_V = pmap_valid_bit(pmap);
3544	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3545	if ((*pde & PG_V) == 0)
3546		return;
3547	pte = pmap_pde_to_pte(pde, va);
3548	if ((*pte & PG_V) == 0)
3549		return;
3550	lock = NULL;
3551	pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
3552	if (lock != NULL)
3553		rw_wunlock(lock);
3554	pmap_invalidate_page(pmap, va);
3555}
3556
3557/*
3558 *	Remove the given range of addresses from the specified map.
3559 *
3560 *	It is assumed that the start and end are properly
3561 *	rounded to the page size.
3562 */
3563void
3564pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3565{
3566	struct rwlock *lock;
3567	vm_offset_t va, va_next;
3568	pml4_entry_t *pml4e;
3569	pdp_entry_t *pdpe;
3570	pd_entry_t ptpaddr, *pde;
3571	pt_entry_t *pte, PG_G, PG_V;
3572	struct spglist free;
3573	int anyvalid;
3574
3575	PG_G = pmap_global_bit(pmap);
3576	PG_V = pmap_valid_bit(pmap);
3577
3578	/*
3579	 * Perform an unsynchronized read.  This is, however, safe.
3580	 */
3581	if (pmap->pm_stats.resident_count == 0)
3582		return;
3583
3584	anyvalid = 0;
3585	SLIST_INIT(&free);
3586
3587	rw_rlock(&pvh_global_lock);
3588	PMAP_LOCK(pmap);
3589
3590	/*
3591	 * Special handling for removing a single page.  This is a
3592	 * very common operation, and it allows some of the code
3593	 * below to be short-circuited.
3594	 */
3595	if (sva + PAGE_SIZE == eva) {
3596		pde = pmap_pde(pmap, sva);
3597		if (pde && (*pde & PG_PS) == 0) {
3598			pmap_remove_page(pmap, sva, pde, &free);
3599			goto out;
3600		}
3601	}
3602
3603	lock = NULL;
3604	for (; sva < eva; sva = va_next) {
3605
3606		if (pmap->pm_stats.resident_count == 0)
3607			break;
3608
3609		pml4e = pmap_pml4e(pmap, sva);
3610		if ((*pml4e & PG_V) == 0) {
3611			va_next = (sva + NBPML4) & ~PML4MASK;
3612			if (va_next < sva)
3613				va_next = eva;
3614			continue;
3615		}
3616
3617		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3618		if ((*pdpe & PG_V) == 0) {
3619			va_next = (sva + NBPDP) & ~PDPMASK;
3620			if (va_next < sva)
3621				va_next = eva;
3622			continue;
3623		}
3624
3625		/*
3626		 * Calculate index for next page table.
3627		 */
3628		va_next = (sva + NBPDR) & ~PDRMASK;
3629		if (va_next < sva)
3630			va_next = eva;
3631
3632		pde = pmap_pdpe_to_pde(pdpe, sva);
3633		ptpaddr = *pde;
3634
3635		/*
3636		 * Weed out invalid mappings.
3637		 */
3638		if (ptpaddr == 0)
3639			continue;
3640
3641		/*
3642		 * Check for large page.
3643		 */
3644		if ((ptpaddr & PG_PS) != 0) {
3645			/*
3646			 * Are we removing the entire large page?  If not,
3647			 * demote the mapping and fall through.
3648			 */
3649			if (sva + NBPDR == va_next && eva >= va_next) {
3650				/*
3651				 * The TLB entry for a PG_G mapping is
3652				 * invalidated by pmap_remove_pde().
3653				 */
3654				if ((ptpaddr & PG_G) == 0)
3655					anyvalid = 1;
3656				pmap_remove_pde(pmap, pde, sva, &free, &lock);
3657				continue;
3658			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
3659			    &lock)) {
3660				/* The large page mapping was destroyed. */
3661				continue;
3662			} else
3663				ptpaddr = *pde;
3664		}
3665
3666		/*
3667		 * Limit our scan to either the end of the va represented
3668		 * by the current page table page, or to the end of the
3669		 * range being removed.
3670		 */
3671		if (va_next > eva)
3672			va_next = eva;
3673
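		/*
		 * Batch TLB invalidations for global (PG_G) mappings: "va"
		 * marks the start of a contiguous run of removed mappings
		 * that began with a PG_G mapping; the run is flushed with a
		 * single ranged invalidation when a hole is reached or the
		 * scan of this page table page ends.  Non-global mappings
		 * instead set "anyvalid" and are covered by the
		 * pmap_invalidate_all() performed at the end.
		 */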
3674		va = va_next;
3675		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3676		    sva += PAGE_SIZE) {
3677			if (*pte == 0) {
3678				if (va != va_next) {
3679					pmap_invalidate_range(pmap, va, sva);
3680					va = va_next;
3681				}
3682				continue;
3683			}
3684			if ((*pte & PG_G) == 0)
3685				anyvalid = 1;
3686			else if (va == va_next)
3687				va = sva;
3688			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
3689			    &lock)) {
3690				sva += PAGE_SIZE;
3691				break;
3692			}
3693		}
3694		if (va != va_next)
3695			pmap_invalidate_range(pmap, va, sva);
3696	}
3697	if (lock != NULL)
3698		rw_wunlock(lock);
3699out:
3700	if (anyvalid)
3701		pmap_invalidate_all(pmap);
3702	rw_runlock(&pvh_global_lock);
3703	PMAP_UNLOCK(pmap);
3704	pmap_free_zero_pages(&free);
3705}
3706
3707/*
3708 *	Routine:	pmap_remove_all
3709 *	Function:
3710 *		Removes this physical page from
3711 *		all physical maps in which it resides.
3712 *		Reflects back modify bits to the pager.
3713 *
3714 *	Notes:
3715 *		Original versions of this routine were very
3716 *		inefficient because they iteratively called
3717 *		pmap_remove (slow...)
3718 */
3719
3720void
3721pmap_remove_all(vm_page_t m)
3722{
3723	struct md_page *pvh;
3724	pv_entry_t pv;
3725	pmap_t pmap;
3726	pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
3727	pd_entry_t *pde;
3728	vm_offset_t va;
3729	struct spglist free;
3730
3731	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3732	    ("pmap_remove_all: page %p is not managed", m));
3733	SLIST_INIT(&free);
3734	rw_wlock(&pvh_global_lock);
3735	if ((m->flags & PG_FICTITIOUS) != 0)
3736		goto small_mappings;
3737	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
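	/*
	 * First, demote any 2MB mappings of the page, so that only 4KB
	 * mappings remain on the page's pv list to be removed below.
	 */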
3738	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3739		pmap = PV_PMAP(pv);
3740		PMAP_LOCK(pmap);
3741		va = pv->pv_va;
3742		pde = pmap_pde(pmap, va);
3743		(void)pmap_demote_pde(pmap, pde, va);
3744		PMAP_UNLOCK(pmap);
3745	}
3746small_mappings:
3747	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3748		pmap = PV_PMAP(pv);
3749		PMAP_LOCK(pmap);
3750		PG_A = pmap_accessed_bit(pmap);
3751		PG_M = pmap_modified_bit(pmap);
3752		PG_RW = pmap_rw_bit(pmap);
3753		pmap_resident_count_dec(pmap, 1);
3754		pde = pmap_pde(pmap, pv->pv_va);
3755		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3756		    " a 2mpage in page %p's pv list", m));
3757		pte = pmap_pde_to_pte(pde, pv->pv_va);
3758		tpte = pte_load_clear(pte);
3759		if (tpte & PG_W)
3760			pmap->pm_stats.wired_count--;
3761		if (tpte & PG_A)
3762			vm_page_aflag_set(m, PGA_REFERENCED);
3763
3764		/*
3765		 * Update the vm_page_t clean and reference bits.
3766		 */
3767		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3768			vm_page_dirty(m);
3769		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
3770		pmap_invalidate_page(pmap, pv->pv_va);
3771		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3772		m->md.pv_gen++;
3773		free_pv_entry(pmap, pv);
3774		PMAP_UNLOCK(pmap);
3775	}
3776	vm_page_aflag_clear(m, PGA_WRITEABLE);
3777	rw_wunlock(&pvh_global_lock);
3778	pmap_free_zero_pages(&free);
3779}
3780
3781/*
3782 * pmap_protect_pde: do the things to protect a 2mpage in a process
3783 */
3784static boolean_t
3785pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3786{
3787	pd_entry_t newpde, oldpde;
3788	vm_offset_t eva, va;
3789	vm_page_t m;
3790	boolean_t anychanged;
3791	pt_entry_t PG_G, PG_M, PG_RW;
3792
3793	PG_G = pmap_global_bit(pmap);
3794	PG_M = pmap_modified_bit(pmap);
3795	PG_RW = pmap_rw_bit(pmap);
3796
3797	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3798	KASSERT((sva & PDRMASK) == 0,
3799	    ("pmap_protect_pde: sva is not 2mpage aligned"));
3800	anychanged = FALSE;
3801retry:
3802	oldpde = newpde = *pde;
3803	if (oldpde & PG_MANAGED) {
3804		eva = sva + NBPDR;
3805		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3806		    va < eva; va += PAGE_SIZE, m++)
3807			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3808				vm_page_dirty(m);
3809	}
3810	if ((prot & VM_PROT_WRITE) == 0)
3811		newpde &= ~(PG_RW | PG_M);
3812	if ((prot & VM_PROT_EXECUTE) == 0)
3813		newpde |= pg_nx;
3814	if (newpde != oldpde) {
3815		if (!atomic_cmpset_long(pde, oldpde, newpde))
3816			goto retry;
3817		if (oldpde & PG_G)
3818			pmap_invalidate_page(pmap, sva);
3819		else
3820			anychanged = TRUE;
3821	}
3822	return (anychanged);
3823}
3824
3825/*
3826 *	Set the physical protection on the
3827 *	specified range of this map as requested.
3828 */
3829void
3830pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3831{
3832	vm_offset_t va_next;
3833	pml4_entry_t *pml4e;
3834	pdp_entry_t *pdpe;
3835	pd_entry_t ptpaddr, *pde;
3836	pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
3837	boolean_t anychanged, pv_lists_locked;
3838
3839	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
3840		pmap_remove(pmap, sva, eva);
3841		return;
3842	}
3843
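	/*
	 * When neither write nor execute permission is being removed, there
	 * is nothing to do; pmap_protect() only ever removes permissions.
	 */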
3844	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3845	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3846		return;
3847
3848	PG_G = pmap_global_bit(pmap);
3849	PG_M = pmap_modified_bit(pmap);
3850	PG_V = pmap_valid_bit(pmap);
3851	PG_RW = pmap_rw_bit(pmap);
3852	pv_lists_locked = FALSE;
3853resume:
3854	anychanged = FALSE;
3855
3856	PMAP_LOCK(pmap);
3857	for (; sva < eva; sva = va_next) {
3858
3859		pml4e = pmap_pml4e(pmap, sva);
3860		if ((*pml4e & PG_V) == 0) {
3861			va_next = (sva + NBPML4) & ~PML4MASK;
3862			if (va_next < sva)
3863				va_next = eva;
3864			continue;
3865		}
3866
3867		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3868		if ((*pdpe & PG_V) == 0) {
3869			va_next = (sva + NBPDP) & ~PDPMASK;
3870			if (va_next < sva)
3871				va_next = eva;
3872			continue;
3873		}
3874
3875		va_next = (sva + NBPDR) & ~PDRMASK;
3876		if (va_next < sva)
3877			va_next = eva;
3878
3879		pde = pmap_pdpe_to_pde(pdpe, sva);
3880		ptpaddr = *pde;
3881
3882		/*
3883		 * Weed out invalid mappings.
3884		 */
3885		if (ptpaddr == 0)
3886			continue;
3887
3888		/*
3889		 * Check for large page.
3890		 */
3891		if ((ptpaddr & PG_PS) != 0) {
3892			/*
3893			 * Are we protecting the entire large page?  If not,
3894			 * demote the mapping and fall through.
3895			 */
3896			if (sva + NBPDR == va_next && eva >= va_next) {
3897				/*
3898				 * The TLB entry for a PG_G mapping is
3899				 * invalidated by pmap_protect_pde().
3900				 */
3901				if (pmap_protect_pde(pmap, pde, sva, prot))
3902					anychanged = TRUE;
3903				continue;
3904			} else {
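				/*
				 * Demotion requires the pv lists to be
				 * locked.  If the lock cannot be acquired
				 * without blocking, flush any pending
				 * invalidations, drop the pmap lock, block
				 * on the pv lock, and restart the scan at
				 * the current "sva".
				 */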
3905				if (!pv_lists_locked) {
3906					pv_lists_locked = TRUE;
3907					if (!rw_try_rlock(&pvh_global_lock)) {
3908						if (anychanged)
3909							pmap_invalidate_all(
3910							    pmap);
3911						PMAP_UNLOCK(pmap);
3912						rw_rlock(&pvh_global_lock);
3913						goto resume;
3914					}
3915				}
3916				if (!pmap_demote_pde(pmap, pde, sva)) {
3917					/*
3918					 * The large page mapping was
3919					 * destroyed.
3920					 */
3921					continue;
3922				}
3923			}
3924		}
3925
3926		if (va_next > eva)
3927			va_next = eva;
3928
3929		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3930		    sva += PAGE_SIZE) {
3931			pt_entry_t obits, pbits;
3932			vm_page_t m;
3933
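			/*
			 * Update the PTE with an atomic compare-and-set so
			 * that a concurrent update of PG_A or PG_M is not
			 * lost; retry if the PTE changed underneath us.
			 */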
3934retry:
3935			obits = pbits = *pte;
3936			if ((pbits & PG_V) == 0)
3937				continue;
3938
3939			if ((prot & VM_PROT_WRITE) == 0) {
3940				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3941				    (PG_MANAGED | PG_M | PG_RW)) {
3942					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3943					vm_page_dirty(m);
3944				}
3945				pbits &= ~(PG_RW | PG_M);
3946			}
3947			if ((prot & VM_PROT_EXECUTE) == 0)
3948				pbits |= pg_nx;
3949
3950			if (pbits != obits) {
3951				if (!atomic_cmpset_long(pte, obits, pbits))
3952					goto retry;
3953				if (obits & PG_G)
3954					pmap_invalidate_page(pmap, sva);
3955				else
3956					anychanged = TRUE;
3957			}
3958		}
3959	}
3960	if (anychanged)
3961		pmap_invalidate_all(pmap);
3962	if (pv_lists_locked)
3963		rw_runlock(&pvh_global_lock);
3964	PMAP_UNLOCK(pmap);
3965}
3966
3967/*
3968 * Tries to promote the 512, contiguous 4KB page mappings that are within a
3969 * single page table page (PTP) to a single 2MB page mapping.  For promotion
3970 * to occur, two conditions must be met: (1) the 4KB page mappings must map
3971 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
3972 * identical characteristics.
3973 */
3974static void
3975pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3976    struct rwlock **lockp)
3977{
3978	pd_entry_t newpde;
3979	pt_entry_t *firstpte, oldpte, pa, *pte;
3980	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
3981	vm_offset_t oldpteva;
3982	vm_page_t mpte;
3983	int PG_PTE_CACHE;
3984
3985	PG_A = pmap_accessed_bit(pmap);
3986	PG_G = pmap_global_bit(pmap);
3987	PG_M = pmap_modified_bit(pmap);
3988	PG_V = pmap_valid_bit(pmap);
3989	PG_RW = pmap_rw_bit(pmap);
3990	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
3991
3992	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3993
3994	/*
3995	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3996	 * either invalid, unused, or does not map the first 4KB physical page
3997	 * within a 2MB page.
3998	 */
3999	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
4000setpde:
4001	newpde = *firstpte;
4002	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
4003		atomic_add_long(&pmap_pde_p_failures, 1);
4004		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4005		    " in pmap %p", va, pmap);
4006		return;
4007	}
4008	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
4009		/*
4010		 * When PG_M is already clear, PG_RW can be cleared without
4011		 * a TLB invalidation.
4012		 */
4013		if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
4014			goto setpde;
4015		newpde &= ~PG_RW;
4016	}
4017
4018	/*
4019	 * Examine each of the other PTEs in the specified PTP.  Abort if this
4020	 * PTE maps an unexpected 4KB physical page or does not have identical
4021	 * characteristics to the first PTE.
4022	 */
4023	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
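	/*
	 * "pa" holds the expected contents of each successive PTE when
	 * scanning downward from the last 4KB page of the 2MB run: the
	 * physical frame decreases by PAGE_SIZE on each iteration, and PG_A
	 * and PG_V must remain set.
	 */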
4024	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
4025setpte:
4026		oldpte = *pte;
4027		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
4028			atomic_add_long(&pmap_pde_p_failures, 1);
4029			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4030			    " in pmap %p", va, pmap);
4031			return;
4032		}
4033		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
4034			/*
4035			 * When PG_M is already clear, PG_RW can be cleared
4036			 * without a TLB invalidation.
4037			 */
4038			if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
4039				goto setpte;
4040			oldpte &= ~PG_RW;
4041			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
4042			    (va & ~PDRMASK);
4043			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
4044			    " in pmap %p", oldpteva, pmap);
4045		}
4046		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
4047			atomic_add_long(&pmap_pde_p_failures, 1);
4048			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4049			    " in pmap %p", va, pmap);
4050			return;
4051		}
4052		pa -= PAGE_SIZE;
4053	}
4054
4055	/*
4056	 * Save the page table page in its current state until the PDE
4057	 * mapping the superpage is demoted by pmap_demote_pde() or
4058	 * destroyed by pmap_remove_pde().
4059	 */
4060	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4061	KASSERT(mpte >= vm_page_array &&
4062	    mpte < &vm_page_array[vm_page_array_size],
4063	    ("pmap_promote_pde: page table page is out of range"));
4064	KASSERT(mpte->pindex == pmap_pde_pindex(va),
4065	    ("pmap_promote_pde: page table page's pindex is wrong"));
4066	if (pmap_insert_pt_page(pmap, mpte)) {
4067		atomic_add_long(&pmap_pde_p_failures, 1);
4068		CTR2(KTR_PMAP,
4069		    "pmap_promote_pde: failure for va %#lx in pmap %p", va,
4070		    pmap);
4071		return;
4072	}
4073
4074	/*
4075	 * Promote the pv entries.
4076	 */
4077	if ((newpde & PG_MANAGED) != 0)
4078		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
4079
4080	/*
4081	 * Propagate the PAT index to its proper position.
4082	 */
4083	newpde = pmap_swap_pat(pmap, newpde);
4084
4085	/*
4086	 * Map the superpage.
4087	 */
4088	if (workaround_erratum383)
4089		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
4090	else
4091		pde_store(pde, PG_PS | newpde);
4092
4093	atomic_add_long(&pmap_pde_promotions, 1);
4094	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
4095	    " in pmap %p", va, pmap);
4096}
4097
4098/*
 *	Insert the given physical page (m) at
 *	the specified virtual address (va) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte cannot be reclaimed.
4105 *
4106 *	NB:  This is the only routine which MAY NOT lazy-evaluate
4107 *	or lose information.  That is, this routine must actually
4108 *	insert this page into the given map NOW.
4109 */
4110void
4111pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
4112    vm_prot_t prot, boolean_t wired)
4113{
4114	struct rwlock *lock;
4115	pd_entry_t *pde;
4116	pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
4117	pt_entry_t newpte, origpte;
4118	pv_entry_t pv;
4119	vm_paddr_t opa, pa;
4120	vm_page_t mpte, om;
4121
4122	PG_A = pmap_accessed_bit(pmap);
4123	PG_G = pmap_global_bit(pmap);
4124	PG_M = pmap_modified_bit(pmap);
4125	PG_V = pmap_valid_bit(pmap);
4126	PG_RW = pmap_rw_bit(pmap);
4127
4128	va = trunc_page(va);
4129	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
4130	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
4131	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
4132	    va));
4133	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
4134	    va >= kmi.clean_eva,
4135	    ("pmap_enter: managed mapping within the clean submap"));
4136	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
4137		VM_OBJECT_ASSERT_WLOCKED(m->object);
4138	pa = VM_PAGE_TO_PHYS(m);
4139	newpte = (pt_entry_t)(pa | PG_A | PG_V);
4140	if ((access & VM_PROT_WRITE) != 0)
4141		newpte |= PG_M;
4142	if ((prot & VM_PROT_WRITE) != 0)
4143		newpte |= PG_RW;
4144	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
4145	    ("pmap_enter: access includes VM_PROT_WRITE but prot doesn't"));
4146	if ((prot & VM_PROT_EXECUTE) == 0)
4147		newpte |= pg_nx;
4148	if (wired)
4149		newpte |= PG_W;
4150	if (va < VM_MAXUSER_ADDRESS)
4151		newpte |= PG_U;
4152	if (pmap == kernel_pmap)
4153		newpte |= PG_G;
4154	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
4155
4156	/*
4157	 * Set modified bit gratuitously for writeable mappings if
4158	 * the page is unmanaged. We do not want to take a fault
4159	 * to do the dirty bit accounting for these mappings.
4160	 */
4161	if ((m->oflags & VPO_UNMANAGED) != 0) {
4162		if ((newpte & PG_RW) != 0)
4163			newpte |= PG_M;
4164	}
4165
4166	mpte = NULL;
4167
4168	lock = NULL;
4169	rw_rlock(&pvh_global_lock);
4170	PMAP_LOCK(pmap);
4171
4172	/*
	 * If the necessary page table page is not resident, it is created
	 * here.
4175	 */
4176retry:
4177	pde = pmap_pde(pmap, va);
4178	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
4179	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
4180		pte = pmap_pde_to_pte(pde, va);
4181		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
4182			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4183			mpte->wire_count++;
4184		}
4185	} else if (va < VM_MAXUSER_ADDRESS) {
4186		/*
4187		 * Here if the pte page isn't mapped, or if it has been
4188		 * deallocated.
4189		 */
4190		mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), &lock);
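		/*
		 * Re-evaluate the page directory entry; it may have changed
		 * while the page table page was being allocated.
		 */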
4191		goto retry;
4192	} else
4193		panic("pmap_enter: invalid page directory va=%#lx", va);
4194
4195	origpte = *pte;
4196
4197	/*
4198	 * Is the specified virtual address already mapped?
4199	 */
4200	if ((origpte & PG_V) != 0) {
4201		/*
4202		 * Wiring change, just update stats. We don't worry about
4203		 * wiring PT pages as they remain resident as long as there
4204		 * are valid mappings in them. Hence, if a user page is wired,
4205		 * the PT page will be also.
4206		 */
4207		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
4208			pmap->pm_stats.wired_count++;
4209		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
4210			pmap->pm_stats.wired_count--;
4211
4212		/*
4213		 * Remove the extra PT page reference.
4214		 */
4215		if (mpte != NULL) {
4216			mpte->wire_count--;
4217			KASSERT(mpte->wire_count > 0,
4218			    ("pmap_enter: missing reference to page table page,"
4219			     " va: 0x%lx", va));
4220		}
4221
4222		/*
4223		 * Has the physical page changed?
4224		 */
4225		opa = origpte & PG_FRAME;
4226		if (opa == pa) {
4227			/*
4228			 * No, might be a protection or wiring change.
4229			 */
4230			if ((origpte & PG_MANAGED) != 0) {
4231				newpte |= PG_MANAGED;
4232				if ((newpte & PG_RW) != 0)
4233					vm_page_aflag_set(m, PGA_WRITEABLE);
4234			}
4235			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
4236				goto unchanged;
4237			goto validate;
4238		}
4239	} else {
4240		/*
4241		 * Increment the counters.
4242		 */
4243		if ((newpte & PG_W) != 0)
4244			pmap->pm_stats.wired_count++;
4245		pmap_resident_count_inc(pmap, 1);
4246	}
4247
4248	/*
4249	 * Enter on the PV list if part of our managed memory.
4250	 */
4251	if ((m->oflags & VPO_UNMANAGED) == 0) {
4252		newpte |= PG_MANAGED;
4253		pv = get_pv_entry(pmap, &lock);
4254		pv->pv_va = va;
4255		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
4256		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4257		m->md.pv_gen++;
4258		if ((newpte & PG_RW) != 0)
4259			vm_page_aflag_set(m, PGA_WRITEABLE);
4260	}
4261
4262	/*
4263	 * Update the PTE.
4264	 */
4265	if ((origpte & PG_V) != 0) {
4266validate:
4267		origpte = pte_load_store(pte, newpte);
4268		opa = origpte & PG_FRAME;
4269		if (opa != pa) {
4270			if ((origpte & PG_MANAGED) != 0) {
4271				om = PHYS_TO_VM_PAGE(opa);
4272				if ((origpte & (PG_M | PG_RW)) == (PG_M |
4273				    PG_RW))
4274					vm_page_dirty(om);
4275				if ((origpte & PG_A) != 0)
4276					vm_page_aflag_set(om, PGA_REFERENCED);
4277				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
4278				pmap_pvh_free(&om->md, pmap, va);
4279				if ((om->aflags & PGA_WRITEABLE) != 0 &&
4280				    TAILQ_EMPTY(&om->md.pv_list) &&
4281				    ((om->flags & PG_FICTITIOUS) != 0 ||
4282				    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
4283					vm_page_aflag_clear(om, PGA_WRITEABLE);
4284			}
4285		} else if ((newpte & PG_M) == 0 && (origpte & (PG_M |
4286		    PG_RW)) == (PG_M | PG_RW)) {
4287			if ((origpte & PG_MANAGED) != 0)
4288				vm_page_dirty(m);
4289
4290			/*
4291			 * Although the PTE may still have PG_RW set, TLB
4292			 * invalidation may nonetheless be required because
4293			 * the PTE no longer has PG_M set.
4294			 */
4295		} else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
4296			/*
4297			 * This PTE change does not require TLB invalidation.
4298			 */
4299			goto unchanged;
4300		}
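		/*
		 * If the old mapping was never accessed (PG_A is clear),
		 * then the TLB cannot hold a stale entry for it, and no
		 * invalidation is required.
		 */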
4301		if ((origpte & PG_A) != 0)
4302			pmap_invalidate_page(pmap, va);
4303	} else
4304		pte_store(pte, newpte);
4305
4306unchanged:
4307
4308	/*
4309	 * If both the page table page and the reservation are fully
4310	 * populated, then attempt promotion.
4311	 */
4312	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
4313	    pmap_ps_enabled(pmap) &&
4314	    (m->flags & PG_FICTITIOUS) == 0 &&
4315	    vm_reserv_level_iffullpop(m) == 0)
4316		pmap_promote_pde(pmap, pde, va, &lock);
4317
4318	if (lock != NULL)
4319		rw_wunlock(lock);
4320	rw_runlock(&pvh_global_lock);
4321	PMAP_UNLOCK(pmap);
4322}
4323
4324/*
4325 * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
4326 * otherwise.  Fails if (1) a page table page cannot be allocated without
4327 * blocking, (2) a mapping already exists at the specified virtual address, or
4328 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
4329 */
4330static boolean_t
4331pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4332    struct rwlock **lockp)
4333{
4334	pd_entry_t *pde, newpde;
4335	pt_entry_t PG_V;
4336	vm_page_t mpde;
4337	struct spglist free;
4338
4339	PG_V = pmap_valid_bit(pmap);
4340	rw_assert(&pvh_global_lock, RA_LOCKED);
4341	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4342
4343	if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
4344		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4345		    " in pmap %p", va, pmap);
4346		return (FALSE);
4347	}
4348	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
4349	pde = &pde[pmap_pde_index(va)];
4350	if ((*pde & PG_V) != 0) {
4351		KASSERT(mpde->wire_count > 1,
4352		    ("pmap_enter_pde: mpde's wire count is too low"));
4353		mpde->wire_count--;
4354		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4355		    " in pmap %p", va, pmap);
4356		return (FALSE);
4357	}
4358	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
4359	    PG_PS | PG_V;
4360	if ((m->oflags & VPO_UNMANAGED) == 0) {
4361		newpde |= PG_MANAGED;
4362
4363		/*
4364		 * Abort this mapping if its PV entry could not be created.
4365		 */
4366		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
4367		    lockp)) {
4368			SLIST_INIT(&free);
4369			if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
4370				pmap_invalidate_page(pmap, va);
4371				pmap_free_zero_pages(&free);
4372			}
4373			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4374			    " in pmap %p", va, pmap);
4375			return (FALSE);
4376		}
4377	}
4378	if ((prot & VM_PROT_EXECUTE) == 0)
4379		newpde |= pg_nx;
4380	if (va < VM_MAXUSER_ADDRESS)
4381		newpde |= PG_U;
4382
4383	/*
4384	 * Increment counters.
4385	 */
4386	pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4387
4388	/*
4389	 * Map the superpage.
4390	 */
4391	pde_store(pde, newpde);
4392
4393	atomic_add_long(&pmap_pde_mappings, 1);
4394	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
4395	    " in pmap %p", va, pmap);
4396	return (TRUE);
4397}
4398
4399/*
4400 * Maps a sequence of resident pages belonging to the same object.
4401 * The sequence begins with the given page m_start.  This page is
4402 * mapped at the given virtual address start.  Each subsequent page is
4403 * mapped at a virtual address that is offset from start by the same
4404 * amount as the page is offset from m_start within the object.  The
4405 * last page in the sequence is the page with the largest offset from
4406 * m_start that can be mapped at a virtual address less than the given
4407 * virtual address end.  Not every virtual page between start and end
4408 * is mapped; only those for which a resident page exists with the
4409 * corresponding offset from m_start are mapped.
4410 */
4411void
4412pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4413    vm_page_t m_start, vm_prot_t prot)
4414{
4415	struct rwlock *lock;
4416	vm_offset_t va;
4417	vm_page_t m, mpte;
4418	vm_pindex_t diff, psize;
4419
4420	VM_OBJECT_ASSERT_LOCKED(m_start->object);
4421
4422	psize = atop(end - start);
4423	mpte = NULL;
4424	m = m_start;
4425	lock = NULL;
4426	rw_rlock(&pvh_global_lock);
4427	PMAP_LOCK(pmap);
4428	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4429		va = start + ptoa(diff);
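		/*
		 * Attempt a 2MB page mapping if "va" and the physical
		 * address are both 2MB aligned, an entire 2MB fits below
		 * "end", superpages are enabled, and the reservation is
		 * fully populated; otherwise, fall back to a 4KB mapping.
		 */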
4430		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
4431		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
4432		    pmap_ps_enabled(pmap) &&
4433		    vm_reserv_level_iffullpop(m) == 0 &&
4434		    pmap_enter_pde(pmap, va, m, prot, &lock))
4435			m = &m[NBPDR / PAGE_SIZE - 1];
4436		else
4437			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
4438			    mpte, &lock);
4439		m = TAILQ_NEXT(m, listq);
4440	}
4441	if (lock != NULL)
4442		rw_wunlock(lock);
4443	rw_runlock(&pvh_global_lock);
4444	PMAP_UNLOCK(pmap);
4445}
4446
/*
 * This code makes some *MAJOR* assumptions:
 * 1. The current pmap and the target pmap exist.
 * 2. The mapping is not wired.
 * 3. Only read access is required.
 * 4. No page table pages.
 * It is, however, *MUCH* faster than pmap_enter...
 */
4455
4456void
4457pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4458{
4459	struct rwlock *lock;
4460
4461	lock = NULL;
4462	rw_rlock(&pvh_global_lock);
4463	PMAP_LOCK(pmap);
4464	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
4465	if (lock != NULL)
4466		rw_wunlock(lock);
4467	rw_runlock(&pvh_global_lock);
4468	PMAP_UNLOCK(pmap);
4469}
4470
4471static vm_page_t
4472pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
4473    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
4474{
4475	struct spglist free;
4476	pt_entry_t *pte, PG_V;
4477	vm_paddr_t pa;
4478
4479	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
4480	    (m->oflags & VPO_UNMANAGED) != 0,
4481	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
4482	PG_V = pmap_valid_bit(pmap);
4483	rw_assert(&pvh_global_lock, RA_LOCKED);
4484	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4485
4486	/*
	 * If the necessary page table page is not resident, it is created
	 * here.
4489	 */
4490	if (va < VM_MAXUSER_ADDRESS) {
4491		vm_pindex_t ptepindex;
4492		pd_entry_t *ptepa;
4493
4494		/*
4495		 * Calculate pagetable page index
4496		 */
4497		ptepindex = pmap_pde_pindex(va);
4498		if (mpte && (mpte->pindex == ptepindex)) {
4499			mpte->wire_count++;
4500		} else {
4501			/*
4502			 * Get the page directory entry
4503			 */
4504			ptepa = pmap_pde(pmap, va);
4505
4506			/*
4507			 * If the page table page is mapped, we just increment
			 * the wire count, and activate it.  Otherwise, we
4509			 * attempt to allocate a page table page.  If this
4510			 * attempt fails, we don't retry.  Instead, we give up.
4511			 */
4512			if (ptepa && (*ptepa & PG_V) != 0) {
4513				if (*ptepa & PG_PS)
4514					return (NULL);
4515				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
4516				mpte->wire_count++;
4517			} else {
4518				/*
4519				 * Pass NULL instead of the PV list lock
4520				 * pointer, because we don't intend to sleep.
4521				 */
4522				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
4523				if (mpte == NULL)
4524					return (mpte);
4525			}
4526		}
4527		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
4528		pte = &pte[pmap_pte_index(va)];
4529	} else {
4530		mpte = NULL;
4531		pte = vtopte(va);
4532	}
4533	if (*pte) {
4534		if (mpte != NULL) {
4535			mpte->wire_count--;
4536			mpte = NULL;
4537		}
4538		return (mpte);
4539	}
4540
4541	/*
4542	 * Enter on the PV list if part of our managed memory.
4543	 */
4544	if ((m->oflags & VPO_UNMANAGED) == 0 &&
4545	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
4546		if (mpte != NULL) {
4547			SLIST_INIT(&free);
4548			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
4549				pmap_invalidate_page(pmap, va);
4550				pmap_free_zero_pages(&free);
4551			}
4552			mpte = NULL;
4553		}
4554		return (mpte);
4555	}
4556
4557	/*
4558	 * Increment counters
4559	 */
4560	pmap_resident_count_inc(pmap, 1);
4561
4562	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
4563	if ((prot & VM_PROT_EXECUTE) == 0)
4564		pa |= pg_nx;
4565
4566	/*
4567	 * Now validate mapping with RO protection
4568	 */
4569	if ((m->oflags & VPO_UNMANAGED) != 0)
4570		pte_store(pte, pa | PG_V | PG_U);
4571	else
4572		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
4573	return (mpte);
4574}
4575
4576/*
4577 * Make a temporary mapping for a physical address.  This is only intended
4578 * to be used for panic dumps.
4579 */
4580void *
4581pmap_kenter_temporary(vm_paddr_t pa, int i)
4582{
4583	vm_offset_t va;
4584
4585	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
4586	pmap_kenter(va, pa);
4587	invlpg(va);
4588	return ((void *)crashdumpmap);
4589}
4590
4591/*
4592 * This code maps large physical mmap regions into the
 * This code maps large physical mmap regions of device and SG objects into
 * the processor address space.  It takes some shortcuts: only regions that
 * are 2MB aligned and a multiple of 2MB in size are mapped, and only 2MB
 * page mappings are created.
4596void
4597pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
4598    vm_pindex_t pindex, vm_size_t size)
4599{
4600	pd_entry_t *pde;
4601	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
4602	vm_paddr_t pa, ptepa;
4603	vm_page_t p, pdpg;
4604	int pat_mode;
4605
4606	PG_A = pmap_accessed_bit(pmap);
4607	PG_M = pmap_modified_bit(pmap);
4608	PG_V = pmap_valid_bit(pmap);
4609	PG_RW = pmap_rw_bit(pmap);
4610
4611	VM_OBJECT_ASSERT_WLOCKED(object);
4612	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4613	    ("pmap_object_init_pt: non-device object"));
4614	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
4615		if (!pmap_ps_enabled(pmap))
4616			return;
4617		if (!vm_object_populate(object, pindex, pindex + atop(size)))
4618			return;
4619		p = vm_page_lookup(object, pindex);
4620		KASSERT(p->valid == VM_PAGE_BITS_ALL,
4621		    ("pmap_object_init_pt: invalid page %p", p));
4622		pat_mode = p->md.pat_mode;
4623
4624		/*
4625		 * Abort the mapping if the first page is not physically
4626		 * aligned to a 2MB page boundary.
4627		 */
4628		ptepa = VM_PAGE_TO_PHYS(p);
4629		if (ptepa & (NBPDR - 1))
4630			return;
4631
4632		/*
4633		 * Skip the first page.  Abort the mapping if the rest of
4634		 * the pages are not physically contiguous or have differing
4635		 * memory attributes.
4636		 */
4637		p = TAILQ_NEXT(p, listq);
4638		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
4639		    pa += PAGE_SIZE) {
4640			KASSERT(p->valid == VM_PAGE_BITS_ALL,
4641			    ("pmap_object_init_pt: invalid page %p", p));
4642			if (pa != VM_PAGE_TO_PHYS(p) ||
4643			    pat_mode != p->md.pat_mode)
4644				return;
4645			p = TAILQ_NEXT(p, listq);
4646		}
4647
4648		/*
4649		 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
4650		 * "size" is a multiple of 2M, adding the PAT setting to "pa"
4651		 * will not affect the termination of this loop.
4652		 */
4653		PMAP_LOCK(pmap);
4654		for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
4655		    pa < ptepa + size; pa += NBPDR) {
4656			pdpg = pmap_allocpde(pmap, addr, NULL);
4657			if (pdpg == NULL) {
4658				/*
4659				 * The creation of mappings below is only an
4660				 * optimization.  If a page directory page
4661				 * cannot be allocated without blocking,
4662				 * continue on to the next mapping rather than
4663				 * blocking.
4664				 */
4665				addr += NBPDR;
4666				continue;
4667			}
4668			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
4669			pde = &pde[pmap_pde_index(addr)];
4670			if ((*pde & PG_V) == 0) {
4671				pde_store(pde, pa | PG_PS | PG_M | PG_A |
4672				    PG_U | PG_RW | PG_V);
4673				pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4674				atomic_add_long(&pmap_pde_mappings, 1);
4675			} else {
4676				/* Continue on if the PDE is already valid. */
4677				pdpg->wire_count--;
4678				KASSERT(pdpg->wire_count > 0,
4679				    ("pmap_object_init_pt: missing reference "
4680				    "to page directory page, va: 0x%lx", addr));
4681			}
4682			addr += NBPDR;
4683		}
4684		PMAP_UNLOCK(pmap);
4685	}
4686}
4687
4688/*
4689 *	Routine:	pmap_change_wiring
4690 *	Function:	Change the wiring attribute for a map/virtual-address
4691 *			pair.
4692 *	In/out conditions:
4693 *			The mapping must already exist in the pmap.
4694 */
4695void
4696pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
4697{
4698	pd_entry_t *pde;
4699	pt_entry_t *pte;
4700	boolean_t pv_lists_locked;
4701
4702	pv_lists_locked = FALSE;
4703
4704	/*
4705	 * Wiring is not a hardware characteristic so there is no need to
4706	 * invalidate TLB.
4707	 */
4708retry:
4709	PMAP_LOCK(pmap);
4710	pde = pmap_pde(pmap, va);
4711	if ((*pde & PG_PS) != 0) {
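		/*
		 * A change to the wiring of a 2MB mapping requires that the
		 * mapping first be demoted, so that the change can be
		 * applied to the individual 4KB mapping below.
		 */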
4712		if (!wired != ((*pde & PG_W) == 0)) {
4713			if (!pv_lists_locked) {
4714				pv_lists_locked = TRUE;
4715				if (!rw_try_rlock(&pvh_global_lock)) {
4716					PMAP_UNLOCK(pmap);
4717					rw_rlock(&pvh_global_lock);
4718					goto retry;
4719				}
4720			}
4721			if (!pmap_demote_pde(pmap, pde, va))
4722				panic("pmap_change_wiring: demotion failed");
4723		} else
4724			goto out;
4725	}
4726	pte = pmap_pde_to_pte(pde, va);
4727	if (wired && (*pte & PG_W) == 0) {
4728		pmap->pm_stats.wired_count++;
4729		atomic_set_long(pte, PG_W);
4730	} else if (!wired && (*pte & PG_W) != 0) {
4731		pmap->pm_stats.wired_count--;
4732		atomic_clear_long(pte, PG_W);
4733	}
4734out:
4735	if (pv_lists_locked)
4736		rw_runlock(&pvh_global_lock);
4737	PMAP_UNLOCK(pmap);
4738}
4739
4740/*
4741 *	Copy the range specified by src_addr/len
4742 *	from the source map to the range dst_addr/len
4743 *	in the destination map.
4744 *
4745 *	This routine is only advisory and need not do anything.
4746 */
4747
4748void
4749pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4750    vm_offset_t src_addr)
4751{
4752	struct rwlock *lock;
4753	struct spglist free;
4754	vm_offset_t addr;
4755	vm_offset_t end_addr = src_addr + len;
4756	vm_offset_t va_next;
4757	pt_entry_t PG_A, PG_M, PG_V;
4758
4759	if (dst_addr != src_addr)
4760		return;
4761
4762	if (dst_pmap->pm_type != src_pmap->pm_type)
4763		return;
4764
4765	/*
4766	 * EPT page table entries that require emulation of A/D bits are
4767	 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
4768	 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
4769	 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
4770	 * implementations flag an EPT misconfiguration for exec-only
4771	 * mappings we skip this function entirely for emulated pmaps.
4772	 */
4773	if (pmap_emulate_ad_bits(dst_pmap))
4774		return;
4775
4776	lock = NULL;
4777	rw_rlock(&pvh_global_lock);
4778	if (dst_pmap < src_pmap) {
4779		PMAP_LOCK(dst_pmap);
4780		PMAP_LOCK(src_pmap);
4781	} else {
4782		PMAP_LOCK(src_pmap);
4783		PMAP_LOCK(dst_pmap);
4784	}
4785
4786	PG_A = pmap_accessed_bit(dst_pmap);
4787	PG_M = pmap_modified_bit(dst_pmap);
4788	PG_V = pmap_valid_bit(dst_pmap);
4789
4790	for (addr = src_addr; addr < end_addr; addr = va_next) {
4791		pt_entry_t *src_pte, *dst_pte;
4792		vm_page_t dstmpde, dstmpte, srcmpte;
4793		pml4_entry_t *pml4e;
4794		pdp_entry_t *pdpe;
4795		pd_entry_t srcptepaddr, *pde;
4796
4797		KASSERT(addr < UPT_MIN_ADDRESS,
4798		    ("pmap_copy: invalid to pmap_copy page tables"));
4799
4800		pml4e = pmap_pml4e(src_pmap, addr);
4801		if ((*pml4e & PG_V) == 0) {
4802			va_next = (addr + NBPML4) & ~PML4MASK;
4803			if (va_next < addr)
4804				va_next = end_addr;
4805			continue;
4806		}
4807
4808		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
4809		if ((*pdpe & PG_V) == 0) {
4810			va_next = (addr + NBPDP) & ~PDPMASK;
4811			if (va_next < addr)
4812				va_next = end_addr;
4813			continue;
4814		}
4815
4816		va_next = (addr + NBPDR) & ~PDRMASK;
4817		if (va_next < addr)
4818			va_next = end_addr;
4819
4820		pde = pmap_pdpe_to_pde(pdpe, addr);
4821		srcptepaddr = *pde;
4822		if (srcptepaddr == 0)
4823			continue;
4824
4825		if (srcptepaddr & PG_PS) {
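			/*
			 * Copy the 2MB mapping only if it covers an aligned
			 * 2MB portion of the address range and, for a
			 * managed mapping, only if a pv entry can be
			 * allocated.  The wired bit is not copied.
			 */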
4826			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
4827				continue;
4828			dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
4829			if (dstmpde == NULL)
4830				break;
4831			pde = (pd_entry_t *)
4832			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
4833			pde = &pde[pmap_pde_index(addr)];
4834			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
4835			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4836			    PG_PS_FRAME, &lock))) {
4837				*pde = srcptepaddr & ~PG_W;
4838				pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
4839			} else
4840				dstmpde->wire_count--;
4841			continue;
4842		}
4843
4844		srcptepaddr &= PG_FRAME;
4845		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
4846		KASSERT(srcmpte->wire_count > 0,
4847		    ("pmap_copy: source page table page is unused"));
4848
4849		if (va_next > end_addr)
4850			va_next = end_addr;
4851
4852		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
4853		src_pte = &src_pte[pmap_pte_index(addr)];
4854		dstmpte = NULL;
4855		while (addr < va_next) {
4856			pt_entry_t ptetemp;
4857			ptetemp = *src_pte;
4858			/*
			 * Only mappings of managed pages are copied.
4860			 */
4861			if ((ptetemp & PG_MANAGED) != 0) {
4862				if (dstmpte != NULL &&
4863				    dstmpte->pindex == pmap_pde_pindex(addr))
4864					dstmpte->wire_count++;
4865				else if ((dstmpte = pmap_allocpte(dst_pmap,
4866				    addr, NULL)) == NULL)
4867					goto out;
4868				dst_pte = (pt_entry_t *)
4869				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
4870				dst_pte = &dst_pte[pmap_pte_index(addr)];
4871				if (*dst_pte == 0 &&
4872				    pmap_try_insert_pv_entry(dst_pmap, addr,
4873				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
4874				    &lock)) {
4875					/*
4876					 * Clear the wired, modified, and
4877					 * accessed (referenced) bits
4878					 * during the copy.
4879					 */
4880					*dst_pte = ptetemp & ~(PG_W | PG_M |
4881					    PG_A);
4882					pmap_resident_count_inc(dst_pmap, 1);
4883				} else {
4884					SLIST_INIT(&free);
4885					if (pmap_unwire_ptp(dst_pmap, addr,
4886					    dstmpte, &free)) {
4887						pmap_invalidate_page(dst_pmap,
4888						    addr);
4889						pmap_free_zero_pages(&free);
4890					}
4891					goto out;
4892				}
4893				if (dstmpte->wire_count >= srcmpte->wire_count)
4894					break;
4895			}
4896			addr += PAGE_SIZE;
4897			src_pte++;
4898		}
4899	}
4900out:
4901	if (lock != NULL)
4902		rw_wunlock(lock);
4903	rw_runlock(&pvh_global_lock);
4904	PMAP_UNLOCK(src_pmap);
4905	PMAP_UNLOCK(dst_pmap);
4906}
4907
4908/*
 *	pmap_zero_page zeros the specified hardware page by accessing it
 *	through the direct map.
4911 */
4912void
4913pmap_zero_page(vm_page_t m)
4914{
4915	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4916
4917	pagezero((void *)va);
4918}
4919
4920/*
 *	pmap_zero_page_area zeros the specified portion of a hardware page
 *	by accessing it through the direct map.
4923 *
4924 *	off and size may not cover an area beyond a single hardware page.
4925 */
4926void
4927pmap_zero_page_area(vm_page_t m, int off, int size)
4928{
4929	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4930
4931	if (off == 0 && size == PAGE_SIZE)
4932		pagezero((void *)va);
4933	else
4934		bzero((char *)va + off, size);
4935}
4936
4937/*
 *	pmap_zero_page_idle zeros the specified hardware page by accessing
 *	it through the direct map.  This is intended to be called from the
 *	vm_pagezero process only and outside of Giant.
4942 */
4943void
4944pmap_zero_page_idle(vm_page_t m)
4945{
4946	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4947
4948	pagezero((void *)va);
4949}
4950
4951/*
 *	pmap_copy_page copies the specified (machine independent) page by
 *	accessing both the source and the destination through the direct
 *	map.
4956 */
4957void
4958pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
4959{
4960	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
4961	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
4962
4963	pagecopy((void *)src, (void *)dst);
4964}
4965
4966int unmapped_buf_allowed = 1;
4967
4968void
4969pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4970    vm_offset_t b_offset, int xfersize)
4971{
4972	void *a_cp, *b_cp;
4973	vm_offset_t a_pg_offset, b_pg_offset;
4974	int cnt;
4975
4976	while (xfersize > 0) {
4977		a_pg_offset = a_offset & PAGE_MASK;
4978		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4979		a_cp = (char *)PHYS_TO_DMAP(ma[a_offset >> PAGE_SHIFT]->
4980		    phys_addr) + a_pg_offset;
4981		b_pg_offset = b_offset & PAGE_MASK;
4982		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4983		b_cp = (char *)PHYS_TO_DMAP(mb[b_offset >> PAGE_SHIFT]->
4984		    phys_addr) + b_pg_offset;
4985		bcopy(a_cp, b_cp, cnt);
4986		a_offset += cnt;
4987		b_offset += cnt;
4988		xfersize -= cnt;
4989	}
4990}
4991
4992/*
4993 * Returns true if the pmap's pv is one of the first
4994 * 16 pvs linked to from this page.  This count may
4995 * be changed upwards or downwards in the future; it
4996 * is only necessary that true be returned for a small
4997 * subset of pmaps for proper page aging.
4998 */
4999boolean_t
5000pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
5001{
5002	struct md_page *pvh;
5003	struct rwlock *lock;
5004	pv_entry_t pv;
5005	int loops = 0;
5006	boolean_t rv;
5007
5008	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5009	    ("pmap_page_exists_quick: page %p is not managed", m));
5010	rv = FALSE;
5011	rw_rlock(&pvh_global_lock);
5012	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5013	rw_rlock(lock);
5014	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5015		if (PV_PMAP(pv) == pmap) {
5016			rv = TRUE;
5017			break;
5018		}
5019		loops++;
5020		if (loops >= 16)
5021			break;
5022	}
5023	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5024		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5025		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5026			if (PV_PMAP(pv) == pmap) {
5027				rv = TRUE;
5028				break;
5029			}
5030			loops++;
5031			if (loops >= 16)
5032				break;
5033		}
5034	}
5035	rw_runlock(lock);
5036	rw_runlock(&pvh_global_lock);
5037	return (rv);
5038}
5039
5040/*
5041 *	pmap_page_wired_mappings:
5042 *
5043 *	Return the number of managed mappings to the given physical page
5044 *	that are wired.
5045 */
5046int
5047pmap_page_wired_mappings(vm_page_t m)
5048{
5049	struct rwlock *lock;
5050	struct md_page *pvh;
5051	pmap_t pmap;
5052	pt_entry_t *pte;
5053	pv_entry_t pv;
5054	int count, md_gen, pvh_gen;
5055
5056	if ((m->oflags & VPO_UNMANAGED) != 0)
5057		return (0);
5058	rw_rlock(&pvh_global_lock);
5059	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5060	rw_rlock(lock);
5061restart:
5062	count = 0;
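	/*
	 * If a pmap's lock cannot be acquired without blocking, drop the pv
	 * list lock, block on the pmap lock, and then recheck the pv list
	 * generation count; if the list changed while unlocked, restart the
	 * scan.
	 */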
5063	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5064		pmap = PV_PMAP(pv);
5065		if (!PMAP_TRYLOCK(pmap)) {
5066			md_gen = m->md.pv_gen;
5067			rw_runlock(lock);
5068			PMAP_LOCK(pmap);
5069			rw_rlock(lock);
5070			if (md_gen != m->md.pv_gen) {
5071				PMAP_UNLOCK(pmap);
5072				goto restart;
5073			}
5074		}
5075		pte = pmap_pte(pmap, pv->pv_va);
5076		if ((*pte & PG_W) != 0)
5077			count++;
5078		PMAP_UNLOCK(pmap);
5079	}
5080	if ((m->flags & PG_FICTITIOUS) == 0) {
5081		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5082		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5083			pmap = PV_PMAP(pv);
5084			if (!PMAP_TRYLOCK(pmap)) {
5085				md_gen = m->md.pv_gen;
5086				pvh_gen = pvh->pv_gen;
5087				rw_runlock(lock);
5088				PMAP_LOCK(pmap);
5089				rw_rlock(lock);
5090				if (md_gen != m->md.pv_gen ||
5091				    pvh_gen != pvh->pv_gen) {
5092					PMAP_UNLOCK(pmap);
5093					goto restart;
5094				}
5095			}
5096			pte = pmap_pde(pmap, pv->pv_va);
5097			if ((*pte & PG_W) != 0)
5098				count++;
5099			PMAP_UNLOCK(pmap);
5100		}
5101	}
5102	rw_runlock(lock);
5103	rw_runlock(&pvh_global_lock);
5104	return (count);
5105}
5106
5107/*
5108 * Returns TRUE if the given page is mapped individually or as part of
5109 * a 2mpage.  Otherwise, returns FALSE.
5110 */
5111boolean_t
5112pmap_page_is_mapped(vm_page_t m)
5113{
5114	struct rwlock *lock;
5115	boolean_t rv;
5116
5117	if ((m->oflags & VPO_UNMANAGED) != 0)
5118		return (FALSE);
5119	rw_rlock(&pvh_global_lock);
5120	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5121	rw_rlock(lock);
5122	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5123	    ((m->flags & PG_FICTITIOUS) == 0 &&
5124	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
5125	rw_runlock(lock);
5126	rw_runlock(&pvh_global_lock);
5127	return (rv);
5128}
5129
5130/*
5131 * Destroy all managed, non-wired mappings in the given user-space
5132 * pmap.  This pmap cannot be active on any processor besides the
5133 * caller.
5134 *
5135 * This function cannot be applied to the kernel pmap.  Moreover, it
5136 * is not intended for general use.  It is only to be used during
5137 * process termination.  Consequently, it can be implemented in ways
5138 * that make it faster than pmap_remove().  First, it can more quickly
5139 * destroy mappings by iterating over the pmap's collection of PV
5140 * entries, rather than searching the page table.  Second, it doesn't
5141 * have to test and clear the page table entries atomically, because
5142 * no processor is currently accessing the user address space.  In
5143 * particular, a page table entry's dirty bit won't change state once
5144 * this function starts.
5145 */
5146void
5147pmap_remove_pages(pmap_t pmap)
5148{
5149	pd_entry_t ptepde;
5150	pt_entry_t *pte, tpte;
5151	pt_entry_t PG_M, PG_RW, PG_V;
5152	struct spglist free;
5153	vm_page_t m, mpte, mt;
5154	pv_entry_t pv;
5155	struct md_page *pvh;
5156	struct pv_chunk *pc, *npc;
5157	struct rwlock *lock;
5158	int64_t bit;
5159	uint64_t inuse, bitmask;
5160	int allfree, field, freed, idx;
5161	boolean_t superpage;
5162	vm_paddr_t pa;
5163
5164	/*
5165	 * Assert that the given pmap is only active on the current
5166	 * CPU.  Unfortunately, we cannot block another CPU from
5167	 * activating the pmap while this function is executing.
5168	 */
5169	KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
5170#ifdef INVARIANTS
5171	{
5172		cpuset_t other_cpus;
5173
5174		other_cpus = all_cpus;
5175		critical_enter();
5176		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
5177		CPU_AND(&other_cpus, &pmap->pm_active);
5178		critical_exit();
5179		KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
5180	}
5181#endif
5182
5183	lock = NULL;
5184	PG_M = pmap_modified_bit(pmap);
5185	PG_V = pmap_valid_bit(pmap);
5186	PG_RW = pmap_rw_bit(pmap);
5187
5188	SLIST_INIT(&free);
5189	rw_rlock(&pvh_global_lock);
5190	PMAP_LOCK(pmap);
5191	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5192		allfree = 1;
5193		freed = 0;
5194		for (field = 0; field < _NPCM; field++) {
5195			inuse = ~pc->pc_map[field] & pc_freemask[field];
5196			while (inuse != 0) {
5197				bit = bsfq(inuse);
5198				bitmask = 1UL << bit;
5199				idx = field * 64 + bit;
5200				pv = &pc->pc_pventry[idx];
5201				inuse &= ~bitmask;
5202
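				/*
				 * A pv entry may describe either a 2MB page
				 * mapping or a 4KB page mapping; walk down
				 * from the PDP entry to determine which,
				 * descending to the PTE level only for a
				 * 4KB mapping.
				 */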
5203				pte = pmap_pdpe(pmap, pv->pv_va);
5204				ptepde = *pte;
5205				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
5206				tpte = *pte;
5207				if ((tpte & (PG_PS | PG_V)) == PG_V) {
5208					superpage = FALSE;
5209					ptepde = tpte;
5210					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
5211					    PG_FRAME);
5212					pte = &pte[pmap_pte_index(pv->pv_va)];
5213					tpte = *pte;
5214				} else {
5215					/*
					 * Keep track of whether 'tpte' is a
5217					 * superpage explicitly instead of
5218					 * relying on PG_PS being set.
5219					 *
5220					 * This is because PG_PS is numerically
5221					 * identical to PG_PTE_PAT and thus a
5222					 * regular page could be mistaken for
5223					 * a superpage.
5224					 */
5225					superpage = TRUE;
5226				}
5227
5228				if ((tpte & PG_V) == 0) {
5229					panic("bad pte va %lx pte %lx",
5230					    pv->pv_va, tpte);
5231				}
5232
5233/*
5234 * We cannot remove wired pages from a process' mapping at this time
5235 */
5236				if (tpte & PG_W) {
5237					allfree = 0;
5238					continue;
5239				}
5240
5241				if (superpage)
5242					pa = tpte & PG_PS_FRAME;
5243				else
5244					pa = tpte & PG_FRAME;
5245
5246				m = PHYS_TO_VM_PAGE(pa);
5247				KASSERT(m->phys_addr == pa,
5248				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5249				    m, (uintmax_t)m->phys_addr,
5250				    (uintmax_t)tpte));
5251
5252				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5253				    m < &vm_page_array[vm_page_array_size],
5254				    ("pmap_remove_pages: bad tpte %#jx",
5255				    (uintmax_t)tpte));
5256
5257				pte_clear(pte);
5258
5259				/*
5260				 * Update the vm_page_t clean/reference bits.
5261				 */
5262				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5263					if (superpage) {
5264						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5265							vm_page_dirty(mt);
5266					} else
5267						vm_page_dirty(m);
5268				}
5269
5270				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5271
5272				/* Mark free */
5273				pc->pc_map[field] |= bitmask;
5274				if (superpage) {
5275					pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
5276					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
5277					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5278					pvh->pv_gen++;
5279					if (TAILQ_EMPTY(&pvh->pv_list)) {
5280						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5281							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
5282							    TAILQ_EMPTY(&mt->md.pv_list))
5283								vm_page_aflag_clear(mt, PGA_WRITEABLE);
5284					}
5285					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
5286					if (mpte != NULL) {
5287						pmap_remove_pt_page(pmap, mpte);
5288						pmap_resident_count_dec(pmap, 1);
5289						KASSERT(mpte->wire_count == NPTEPG,
5290						    ("pmap_remove_pages: pte page wire count error"));
5291						mpte->wire_count = 0;
5292						pmap_add_delayed_free_list(mpte, &free, FALSE);
5293						atomic_subtract_int(&cnt.v_wire_count, 1);
5294					}
5295				} else {
5296					pmap_resident_count_dec(pmap, 1);
5297					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5298					m->md.pv_gen++;
5299					if ((m->aflags & PGA_WRITEABLE) != 0 &&
5300					    TAILQ_EMPTY(&m->md.pv_list) &&
5301					    (m->flags & PG_FICTITIOUS) == 0) {
5302						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5303						if (TAILQ_EMPTY(&pvh->pv_list))
5304							vm_page_aflag_clear(m, PGA_WRITEABLE);
5305					}
5306				}
5307				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
5308				freed++;
5309			}
5310		}
5311		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5312		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5313		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5314		if (allfree) {
5315			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5316			free_pv_chunk(pc);
5317		}
5318	}
5319	if (lock != NULL)
5320		rw_wunlock(lock);
5321	pmap_invalidate_all(pmap);
5322	rw_runlock(&pvh_global_lock);
5323	PMAP_UNLOCK(pmap);
5324	pmap_free_zero_pages(&free);
5325}
5326
5327static boolean_t
5328pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
5329{
5330	struct rwlock *lock;
5331	pv_entry_t pv;
5332	struct md_page *pvh;
5333	pt_entry_t *pte, mask;
5334	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
5335	pmap_t pmap;
5336	int md_gen, pvh_gen;
5337	boolean_t rv;
5338
5339	rv = FALSE;
5340	rw_rlock(&pvh_global_lock);
5341	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5342	rw_rlock(lock);
5343restart:
5344	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5345		pmap = PV_PMAP(pv);
5346		if (!PMAP_TRYLOCK(pmap)) {
5347			md_gen = m->md.pv_gen;
5348			rw_runlock(lock);
5349			PMAP_LOCK(pmap);
5350			rw_rlock(lock);
5351			if (md_gen != m->md.pv_gen) {
5352				PMAP_UNLOCK(pmap);
5353				goto restart;
5354			}
5355		}
5356		pte = pmap_pte(pmap, pv->pv_va);
5357		mask = 0;
5358		if (modified) {
5359			PG_M = pmap_modified_bit(pmap);
5360			PG_RW = pmap_rw_bit(pmap);
5361			mask |= PG_RW | PG_M;
5362		}
5363		if (accessed) {
5364			PG_A = pmap_accessed_bit(pmap);
5365			PG_V = pmap_valid_bit(pmap);
5366			mask |= PG_V | PG_A;
5367		}
5368		rv = (*pte & mask) == mask;
5369		PMAP_UNLOCK(pmap);
5370		if (rv)
5371			goto out;
5372	}
5373	if ((m->flags & PG_FICTITIOUS) == 0) {
5374		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5375		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5376			pmap = PV_PMAP(pv);
5377			if (!PMAP_TRYLOCK(pmap)) {
5378				md_gen = m->md.pv_gen;
5379				pvh_gen = pvh->pv_gen;
5380				rw_runlock(lock);
5381				PMAP_LOCK(pmap);
5382				rw_rlock(lock);
5383				if (md_gen != m->md.pv_gen ||
5384				    pvh_gen != pvh->pv_gen) {
5385					PMAP_UNLOCK(pmap);
5386					goto restart;
5387				}
5388			}
5389			pte = pmap_pde(pmap, pv->pv_va);
5390			mask = 0;
5391			if (modified) {
5392				PG_M = pmap_modified_bit(pmap);
5393				PG_RW = pmap_rw_bit(pmap);
5394				mask |= PG_RW | PG_M;
5395			}
5396			if (accessed) {
5397				PG_A = pmap_accessed_bit(pmap);
5398				PG_V = pmap_valid_bit(pmap);
5399				mask |= PG_V | PG_A;
5400			}
5401			rv = (*pte & mask) == mask;
5402			PMAP_UNLOCK(pmap);
5403			if (rv)
5404				goto out;
5405		}
5406	}
5407out:
5408	rw_runlock(lock);
5409	rw_runlock(&pvh_global_lock);
5410	return (rv);
5411}
5412
5413/*
5414 *	pmap_is_modified:
5415 *
5416 *	Return whether or not the specified physical page was modified
5417 *	in any physical maps.
5418 */
5419boolean_t
5420pmap_is_modified(vm_page_t m)
5421{
5422
5423	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5424	    ("pmap_is_modified: page %p is not managed", m));
5425
5426	/*
5427	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5428	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
5429	 * is clear, no PTEs can have PG_M set.
5430	 */
5431	VM_OBJECT_ASSERT_WLOCKED(m->object);
5432	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5433		return (FALSE);
5434	return (pmap_page_test_mappings(m, FALSE, TRUE));
5435}
5436
5437/*
5438 *	pmap_is_prefaultable:
5439 *
5440 *	Return whether or not the specified virtual address is eligible
5441 *	for prefault.
5442 */
5443boolean_t
5444pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
5445{
5446	pd_entry_t *pde;
5447	pt_entry_t *pte, PG_V;
5448	boolean_t rv;
5449
5450	PG_V = pmap_valid_bit(pmap);
5451	rv = FALSE;
5452	PMAP_LOCK(pmap);
5453	pde = pmap_pde(pmap, addr);
5454	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
5455		pte = pmap_pde_to_pte(pde, addr);
5456		rv = (*pte & PG_V) == 0;
5457	}
5458	PMAP_UNLOCK(pmap);
5459	return (rv);
5460}
5461
5462/*
5463 *	pmap_is_referenced:
5464 *
5465 *	Return whether or not the specified physical page was referenced
5466 *	in any physical maps.
5467 */
5468boolean_t
5469pmap_is_referenced(vm_page_t m)
5470{
5471
5472	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5473	    ("pmap_is_referenced: page %p is not managed", m));
5474	return (pmap_page_test_mappings(m, TRUE, FALSE));
5475}
5476
5477/*
5478 * Clear the write and modified bits in each of the given page's mappings.
5479 */
5480void
5481pmap_remove_write(vm_page_t m)
5482{
5483	struct md_page *pvh;
5484	pmap_t pmap;
5485	struct rwlock *lock;
5486	pv_entry_t next_pv, pv;
5487	pd_entry_t *pde;
5488	pt_entry_t oldpte, *pte, PG_M, PG_RW;
5489	vm_offset_t va;
5490	int pvh_gen, md_gen;
5491
5492	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5493	    ("pmap_remove_write: page %p is not managed", m));
5494
5495	/*
5496	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5497	 * set by another thread while the object is locked.  Thus,
5498	 * if PGA_WRITEABLE is clear, no page table entries need updating.
5499	 */
5500	VM_OBJECT_ASSERT_WLOCKED(m->object);
5501	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5502		return;
5503	rw_rlock(&pvh_global_lock);
5504	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5505	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5506retry_pv_loop:
5507	rw_wlock(lock);
5508	if ((m->flags & PG_FICTITIOUS) != 0)
5509		goto small_mappings;
5510	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5511		pmap = PV_PMAP(pv);
5512		if (!PMAP_TRYLOCK(pmap)) {
5513			pvh_gen = pvh->pv_gen;
5514			rw_wunlock(lock);
5515			PMAP_LOCK(pmap);
5516			rw_wlock(lock);
5517			if (pvh_gen != pvh->pv_gen) {
5518				PMAP_UNLOCK(pmap);
5519				rw_wunlock(lock);
5520				goto retry_pv_loop;
5521			}
5522		}
5523		PG_RW = pmap_rw_bit(pmap);
5524		va = pv->pv_va;
5525		pde = pmap_pde(pmap, va);
5526		if ((*pde & PG_RW) != 0)
5527			(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
5528		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5529		    ("inconsistent pv lock %p %p for page %p",
5530		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5531		PMAP_UNLOCK(pmap);
5532	}
5533small_mappings:
5534	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5535		pmap = PV_PMAP(pv);
5536		if (!PMAP_TRYLOCK(pmap)) {
5537			pvh_gen = pvh->pv_gen;
5538			md_gen = m->md.pv_gen;
5539			rw_wunlock(lock);
5540			PMAP_LOCK(pmap);
5541			rw_wlock(lock);
5542			if (pvh_gen != pvh->pv_gen ||
5543			    md_gen != m->md.pv_gen) {
5544				PMAP_UNLOCK(pmap);
5545				rw_wunlock(lock);
5546				goto retry_pv_loop;
5547			}
5548		}
5549		PG_M = pmap_modified_bit(pmap);
5550		PG_RW = pmap_rw_bit(pmap);
5551		pde = pmap_pde(pmap, pv->pv_va);
5552		KASSERT((*pde & PG_PS) == 0,
5553		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
5554		    m));
5555		pte = pmap_pde_to_pte(pde, pv->pv_va);
5556retry:
5557		oldpte = *pte;
5558		if (oldpte & PG_RW) {
5559			if (!atomic_cmpset_long(pte, oldpte, oldpte &
5560			    ~(PG_RW | PG_M)))
5561				goto retry;
5562			if ((oldpte & PG_M) != 0)
5563				vm_page_dirty(m);
5564			pmap_invalidate_page(pmap, pv->pv_va);
5565		}
5566		PMAP_UNLOCK(pmap);
5567	}
5568	rw_wunlock(lock);
5569	vm_page_aflag_clear(m, PGA_WRITEABLE);
5570	rw_runlock(&pvh_global_lock);
5571}
5572
5573static __inline boolean_t
5574safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
5575{
5576
5577	if (!pmap_emulate_ad_bits(pmap))
5578		return (TRUE);
5579
5580	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
5581
5582	/*
5583	 * RWX = 010 or 110 causes an unconditional EPT misconfiguration, so
5584	 * we don't allow the referenced (aka EPT_PG_READ) bit to be cleared
5585	 * if the EPT_PG_WRITE bit is set.
5586	 */
5587	if ((pte & EPT_PG_WRITE) != 0)
5588		return (FALSE);
5589
5590	/*
5591	 * RWX = 100 is allowed only if PMAP_SUPPORTS_EXEC_ONLY is set.
5592	 */
5593	if ((pte & EPT_PG_EXECUTE) == 0 ||
5594	    ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
5595		return (TRUE);
5596	else
5597		return (FALSE);
5598}
5599
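/*
 * Limit the amount of work done by pmap_ts_referenced(): once this many
 * reference bits have been cleared, or found set but deliberately left
 * alone, the scan of the page's PV lists stops.
 */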
5600#define	PMAP_TS_REFERENCED_MAX	5
5601
5602/*
5603 *	pmap_ts_referenced:
5604 *
5605 *	Return a count of reference bits for a page, clearing those bits.
5606 *	It is not necessary for every reference bit to be cleared, but it
5607 *	is necessary that 0 only be returned when there are truly no
5608 *	reference bits set.
5609 *
5610 *	XXX: The exact number of bits to check and clear is a matter that
5611 *	should be tested and standardized at some point in the future for
5612 *	optimal aging of shared pages.
5613 */
5614int
5615pmap_ts_referenced(vm_page_t m)
5616{
5617	struct md_page *pvh;
5618	pv_entry_t pv, pvf;
5619	pmap_t pmap;
5620	struct rwlock *lock;
5621	pd_entry_t oldpde, *pde;
5622	pt_entry_t *pte, PG_A;
5623	vm_offset_t va;
5624	vm_paddr_t pa;
5625	int cleared, md_gen, not_cleared, pvh_gen;
5626	struct spglist free;
5627	boolean_t demoted;
5628
5629	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5630	    ("pmap_ts_referenced: page %p is not managed", m));
5631	SLIST_INIT(&free);
5632	cleared = 0;
5633	pa = VM_PAGE_TO_PHYS(m);
5634	lock = PHYS_TO_PV_LIST_LOCK(pa);
5635	pvh = pa_to_pvh(pa);
5636	rw_rlock(&pvh_global_lock);
5637	rw_wlock(lock);
5638retry:
5639	not_cleared = 0;
5640	if ((m->flags & PG_FICTITIOUS) != 0 ||
5641	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
5642		goto small_mappings;
5643	pv = pvf;
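	/*
	 * Scan the 2MB mappings of the page, beginning at the head of the
	 * PV list.  Each examined entry is rotated to the tail so that
	 * successive calls spread the clearing of reference bits across all
	 * mappings; the scan ends when it wraps around to the first entry
	 * examined ("pvf") or the clearing budget is exhausted.
	 */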
5644	do {
5645		if (pvf == NULL)
5646			pvf = pv;
5647		pmap = PV_PMAP(pv);
5648		if (!PMAP_TRYLOCK(pmap)) {
5649			pvh_gen = pvh->pv_gen;
5650			rw_wunlock(lock);
5651			PMAP_LOCK(pmap);
5652			rw_wlock(lock);
5653			if (pvh_gen != pvh->pv_gen) {
5654				PMAP_UNLOCK(pmap);
5655				goto retry;
5656			}
5657		}
5658		PG_A = pmap_accessed_bit(pmap);
5659		va = pv->pv_va;
5660		pde = pmap_pde(pmap, pv->pv_va);
5661		oldpde = *pde;
5662		if ((*pde & PG_A) != 0) {
5663			/*
5664			 * Since this reference bit is shared by 512 4KB
5665			 * pages, it should not be cleared every time it is
5666			 * tested.  Apply a simple "hash" function on the
5667			 * physical page number, the virtual superpage number,
5668			 * and the pmap address to select one 4KB page out of
5669			 * the 512 on which testing the reference bit will
5670			 * result in clearing that reference bit.  This
5671			 * function is designed to avoid the selection of the
5672			 * same 4KB page for every 2MB page mapping.
5673			 *
5674			 * On demotion, a mapping that hasn't been referenced
5675			 * is simply destroyed.  To avoid the possibility of a
5676			 * subsequent page fault on a demoted wired mapping,
5677			 * always leave its reference bit set.  Moreover,
5678			 * since the superpage is wired, the current state of
5679			 * its reference bit won't affect page replacement.
5680			 */
5681			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
5682			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
5683			    (*pde & PG_W) == 0) {
5684				if (safe_to_clear_referenced(pmap, oldpde)) {
5685					atomic_clear_long(pde, PG_A);
5686					pmap_invalidate_page(pmap, pv->pv_va);
5687					demoted = FALSE;
5688				} else if (pmap_demote_pde_locked(pmap, pde,
5689				    pv->pv_va, &lock)) {
5690					/*
5691					 * Remove the mapping to a single page
5692					 * so that a subsequent access may
5693					 * repromote.  Since the underlying
5694					 * page table page is fully populated,
5695					 * this removal never frees a page
5696					 * table page.
5697					 */
5698					demoted = TRUE;
5699					va += VM_PAGE_TO_PHYS(m) - (oldpde &
5700					    PG_PS_FRAME);
5701					pte = pmap_pde_to_pte(pde, va);
5702					pmap_remove_pte(pmap, pte, va, *pde,
5703					    NULL, &lock);
5704					pmap_invalidate_page(pmap, va);
5705				} else
5706					demoted = TRUE;
5707
5708				if (demoted) {
5709					/*
5710					 * The superpage mapping was removed
5711					 * entirely and therefore 'pv' is no
5712					 * longer valid.
5713					 */
5714					if (pvf == pv)
5715						pvf = NULL;
5716					pv = NULL;
5717				}
5718				cleared++;
5719				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5720				    ("inconsistent pv lock %p %p for page %p",
5721				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5722			} else
5723				not_cleared++;
5724		}
5725		PMAP_UNLOCK(pmap);
5726		/* Rotate the PV list if it has more than one entry. */
5727		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
5728			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5729			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
5730			pvh->pv_gen++;
5731		}
5732		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
5733			goto out;
5734	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
5735small_mappings:
5736	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
5737		goto out;
5738	pv = pvf;
5739	do {
5740		if (pvf == NULL)
5741			pvf = pv;
5742		pmap = PV_PMAP(pv);
5743		if (!PMAP_TRYLOCK(pmap)) {
5744			pvh_gen = pvh->pv_gen;
5745			md_gen = m->md.pv_gen;
5746			rw_wunlock(lock);
5747			PMAP_LOCK(pmap);
5748			rw_wlock(lock);
5749			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5750				PMAP_UNLOCK(pmap);
5751				goto retry;
5752			}
5753		}
5754		PG_A = pmap_accessed_bit(pmap);
5755		pde = pmap_pde(pmap, pv->pv_va);
5756		KASSERT((*pde & PG_PS) == 0,
5757		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
5758		    m));
5759		pte = pmap_pde_to_pte(pde, pv->pv_va);
5760		if ((*pte & PG_A) != 0) {
5761			if (safe_to_clear_referenced(pmap, *pte)) {
5762				atomic_clear_long(pte, PG_A);
5763				pmap_invalidate_page(pmap, pv->pv_va);
5764				cleared++;
5765			} else if ((*pte & PG_W) == 0) {
5766				/*
5767				 * Wired pages cannot be paged out, so
5768				 * doing accessed bit emulation for
5769				 * them is wasted effort.  We do the
5770				 * hard work for unwired pages only.
5771				 */
5772				pmap_remove_pte(pmap, pte, pv->pv_va,
5773				    *pde, &free, &lock);
5774				pmap_invalidate_page(pmap, pv->pv_va);
5775				cleared++;
5776				if (pvf == pv)
5777					pvf = NULL;
5778				pv = NULL;
5779				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5780				    ("inconsistent pv lock %p %p for page %p",
5781				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5782			} else
5783				not_cleared++;
5784		}
5785		PMAP_UNLOCK(pmap);
5786		/* Rotate the PV list if it has more than one entry. */
5787		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
5788			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5789			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5790			m->md.pv_gen++;
5791		}
5792	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
5793	    not_cleared < PMAP_TS_REFERENCED_MAX);
5794out:
5795	rw_wunlock(lock);
5796	rw_runlock(&pvh_global_lock);
5797	pmap_free_zero_pages(&free);
5798	return (cleared + not_cleared);
5799}
5800
5801/*
5802 *	Apply the given advice to the specified range of addresses within the
5803 *	given pmap.  Depending on the advice, clear the referenced and/or
5804 *	modified flags in each mapping and set the mapped page's dirty field.
5805 */
5806void
5807pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
5808{
5809	struct rwlock *lock;
5810	pml4_entry_t *pml4e;
5811	pdp_entry_t *pdpe;
5812	pd_entry_t oldpde, *pde;
5813	pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
5814	vm_offset_t va_next;
5815	vm_page_t m;
5816	boolean_t anychanged, pv_lists_locked;
5817
5818	if (advice != MADV_DONTNEED && advice != MADV_FREE)
5819		return;
5820
5821	/*
5822	 * A/D bit emulation requires an alternate code path when clearing
5823	 * the modified and accessed bits below.  Since this function is
5824	 * advisory in nature, we skip it entirely for pmaps that require
5825	 * A/D bit emulation.
5826	 */
5827	if (pmap_emulate_ad_bits(pmap))
5828		return;
5829
5830	PG_A = pmap_accessed_bit(pmap);
5831	PG_G = pmap_global_bit(pmap);
5832	PG_M = pmap_modified_bit(pmap);
5833	PG_V = pmap_valid_bit(pmap);
5834	PG_RW = pmap_rw_bit(pmap);
5835
5836	pv_lists_locked = FALSE;
5837resume:
5838	anychanged = FALSE;
5839	PMAP_LOCK(pmap);
5840	for (; sva < eva; sva = va_next) {
5841		pml4e = pmap_pml4e(pmap, sva);
5842		if ((*pml4e & PG_V) == 0) {
5843			va_next = (sva + NBPML4) & ~PML4MASK;
5844			if (va_next < sva)
5845				va_next = eva;
5846			continue;
5847		}
5848		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
5849		if ((*pdpe & PG_V) == 0) {
5850			va_next = (sva + NBPDP) & ~PDPMASK;
5851			if (va_next < sva)
5852				va_next = eva;
5853			continue;
5854		}
5855		va_next = (sva + NBPDR) & ~PDRMASK;
5856		if (va_next < sva)
5857			va_next = eva;
5858		pde = pmap_pdpe_to_pde(pdpe, sva);
5859		oldpde = *pde;
5860		if ((oldpde & PG_V) == 0)
5861			continue;
5862		else if ((oldpde & PG_PS) != 0) {
5863			if ((oldpde & PG_MANAGED) == 0)
5864				continue;
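			/*
			 * Demoting a 2MB mapping requires the PV lists to be
			 * locked.  Try to acquire the global PV lock without
			 * blocking; if that fails, flush any pending TLB
			 * invalidations, drop the pmap lock, block on the PV
			 * lock, and resume the scan at the current address.
			 */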
5865			if (!pv_lists_locked) {
5866				pv_lists_locked = TRUE;
5867				if (!rw_try_rlock(&pvh_global_lock)) {
5868					if (anychanged)
5869						pmap_invalidate_all(pmap);
5870					PMAP_UNLOCK(pmap);
5871					rw_rlock(&pvh_global_lock);
5872					goto resume;
5873				}
5874			}
5875			lock = NULL;
5876			if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
5877				if (lock != NULL)
5878					rw_wunlock(lock);
5879
5880				/*
5881				 * The large page mapping was destroyed.
5882				 */
5883				continue;
5884			}
5885
5886			/*
5887			 * Unless the page mappings are wired, remove the
5888			 * mapping to a single page so that a subsequent
5889			 * access may repromote.  Since the underlying page
5890			 * table page is fully populated, this removal never
5891			 * frees a page table page.
5892			 */
5893			if ((oldpde & PG_W) == 0) {
5894				pte = pmap_pde_to_pte(pde, sva);
5895				KASSERT((*pte & PG_V) != 0,
5896				    ("pmap_advise: invalid PTE"));
5897				pmap_remove_pte(pmap, pte, sva, *pde, NULL,
5898				    &lock);
5899				anychanged = TRUE;
5900			}
5901			if (lock != NULL)
5902				rw_wunlock(lock);
5903		}
5904		if (va_next > eva)
5905			va_next = eva;
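		/*
		 * Scan the 4KB mappings within the current page table page,
		 * clearing the accessed bit and, for modified mappings, the
		 * modified bit as well.  For MADV_DONTNEED the modification
		 * is first transferred to the page's dirty field.  Global
		 * mappings are invalidated immediately; the rest are batched
		 * into a single pmap_invalidate_all() at the end.
		 */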
5906		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
5907		    sva += PAGE_SIZE) {
5908			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
5909			    PG_V))
5910				continue;
5911			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5912				if (advice == MADV_DONTNEED) {
5913					/*
5914					 * Future calls to pmap_is_modified()
5915					 * can be avoided by making the page
5916					 * dirty now.
5917					 */
5918					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
5919					vm_page_dirty(m);
5920				}
5921				atomic_clear_long(pte, PG_M | PG_A);
5922			} else if ((*pte & PG_A) != 0)
5923				atomic_clear_long(pte, PG_A);
5924			else
5925				continue;
5926			if ((*pte & PG_G) != 0)
5927				pmap_invalidate_page(pmap, sva);
5928			else
5929				anychanged = TRUE;
5930		}
5931	}
5932	if (anychanged)
5933		pmap_invalidate_all(pmap);
5934	if (pv_lists_locked)
5935		rw_runlock(&pvh_global_lock);
5936	PMAP_UNLOCK(pmap);
5937}
5938
5939/*
5940 *	Clear the modify bits on the specified physical page.
5941 */
5942void
5943pmap_clear_modify(vm_page_t m)
5944{
5945	struct md_page *pvh;
5946	pmap_t pmap;
5947	pv_entry_t next_pv, pv;
5948	pd_entry_t oldpde, *pde;
5949	pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
5950	struct rwlock *lock;
5951	vm_offset_t va;
5952	int md_gen, pvh_gen;
5953
5954	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5955	    ("pmap_clear_modify: page %p is not managed", m));
5956	VM_OBJECT_ASSERT_WLOCKED(m->object);
5957	KASSERT(!vm_page_xbusied(m),
5958	    ("pmap_clear_modify: page %p is exclusive busied", m));
5959
5960	/*
5961	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
5962	 * If the object containing the page is locked and the page is not
5963	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
5964	 */
5965	if ((m->aflags & PGA_WRITEABLE) == 0)
5966		return;
5967	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5968	rw_rlock(&pvh_global_lock);
5969	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5970	rw_wlock(lock);
5971restart:
5972	if ((m->flags & PG_FICTITIOUS) != 0)
5973		goto small_mappings;
5974	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5975		pmap = PV_PMAP(pv);
5976		if (!PMAP_TRYLOCK(pmap)) {
5977			pvh_gen = pvh->pv_gen;
5978			rw_wunlock(lock);
5979			PMAP_LOCK(pmap);
5980			rw_wlock(lock);
5981			if (pvh_gen != pvh->pv_gen) {
5982				PMAP_UNLOCK(pmap);
5983				goto restart;
5984			}
5985		}
5986		PG_M = pmap_modified_bit(pmap);
5987		PG_V = pmap_valid_bit(pmap);
5988		PG_RW = pmap_rw_bit(pmap);
5989		va = pv->pv_va;
5990		pde = pmap_pde(pmap, va);
5991		oldpde = *pde;
5992		if ((oldpde & PG_RW) != 0) {
5993			if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
5994				if ((oldpde & PG_W) == 0) {
5995					/*
5996					 * Write protect the mapping to a
5997					 * single page so that a subsequent
5998					 * write access may repromote.
5999					 */
6000					va += VM_PAGE_TO_PHYS(m) - (oldpde &
6001					    PG_PS_FRAME);
6002					pte = pmap_pde_to_pte(pde, va);
6003					oldpte = *pte;
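					/*
					 * Atomically clear PG_M and PG_RW
					 * from the 4KB mapping of "m",
					 * retrying if the PTE changes, and
					 * record the modification in the
					 * page's dirty field.
					 */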
6004					if ((oldpte & PG_V) != 0) {
6005						while (!atomic_cmpset_long(pte,
6006						    oldpte,
6007						    oldpte & ~(PG_M | PG_RW)))
6008							oldpte = *pte;
6009						vm_page_dirty(m);
6010						pmap_invalidate_page(pmap, va);
6011					}
6012				}
6013			}
6014		}
6015		PMAP_UNLOCK(pmap);
6016	}
6017small_mappings:
6018	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6019		pmap = PV_PMAP(pv);
6020		if (!PMAP_TRYLOCK(pmap)) {
6021			md_gen = m->md.pv_gen;
6022			pvh_gen = pvh->pv_gen;
6023			rw_wunlock(lock);
6024			PMAP_LOCK(pmap);
6025			rw_wlock(lock);
6026			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6027				PMAP_UNLOCK(pmap);
6028				goto restart;
6029			}
6030		}
6031		PG_M = pmap_modified_bit(pmap);
6032		PG_RW = pmap_rw_bit(pmap);
6033		pde = pmap_pde(pmap, pv->pv_va);
6034		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
6035		    " a 2mpage in page %p's pv list", m));
6036		pte = pmap_pde_to_pte(pde, pv->pv_va);
6037		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6038			atomic_clear_long(pte, PG_M);
6039			pmap_invalidate_page(pmap, pv->pv_va);
6040		}
6041		PMAP_UNLOCK(pmap);
6042	}
6043	rw_wunlock(lock);
6044	rw_runlock(&pvh_global_lock);
6045}
6046
6047/*
6048 * Miscellaneous support routines follow
6049 */
6050
6051/* Adjust the cache mode for a 4KB page mapped via a PTE. */
6052static __inline void
6053pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
6054{
6055	u_int opte, npte;
6056
6057	/*
6058	 * The cache mode bits are all in the low 32 bits of the
6059	 * PTE, so we can just spin on updating the low 32 bits.
6060	 */
6061	do {
6062		opte = *(u_int *)pte;
6063		npte = opte & ~mask;
6064		npte |= cache_bits;
6065	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
6066}
6067
6068/* Adjust the cache mode for a 2MB page mapped via a PDE. */
6069static __inline void
6070pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
6071{
6072	u_int opde, npde;
6073
6074	/*
6075	 * The cache mode bits are all in the low 32 bits of the
6076	 * PDE, so we can just spin on updating the low 32 bits.
6077	 */
6078	do {
6079		opde = *(u_int *)pde;
6080		npde = opde & ~mask;
6081		npde |= cache_bits;
6082	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
6083}
6084
6085/*
6086 * Map a set of physical memory pages into the kernel virtual
6087 * address space. Return a pointer to where it is mapped. This
6088 * routine is intended to be used for mapping device memory,
6089 * NOT real memory.
6090 */
6091void *
6092pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
6093{
6094	vm_offset_t va, offset;
6095	vm_size_t tmpsize;
6096
6097	/*
6098	 * If the specified range of physical addresses fits within the direct
6099	 * map window, use the direct map.
6100	 */
6101	if (pa < dmaplimit && pa + size < dmaplimit) {
6102		va = PHYS_TO_DMAP(pa);
6103		if (!pmap_change_attr(va, size, mode))
6104			return ((void *)va);
6105	}
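	/*
	 * The range either lies outside the direct map or its attribute
	 * could not be changed there; fall back to allocating kernel
	 * virtual address space and mapping each page with the requested
	 * memory attribute.
	 */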
6106	offset = pa & PAGE_MASK;
6107	size = round_page(offset + size);
6108	va = kva_alloc(size);
6109	if (!va)
6110		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
6111	pa = trunc_page(pa);
6112	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
6113		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
6114	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
6115	pmap_invalidate_cache_range(va, va + tmpsize);
6116	return ((void *)(va + offset));
6117}
6118
6119void *
6120pmap_mapdev(vm_paddr_t pa, vm_size_t size)
6121{
6122
6123	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
6124}
6125
6126void *
6127pmap_mapbios(vm_paddr_t pa, vm_size_t size)
6128{
6129
6130	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
6131}
6132
6133void
6134pmap_unmapdev(vm_offset_t va, vm_size_t size)
6135{
6136	vm_offset_t base, offset;
6137
6138	/* If pmap_mapdev() gave out a direct map address, do nothing. */
6139	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
6140		return;
6141	base = trunc_page(va);
6142	offset = va & PAGE_MASK;
6143	size = round_page(offset + size);
6144	kva_free(base, size);
6145}
6146
6147/*
6148 * Tries to demote a 1GB page mapping.
6149 */
6150static boolean_t
6151pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
6152{
6153	pdp_entry_t newpdpe, oldpdpe;
6154	pd_entry_t *firstpde, newpde, *pde;
6155	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
6156	vm_paddr_t mpdepa;
6157	vm_page_t mpde;
6158
6159	PG_A = pmap_accessed_bit(pmap);
6160	PG_M = pmap_modified_bit(pmap);
6161	PG_V = pmap_valid_bit(pmap);
6162	PG_RW = pmap_rw_bit(pmap);
6163
6164	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6165	oldpdpe = *pdpe;
6166	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
6167	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
6168	if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
6169	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
6170		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
6171		    " in pmap %p", va, pmap);
6172		return (FALSE);
6173	}
6174	mpdepa = VM_PAGE_TO_PHYS(mpde);
6175	firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
6176	newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
6177	KASSERT((oldpdpe & PG_A) != 0,
6178	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
6179	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
6180	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
6181	newpde = oldpdpe;
6182
6183	/*
6184	 * Initialize the page directory page.
6185	 */
6186	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
6187		*pde = newpde;
6188		newpde += NBPDR;
6189	}
6190
6191	/*
6192	 * Demote the mapping.
6193	 */
6194	*pdpe = newpdpe;
6195
6196	/*
6197	 * Invalidate a stale recursive mapping of the page directory page.
6198	 */
6199	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
6200
6201	pmap_pdpe_demotions++;
6202	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
6203	    " in pmap %p", va, pmap);
6204	return (TRUE);
6205}
6206
6207/*
6208 * Sets the memory attribute for the specified page.
6209 */
6210void
6211pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
6212{
6213
6214	m->md.pat_mode = ma;
6215
6216	/*
6217	 * If "m" is a normal page, update its direct mapping.  This update
6218	 * can be relied upon to perform any cache operations that are
6219	 * required for data coherence.
6220	 */
6221	if ((m->flags & PG_FICTITIOUS) == 0 &&
6222	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
6223	    m->md.pat_mode))
6224		panic("memory attribute change on the direct map failed");
6225}
6226
6227/*
6228 * Changes the specified virtual address range's memory type to that given by
6229 * the parameter "mode".  The specified virtual address range must be
6230 * completely contained within either the direct map or the kernel map.  If
6231 * the virtual address range is contained within the kernel map, then the
6232 * memory type for each of the corresponding ranges of the direct map is also
6233 * changed.  (The corresponding ranges of the direct map are those ranges that
6234 * map the same physical pages as the specified virtual address range.)  These
6235 * changes to the direct map are necessary because Intel describes the
6236 * behavior of their processors as "undefined" if two or more mappings to the
6237 * same physical page have different memory types.
6238 *
6239 * Returns zero if the change completed successfully, and either EINVAL or
6240 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
6241 * of the virtual address range was not mapped, and ENOMEM is returned if
6242 * there was insufficient memory available to complete the change.  In the
6243 * latter case, the memory type may have been changed on some part of the
6244 * virtual address range or the direct map.
6245 */
6246int
6247pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
6248{
6249	int error;
6250
6251	PMAP_LOCK(kernel_pmap);
6252	error = pmap_change_attr_locked(va, size, mode);
6253	PMAP_UNLOCK(kernel_pmap);
6254	return (error);
6255}
6256
6257static int
6258pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
6259{
6260	vm_offset_t base, offset, tmpva;
6261	vm_paddr_t pa_start, pa_end;
6262	pdp_entry_t *pdpe;
6263	pd_entry_t *pde;
6264	pt_entry_t *pte;
6265	int cache_bits_pte, cache_bits_pde, error;
6266	boolean_t changed;
6267
6268	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6269	base = trunc_page(va);
6270	offset = va & PAGE_MASK;
6271	size = round_page(offset + size);
6272
6273	/*
6274	 * Only supported on kernel virtual addresses, including the direct
6275	 * map but excluding the recursive map.
6276	 */
6277	if (base < DMAP_MIN_ADDRESS)
6278		return (EINVAL);
6279
6280	cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
6281	cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
6282	changed = FALSE;
6283
6284	/*
6285	 * Pages that aren't mapped aren't supported.  Also break down 2MB pages
6286	 * into 4KB pages if required.
6287	 */
6288	for (tmpva = base; tmpva < base + size; ) {
6289		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6290		if (*pdpe == 0)
6291			return (EINVAL);
6292		if (*pdpe & PG_PS) {
6293			/*
6294			 * If the current 1GB page already has the required
6295			 * memory type, then we need not demote this page. Just
6296			 * increment tmpva to the next 1GB page frame.
6297			 */
6298			if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
6299				tmpva = trunc_1gpage(tmpva) + NBPDP;
6300				continue;
6301			}
6302
6303			/*
6304			 * If the current offset aligns with a 1GB page frame
6305			 * and there is at least 1GB left within the range, then
6306			 * we need not break down this page into 2MB pages.
6307			 */
6308			if ((tmpva & PDPMASK) == 0 &&
6309			    tmpva + PDPMASK < base + size) {
6310				tmpva += NBPDP;
6311				continue;
6312			}
6313			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
6314				return (ENOMEM);
6315		}
6316		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6317		if (*pde == 0)
6318			return (EINVAL);
6319		if (*pde & PG_PS) {
6320			/*
6321			 * If the current 2MB page already has the required
6322			 * memory type, then we need not demote this page. Just
6323			 * increment tmpva to the next 2MB page frame.
6324			 */
6325			if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
6326				tmpva = trunc_2mpage(tmpva) + NBPDR;
6327				continue;
6328			}
6329
6330			/*
6331			 * If the current offset aligns with a 2MB page frame
6332			 * and there is at least 2MB left within the range, then
6333			 * we need not break down this page into 4KB pages.
6334			 */
6335			if ((tmpva & PDRMASK) == 0 &&
6336			    tmpva + PDRMASK < base + size) {
6337				tmpva += NBPDR;
6338				continue;
6339			}
6340			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
6341				return (ENOMEM);
6342		}
6343		pte = pmap_pde_to_pte(pde, tmpva);
6344		if (*pte == 0)
6345			return (EINVAL);
6346		tmpva += PAGE_SIZE;
6347	}
6348	error = 0;
6349
6350	/*
6351	 * Ok, all the pages exist, so run through them updating their
6352	 * cache mode if required.
6353	 */
6354	pa_start = pa_end = 0;
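	/*
	 * For mappings within the kernel map, accumulate maximal runs of
	 * physically contiguous pages and recursively apply the same
	 * attribute change to the corresponding direct map ranges, so that
	 * every mapping of a physical page uses the same memory type.
	 */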
6355	for (tmpva = base; tmpva < base + size; ) {
6356		pdpe = pmap_pdpe(kernel_pmap, tmpva);
6357		if (*pdpe & PG_PS) {
6358			if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
6359				pmap_pde_attr(pdpe, cache_bits_pde,
6360				    X86_PG_PDE_CACHE);
6361				changed = TRUE;
6362			}
6363			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6364				if (pa_start == pa_end) {
6365					/* Start physical address run. */
6366					pa_start = *pdpe & PG_PS_FRAME;
6367					pa_end = pa_start + NBPDP;
6368				} else if (pa_end == (*pdpe & PG_PS_FRAME))
6369					pa_end += NBPDP;
6370				else {
6371					/* Run ended, update direct map. */
6372					error = pmap_change_attr_locked(
6373					    PHYS_TO_DMAP(pa_start),
6374					    pa_end - pa_start, mode);
6375					if (error != 0)
6376						break;
6377					/* Start physical address run. */
6378					pa_start = *pdpe & PG_PS_FRAME;
6379					pa_end = pa_start + NBPDP;
6380				}
6381			}
6382			tmpva = trunc_1gpage(tmpva) + NBPDP;
6383			continue;
6384		}
6385		pde = pmap_pdpe_to_pde(pdpe, tmpva);
6386		if (*pde & PG_PS) {
6387			if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
6388				pmap_pde_attr(pde, cache_bits_pde,
6389				    X86_PG_PDE_CACHE);
6390				changed = TRUE;
6391			}
6392			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6393				if (pa_start == pa_end) {
6394					/* Start physical address run. */
6395					pa_start = *pde & PG_PS_FRAME;
6396					pa_end = pa_start + NBPDR;
6397				} else if (pa_end == (*pde & PG_PS_FRAME))
6398					pa_end += NBPDR;
6399				else {
6400					/* Run ended, update direct map. */
6401					error = pmap_change_attr_locked(
6402					    PHYS_TO_DMAP(pa_start),
6403					    pa_end - pa_start, mode);
6404					if (error != 0)
6405						break;
6406					/* Start physical address run. */
6407					pa_start = *pde & PG_PS_FRAME;
6408					pa_end = pa_start + NBPDR;
6409				}
6410			}
6411			tmpva = trunc_2mpage(tmpva) + NBPDR;
6412		} else {
6413			pte = pmap_pde_to_pte(pde, tmpva);
6414			if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
6415				pmap_pte_attr(pte, cache_bits_pte,
6416				    X86_PG_PTE_CACHE);
6417				changed = TRUE;
6418			}
6419			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
6420				if (pa_start == pa_end) {
6421					/* Start physical address run. */
6422					pa_start = *pte & PG_FRAME;
6423					pa_end = pa_start + PAGE_SIZE;
6424				} else if (pa_end == (*pte & PG_FRAME))
6425					pa_end += PAGE_SIZE;
6426				else {
6427					/* Run ended, update direct map. */
6428					error = pmap_change_attr_locked(
6429					    PHYS_TO_DMAP(pa_start),
6430					    pa_end - pa_start, mode);
6431					if (error != 0)
6432						break;
6433					/* Start physical address run. */
6434					pa_start = *pte & PG_FRAME;
6435					pa_end = pa_start + PAGE_SIZE;
6436				}
6437			}
6438			tmpva += PAGE_SIZE;
6439		}
6440	}
6441	if (error == 0 && pa_start != pa_end)
6442		error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
6443		    pa_end - pa_start, mode);
6444
6445	/*
6446	 * If anything changed, invalidate the TLB entries and flush the CPU
6447	 * caches as required so that no stale data remains cached.
6448	 */
6449	if (changed) {
6450		pmap_invalidate_range(kernel_pmap, base, tmpva);
6451		pmap_invalidate_cache_range(base, tmpva);
6452	}
6453	return (error);
6454}
6455
6456/*
6457 * Demotes any mapping within the direct map region that covers more than the
6458 * specified range of physical addresses.  This range's size must be a power
6459 * of two and its starting address must be a multiple of its size.  Since the
6460 * demotion does not change any attributes of the mapping, a TLB invalidation
6461 * is not mandatory.  The caller may, however, request a TLB invalidation.
6462 */
6463void
6464pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
6465{
6466	pdp_entry_t *pdpe;
6467	pd_entry_t *pde;
6468	vm_offset_t va;
6469	boolean_t changed;
6470
6471	if (len == 0)
6472		return;
6473	KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
6474	KASSERT((base & (len - 1)) == 0,
6475	    ("pmap_demote_DMAP: base is not a multiple of len"));
6476	if (len < NBPDP && base < dmaplimit) {
6477		va = PHYS_TO_DMAP(base);
6478		changed = FALSE;
6479		PMAP_LOCK(kernel_pmap);
6480		pdpe = pmap_pdpe(kernel_pmap, va);
6481		if ((*pdpe & X86_PG_V) == 0)
6482			panic("pmap_demote_DMAP: invalid PDPE");
6483		if ((*pdpe & PG_PS) != 0) {
6484			if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
6485				panic("pmap_demote_DMAP: PDPE failed");
6486			changed = TRUE;
6487		}
6488		if (len < NBPDR) {
6489			pde = pmap_pdpe_to_pde(pdpe, va);
6490			if ((*pde & X86_PG_V) == 0)
6491				panic("pmap_demote_DMAP: invalid PDE");
6492			if ((*pde & PG_PS) != 0) {
6493				if (!pmap_demote_pde(kernel_pmap, pde, va))
6494					panic("pmap_demote_DMAP: PDE failed");
6495				changed = TRUE;
6496			}
6497		}
6498		if (changed && invalidate)
6499			pmap_invalidate_page(kernel_pmap, va);
6500		PMAP_UNLOCK(kernel_pmap);
6501	}
6502}
6503
6504/*
6505 * Perform the pmap work for mincore().
6506 */
6507int
6508pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
6509{
6510	pd_entry_t *pdep;
6511	pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
6512	vm_paddr_t pa;
6513	int val;
6514
6515	PG_A = pmap_accessed_bit(pmap);
6516	PG_M = pmap_modified_bit(pmap);
6517	PG_V = pmap_valid_bit(pmap);
6518	PG_RW = pmap_rw_bit(pmap);
6519
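	/*
	 * Look up the mapping for "addr".  A 2MB mapping reports
	 * MINCORE_SUPER and yields the physical address of the 4KB page
	 * within it; the referenced and modified indications are derived
	 * from PG_A and from PG_M together with PG_RW.
	 */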
6520	PMAP_LOCK(pmap);
6521retry:
6522	pdep = pmap_pde(pmap, addr);
6523	if (pdep != NULL && (*pdep & PG_V)) {
6524		if (*pdep & PG_PS) {
6525			pte = *pdep;
6526			/* Compute the physical address of the 4KB page. */
6527			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
6528			    PG_FRAME;
6529			val = MINCORE_SUPER;
6530		} else {
6531			pte = *pmap_pde_to_pte(pdep, addr);
6532			pa = pte & PG_FRAME;
6533			val = 0;
6534		}
6535	} else {
6536		pte = 0;
6537		pa = 0;
6538		val = 0;
6539	}
6540	if ((pte & PG_V) != 0) {
6541		val |= MINCORE_INCORE;
6542		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6543			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
6544		if ((pte & PG_A) != 0)
6545			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
6546	}
6547	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
6548	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
6549	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
6550		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
6551		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
6552			goto retry;
6553	} else
6554		PA_UNLOCK_COND(*locked_pa);
6555	PMAP_UNLOCK(pmap);
6556	return (val);
6557}
6558
6559void
6560pmap_activate(struct thread *td)
6561{
6562	pmap_t	pmap, oldpmap;
6563	u_int	cpuid;
6564
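	/*
	 * Run in a critical section so that the CPU cannot change while the
	 * old and new pmaps' CPU sets are updated and the new page table
	 * base is recorded in the PCB and loaded into %cr3.
	 */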
6565	critical_enter();
6566	pmap = vmspace_pmap(td->td_proc->p_vmspace);
6567	oldpmap = PCPU_GET(curpmap);
6568	cpuid = PCPU_GET(cpuid);
6569#ifdef SMP
6570	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
6571	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
6572	CPU_SET_ATOMIC(cpuid, &pmap->pm_save);
6573#else
6574	CPU_CLR(cpuid, &oldpmap->pm_active);
6575	CPU_SET(cpuid, &pmap->pm_active);
6576	CPU_SET(cpuid, &pmap->pm_save);
6577#endif
6578	td->td_pcb->pcb_cr3 = pmap->pm_cr3;
6579	load_cr3(pmap->pm_cr3);
6580	PCPU_SET(curpmap, pmap);
6581	critical_exit();
6582}
6583
6584void
6585pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
6586{
6587}
6588
6589/*
6590 *	Increase the starting virtual address of the given mapping if a
6591 *	different alignment might result in more superpage mappings.
6592 */
6593void
6594pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
6595    vm_offset_t *addr, vm_size_t size)
6596{
6597	vm_offset_t superpage_offset;
6598
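	/*
	 * If the mapping is large enough, advance "*addr" so that its offset
	 * within a 2MB virtual superpage matches the given object offset's
	 * superpage offset.  This alignment is what later allows 2MB page
	 * mappings to be created for physically contiguous runs of pages.
	 */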
6599	if (size < NBPDR)
6600		return;
6601	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
6602		offset += ptoa(object->pg_color);
6603	superpage_offset = offset & PDRMASK;
6604	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
6605	    (*addr & PDRMASK) == superpage_offset)
6606		return;
6607	if ((*addr & PDRMASK) < superpage_offset)
6608		*addr = (*addr & ~PDRMASK) + superpage_offset;
6609	else
6610		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
6611}
6612
6613#ifdef INVARIANTS
6614static unsigned long num_dirty_emulations;
6615SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
6616	     &num_dirty_emulations, 0, NULL);
6617
6618static unsigned long num_accessed_emulations;
6619SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
6620	     &num_accessed_emulations, 0, NULL);
6621
6622static unsigned long num_superpage_accessed_emulations;
6623SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
6624	     &num_superpage_accessed_emulations, 0, NULL);
6625
6626static unsigned long ad_emulation_superpage_promotions;
6627SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
6628	     &ad_emulation_superpage_promotions, 0, NULL);
6629#endif	/* INVARIANTS */
6630
6631int
6632pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
6633{
6634	int rv;
6635	struct rwlock *lock;
6636	vm_page_t m, mpte;
6637	pd_entry_t *pde;
6638	pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
6639	boolean_t pv_lists_locked;
6640
6641	KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
6642	    ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
6643
6644	if (!pmap_emulate_ad_bits(pmap))
6645		return (-1);
6646
6647	PG_A = pmap_accessed_bit(pmap);
6648	PG_M = pmap_modified_bit(pmap);
6649	PG_V = pmap_valid_bit(pmap);
6650	PG_RW = pmap_rw_bit(pmap);
6651
6652	rv = -1;
6653	lock = NULL;
6654	pv_lists_locked = FALSE;
6655retry:
6656	PMAP_LOCK(pmap);
6657
6658	pde = pmap_pde(pmap, va);
6659	if (pde == NULL || (*pde & PG_V) == 0)
6660		goto done;
6661
6662	if ((*pde & PG_PS) != 0) {
6663		if (ftype == VM_PROT_READ) {
6664#ifdef INVARIANTS
6665			atomic_add_long(&num_superpage_accessed_emulations, 1);
6666#endif
6667			*pde |= PG_A;
6668			rv = 0;
6669		}
6670		goto done;
6671	}
6672
6673	pte = pmap_pde_to_pte(pde, va);
6674	if ((*pte & PG_V) == 0)
6675		goto done;
6676
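	/*
	 * Emulate the hardware update of the accessed and dirty bits: a read
	 * fault sets only PG_A, while a write fault additionally requires
	 * the mapping to be writable and also sets PG_M.
	 */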
6677	if (ftype == VM_PROT_WRITE) {
6678		if ((*pte & PG_RW) == 0)
6679			goto done;
6680		*pte |= PG_M;
6681	}
6682	*pte |= PG_A;
6683
6684	/* try to promote the mapping */
6685	if (va < VM_MAXUSER_ADDRESS)
6686		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
6687	else
6688		mpte = NULL;
6689
6690	m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
6691
6692	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
6693	    pmap_ps_enabled(pmap) &&
6694	    (m->flags & PG_FICTITIOUS) == 0 &&
6695	    vm_reserv_level_iffullpop(m) == 0) {
6696		if (!pv_lists_locked) {
6697			pv_lists_locked = TRUE;
6698			if (!rw_try_rlock(&pvh_global_lock)) {
6699				PMAP_UNLOCK(pmap);
6700				rw_rlock(&pvh_global_lock);
6701				goto retry;
6702			}
6703		}
6704		pmap_promote_pde(pmap, pde, va, &lock);
6705#ifdef INVARIANTS
6706		atomic_add_long(&ad_emulation_superpage_promotions, 1);
6707#endif
6708	}
6709#ifdef INVARIANTS
6710	if (ftype == VM_PROT_WRITE)
6711		atomic_add_long(&num_dirty_emulations, 1);
6712	else
6713		atomic_add_long(&num_accessed_emulations, 1);
6714#endif
6715	rv = 0;		/* success */
6716done:
6717	if (lock != NULL)
6718		rw_wunlock(lock);
6719	if (pv_lists_locked)
6720		rw_runlock(&pvh_global_lock);
6721	PMAP_UNLOCK(pmap);
6722	return (rv);
6723}
6724
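/*
 * Copy the page table entries that translate "va" into the array "ptr",
 * one per paging level, and return the number of entries copied in "*num".
 * The walk stops after recording the first invalid or large-page entry.
 */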
6725void
6726pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
6727{
6728	pml4_entry_t *pml4;
6729	pdp_entry_t *pdp;
6730	pd_entry_t *pde;
6731	pt_entry_t *pte, PG_V;
6732	int idx;
6733
6734	idx = 0;
6735	PG_V = pmap_valid_bit(pmap);
6736	PMAP_LOCK(pmap);
6737
6738	pml4 = pmap_pml4e(pmap, va);
6739	ptr[idx++] = *pml4;
6740	if ((*pml4 & PG_V) == 0)
6741		goto done;
6742
6743	pdp = pmap_pml4e_to_pdpe(pml4, va);
6744	ptr[idx++] = *pdp;
6745	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
6746		goto done;
6747
6748	pde = pmap_pdpe_to_pde(pdp, va);
6749	ptr[idx++] = *pde;
6750	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
6751		goto done;
6752
6753	pte = pmap_pde_to_pte(pde, va);
6754	ptr[idx++] = *pte;
6755
6756done:
6757	PMAP_UNLOCK(pmap);
6758	*num = idx;
6759}
6760
6761#include "opt_ddb.h"
6762#ifdef DDB
6763#include <ddb/ddb.h>
6764
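/*
 * DDB command: walk the page table of the current pmap for the given
 * address and print the entry found at each level.
 */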
6765DB_SHOW_COMMAND(pte, pmap_print_pte)
6766{
6767	pmap_t pmap;
6768	pml4_entry_t *pml4;
6769	pdp_entry_t *pdp;
6770	pd_entry_t *pde;
6771	pt_entry_t *pte, PG_V;
6772	vm_offset_t va;
6773
6774	if (have_addr) {
6775		va = (vm_offset_t)addr;
6776		pmap = PCPU_GET(curpmap); /* XXX */
6777	} else {
6778		db_printf("show pte addr\n");
6779		return;
6780	}
6781	PG_V = pmap_valid_bit(pmap);
6782	pml4 = pmap_pml4e(pmap, va);
6783	db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
6784	if ((*pml4 & PG_V) == 0) {
6785		db_printf("\n");
6786		return;
6787	}
6788	pdp = pmap_pml4e_to_pdpe(pml4, va);
6789	db_printf(" pdpe %#016lx", *pdp);
6790	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
6791		db_printf("\n");
6792		return;
6793	}
6794	pde = pmap_pdpe_to_pde(pdp, va);
6795	db_printf(" pde %#016lx", *pde);
6796	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
6797		db_printf("\n");
6798		return;
6799	}
6800	pte = pmap_pde_to_pte(pde, va);
6801	db_printf(" pte %#016lx\n", *pte);
6802}
6803
6804DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
6805{
6806	vm_paddr_t a;
6807
6808	if (have_addr) {
6809		a = (vm_paddr_t)addr;
6810		db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
6811	} else {
6812		db_printf("show phys2dmap addr\n");
6813	}
6814}
6815#endif
6816